embulk-output-bigquery 0.2.3 → 0.3.0.pre1

Files changed (63)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -12
  3. data/CHANGELOG.md +18 -0
  4. data/Gemfile +8 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.md +165 -39
  7. data/Rakefile +11 -0
  8. data/embulk-output-bigquery.gemspec +20 -0
  9. data/example/config_client_options.yml +33 -0
  10. data/example/config_csv.yml +30 -0
  11. data/example/config_delete_in_advance.yml +29 -0
  12. data/example/config_expose_errors.yml +30 -0
  13. data/example/config_guess_from_embulk_schema.yml +29 -0
  14. data/example/config_guess_with_column_options.yml +40 -0
  15. data/example/config_gzip.yml +30 -0
  16. data/example/config_jsonl.yml +30 -0
  17. data/example/config_mode_append.yml +30 -0
  18. data/example/config_mode_append_direct.yml +30 -0
  19. data/example/config_payload_column.yml +20 -0
  20. data/example/config_payload_column_index.yml +20 -0
  21. data/example/config_prevent_duplicate_insert.yml +30 -0
  22. data/example/config_replace.yml +30 -0
  23. data/example/config_replace_backup.yml +32 -0
  24. data/example/config_skip_file_generation.yml +32 -0
  25. data/example/config_table_strftime.yml +30 -0
  26. data/example/config_template_table.yml +21 -0
  27. data/example/config_uncompressed.yml +30 -0
  28. data/example/config_with_rehearsal.yml +32 -0
  29. data/example/example.csv +17 -0
  30. data/example/example.jsonl +16 -0
  31. data/example/example.yml +30 -0
  32. data/example/json_key.json +12 -0
  33. data/example/nested_example.jsonl +16 -0
  34. data/example/schema.json +30 -0
  35. data/example/schema_expose_errors.json +30 -0
  36. data/lib/embulk/output/bigquery.rb +388 -3
  37. data/lib/embulk/output/bigquery/bigquery_client.rb +396 -0
  38. data/lib/embulk/output/bigquery/file_writer.rb +103 -0
  39. data/lib/embulk/output/bigquery/helper.rb +78 -0
  40. data/lib/embulk/output/bigquery/value_converter_factory.rb +292 -0
  41. data/test/helper.rb +13 -0
  42. data/test/test_bigquery_client.rb +166 -0
  43. data/test/test_configure.rb +254 -0
  44. data/test/test_example.rb +34 -0
  45. data/test/test_file_writer.rb +129 -0
  46. data/test/test_helper.rb +103 -0
  47. data/test/test_transaction.rb +129 -0
  48. data/test/test_value_converter_factory.rb +316 -0
  49. metadata +114 -45
  50. data/build.gradle +0 -80
  51. data/config/checkstyle/checkstyle.xml +0 -128
  52. data/config/checkstyle/default.xml +0 -108
  53. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  54. data/gradle/wrapper/gradle-wrapper.properties +0 -6
  55. data/gradlew +0 -164
  56. data/gradlew.bat +0 -90
  57. data/settings.gradle +0 -2
  58. data/src/main/java/org/embulk/output/BigqueryAuthentication.java +0 -117
  59. data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +0 -508
  60. data/src/main/java/org/embulk/output/BigqueryWriter.java +0 -575
  61. data/src/test/java/org/embulk/output/TestBigqueryAuthentication.java +0 -5
  62. data/src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java +0 -5
  63. data/src/test/java/org/embulk/output/TestBigqueryWriter.java +0 -5
data/lib/embulk/output/bigquery/helper.rb ADDED
@@ -0,0 +1,78 @@
+ require 'digest/md5'
+
+ module Embulk
+   module Output
+     class Bigquery < OutputPlugin
+       class Helper
+         def self.bq_type_from_embulk_type(embulk_type)
+           case embulk_type
+           when :boolean then 'BOOLEAN'
+           when :long then 'INTEGER'
+           when :double then 'FLOAT'
+           when :string then 'STRING'
+           when :timestamp then 'TIMESTAMP'
+           when :json then 'STRING' # NOTE: Default is not RECORD since it requires `fields`
+           else raise ArgumentError, "embulk type #{embulk_type} is not supported"
+           end
+         end
+
+         # @return [Hash] name => column_option.
+         # ToDo: recursively map fields?
+         def self.column_options_map(column_options)
+           (column_options || {}).map do |column_option|
+             [column_option['name'], column_option]
+           end.to_h
+         end
+
+         def self.fields_from_embulk_schema(task, schema)
+           column_options_map = self.column_options_map(task['column_options'])
+           schema.map do |column|
+             column_name = column[:name]
+             embulk_type = column[:type]
+             column_option = column_options_map[column_name] || {}
+             {}.tap do |field|
+               field[:name] = column_name
+               field[:type] = (column_option['type'] || bq_type_from_embulk_type(embulk_type)).upcase
+               field[:mode] = column_option['mode'] if column_option['mode']
+               field[:fields] = deep_symbolize_keys(column_option['fields']) if column_option['fields']
+             end
+           end
+         end
+
+         def self.deep_symbolize_keys(obj)
+           if obj.is_a?(Hash)
+             obj.inject({}) do |options, (key, value)|
+               options[(key.to_sym rescue key) || key] = deep_symbolize_keys(value)
+               options
+             end
+           elsif obj.is_a?(Array)
+             obj.map {|value| deep_symbolize_keys(value) }
+           else
+             obj
+           end
+         end
+
+         def self.create_job_id(task, path, table, fields)
+           elements = [
+             Digest::MD5.file(path).hexdigest,
+             task['dataset'],
+             table,
+             fields,
+             task['source_format'],
+             task['max_bad_records'],
+             task['field_delimiter'],
+             task['encoding'],
+             task['ignore_unknown_values'],
+             task['allow_quoted_newlines'],
+           ]
+
+           str = elements.map(&:to_s).join('')
+           md5 = Digest::MD5.hexdigest(str)
+           job_id = "embulk_job_#{md5}"
+           Embulk.logger.debug { "embulk-output-bigquery: create_job_id(#{path}, #{table}) #=> #{job_id}" }
+           job_id
+         end
+       end
+     end
+   end
+ end
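
A minimal usage sketch of the new Helper (hypothetical values; the plain hashes below stand in for Embulk's Schema/Column objects, which respond to [:name] and [:type] the same way, and an Embulk runtime is assumed to be loaded as in test/helper.rb):

    # Hypothetical illustration; not part of the gem or its diff.
    require 'embulk/output/bigquery/helper'

    task = {
      'dataset'        => 'my_dataset',
      'source_format'  => 'NEWLINE_DELIMITED_JSON',
      'column_options' => [
        {'name' => 'payload', 'type' => 'RECORD',
         'fields' => [{'name' => 'id', 'type' => 'INTEGER'}]},
      ],
    }
    schema = [
      {name: 'id',      type: :long},
      {name: 'payload', type: :json},
    ]

    fields = Embulk::Output::Bigquery::Helper.fields_from_embulk_schema(task, schema)
    # => [{:name=>"id", :type=>"INTEGER"},
    #     {:name=>"payload", :type=>"RECORD", :fields=>[{:name=>"id", :type=>"INTEGER"}]}]

    # The same fields feed the deterministic load-job id (path must exist, so shown commented out):
    # Embulk::Output::Bigquery::Helper.create_job_id(task, '/path/to/part.jsonl', 'my_table', fields)
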
data/lib/embulk/output/bigquery/value_converter_factory.rb ADDED
@@ -0,0 +1,292 @@
+ require 'time'
+ require 'tzinfo'
+ require 'json'
+ require_relative 'helper'
+
+ module Embulk
+   module Output
+     class Bigquery < OutputPlugin
+       class ValueConverterFactory
+         class NotSupportedType < StandardError; end
+         class TypeCastError < StandardError; end
+
+         # ref. https://cloud.google.com/bigquery/preparing-data-for-bigquery
+
+         DEFAULT_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%6N" # BigQuery timestamp format
+         DEFAULT_TIMEZONE = "UTC"
+
+         # @param [Hash] task
+         # @option task [String] default_timestamp_format
+         # @option task [String] default_timezone
+         # @option task [Hash] column_options user defined column types
+         # @param [Schema] schema embulk defined column types
+         # @return [Array] an array whose index is column_index, and value is its converter (Proc)
+         def self.create_converters(task, schema)
+           column_options_map = Helper.column_options_map(task['column_options'])
+           default_timestamp_format = task['default_timestamp_format']
+           default_timezone = task['default_timezone']
+           schema.map do |column|
+             column_name = column[:name]
+             embulk_type = column[:type]
+             column_option = column_options_map[column_name] || {}
+             self.new(
+               embulk_type, column_option['type'],
+               timestamp_format: column_option['timestamp_format'],
+               timezone: column_option['timezone'],
+               strict: column_option['strict'],
+               default_timestamp_format: default_timestamp_format,
+               default_timezone: default_timezone,
+             ).create_converter
+           end
+         end
+
+         attr_reader :embulk_type, :type, :timestamp_format, :timezone, :zone_offset, :strict
+
+         def initialize(
+           embulk_type, type = nil,
+           timestamp_format: nil, timezone: nil, strict: nil,
+           default_timestamp_format: DEFAULT_TIMESTAMP_FORMAT,
+           default_timezone: DEFAULT_TIMEZONE
+         )
+           @embulk_type = embulk_type
+           @type = (type || Helper.bq_type_from_embulk_type(embulk_type)).upcase
+           @timestamp_format = timestamp_format
+           @default_timestamp_format = default_timestamp_format
+           @timezone = timezone || default_timezone
+           @zone_offset = get_zone_offset(@timezone) if @timezone
+           @strict = strict.nil? ? true : strict
+         end
+
+         def create_converter
+           case embulk_type
+           when :boolean then boolean_converter
+           when :long then long_converter
+           when :double then double_converter
+           when :string then string_converter
+           when :timestamp then timestamp_converter
+           when :json then json_converter
+           else raise NotSupportedType, "embulk type #{embulk_type} is not supported"
+           end
+         end
+
+         def with_typecast_error(val)
+           begin
+             yield(val)
+           rescue => e
+             raise_typecast_error(val)
+           end
+         end
+
+         def raise_typecast_error(val)
+           message = "cannot cast #{@embulk_type} `#{val}` to #{@type}"
+           if @strict
+             raise TypeCastError, message
+           else
+             Embulk.logger.trace { message }
+             return nil
+           end
+         end
+
+         def boolean_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               val
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for boolean column"
+           end
+         end
+
+         def long_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               next nil if val.nil?
+               next true if val == 1
+               next false if val == 0
+               raise_typecast_error(val)
+             }
+           when 'INTEGER'
+             Proc.new {|val|
+               val
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for long column"
+           end
+         end
+
+         def double_converter
+           case type
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_i
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               val
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for double column"
+           end
+         end
+
+         def string_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               next nil if val.nil?
+               next true if val == 'true'.freeze
+               next false if val == 'false'.freeze
+               raise_typecast_error(val)
+             }
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 Integer(val)
+               end
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 Float(val)
+               end
+             }
+           when 'STRING'
+             Proc.new {|val|
+               val
+             }
+           when 'TIMESTAMP'
+             if @timestamp_format
+               Proc.new {|val|
+                 next nil if val.nil?
+                 with_typecast_error(val) do |val|
+                   strptime_with_zone(val, @timestamp_format, zone_offset).to_f
+                 end
+               }
+             else
+               Proc.new {|val|
+                 next nil if val.nil?
+                 val # Users must take care of the BigQuery timestamp format themselves
+               }
+             end
+           when 'RECORD'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 JSON.parse(val)
+               end
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for string column"
+           end
+         end
+
+         def timestamp_converter
+           case type
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_i
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f
+             }
+           when 'STRING'
+             _timestamp_format = @timestamp_format || @default_timestamp_format
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 val.localtime(zone_offset).strftime(_timestamp_format)
+               end
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for timestamp column"
+           end
+         end
+
+         # ToDo: recursive conversion
+         def json_converter
+           case type
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_json
+             }
+           when 'RECORD'
+             Proc.new {|val|
+               val
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for json column"
+           end
+         end
+
+         private
+
+         # [+-]HH:MM, [+-]HHMM, [+-]HH
+         NUMERIC_PATTERN = %r{\A[+-]\d\d(:?\d\d)?\z}
+
+         # Region/Zone, Region/Zone/Zone
+         NAME_PATTERN = %r{\A[^/]+/[^/]+(/[^/]+)?\z}
+
+         def strptime_with_zone(date, timestamp_format, zone_offset)
+           time = Time.strptime(date, timestamp_format)
+           utc_offset = time.utc_offset
+           time.localtime(zone_offset) + utc_offset - zone_offset
+         end
+
+         def get_zone_offset(timezone)
+           if NUMERIC_PATTERN === timezone
+             Time.zone_offset(timezone)
+           elsif NAME_PATTERN === timezone || 'UTC' == timezone
+             tz = TZInfo::Timezone.get(timezone)
+             tz.period_for_utc(Time.now).utc_total_offset
+           else
+             raise ArgumentError, "timezone format is invalid: #{timezone}"
+           end
+         end
+       end
+     end
+   end
+ end
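
A minimal sketch of the converters the new ValueConverterFactory hands back (illustrative values; an Embulk runtime is assumed to be loaded as in test/helper.rb, since cast failures are logged through Embulk.logger when strict is false):

    # Hypothetical illustration; not part of the gem or its diff.
    require 'embulk/output/bigquery/value_converter_factory'

    factory = Embulk::Output::Bigquery::ValueConverterFactory

    to_int = factory.new(:string, 'INTEGER').create_converter
    to_int.call('123')    # => 123
    to_int.call(nil)      # => nil
    # to_int.call('abc')  # raises TypeCastError because strict defaults to true

    lenient = factory.new(:string, 'INTEGER', strict: false).create_converter
    lenient.call('abc')   # => nil, and the failure is logged at trace level

    # A string column cast to TIMESTAMP with an explicit format and timezone
    to_ts = factory.new(:string, 'TIMESTAMP',
                        timestamp_format: '%Y-%m-%d %H:%M:%S',
                        timezone: 'Asia/Tokyo').create_converter
    to_ts.call('2016-02-26 00:00:00')  # => 1456412400.0 (UNIX seconds, interpreted as JST)
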
data/test/helper.rb ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/env ruby
+
+ require 'test/unit'
+ require 'test/unit/rr'
+
+ # require 'embulk/java/bootstrap'
+ require 'embulk'
+ Embulk.setup
+ Embulk.logger = Embulk::Logger.new('/dev/null')
+
+ APP_ROOT = File.expand_path('../', __dir__)
+ EXAMPLE_ROOT = File.expand_path('../example', __dir__)
+ TEST_ROOT = File.expand_path(File.dirname(__FILE__))
data/test/test_bigquery_client.rb ADDED
@@ -0,0 +1,166 @@
+ require_relative './helper'
+ require 'embulk/output/bigquery/bigquery_client'
+ require 'csv'
+
+ # 1. Prepare /tmp/your-project-000.json
+ # 2. CONNECT=1 bundle exec ruby test/test_bigquery_client.rb
+
+ if ENV['CONNECT']
+   module Embulk
+     class Output::Bigquery
+       class TestBigqueryClient < Test::Unit::TestCase
+         class << self
+           def startup
+             FileUtils.mkdir_p('tmp')
+           end
+
+           def shutdown
+             FileUtils.rm_rf('tmp')
+           end
+         end
+
+         def client(task = {})
+           task = least_task.merge(task)
+           BigqueryClient.new(task, schema)
+         end
+
+         def least_task
+           {
+             'project' => JSON.parse(File.read('/tmp/your-project-000.json'))['project_id'],
+             'dataset' => 'your_dataset_name',
+             'table' => 'your_table_name',
+             'auth_method' => 'json_key',
+             'json_keyfile' => '/tmp/your-project-000.json',
+             'retries' => 3,
+             'timeout_sec' => 300,
+             'open_timeout_sec' => 300,
+             'job_status_max_polling_time' => 3600,
+             'job_status_polling_interval' => 10,
+             'source_format' => 'CSV'
+           }
+         end
+
+         def schema
+           Schema.new([
+             Column.new({index: 0, name: 'boolean', type: :boolean}),
+             Column.new({index: 1, name: 'long', type: :long}),
+             Column.new({index: 2, name: 'double', type: :double}),
+             Column.new({index: 3, name: 'string', type: :string}),
+             Column.new({index: 4, name: 'timestamp', type: :timestamp}),
+             Column.new({index: 5, name: 'json', type: :json}),
+           ])
+         end
+
+         def record
+           [true,1,1.1,'1',Time.parse("2016-02-26 +00:00"),'{"foo":"bar"}']
+         end
+
+         sub_test_case "client" do
+           def test_json_keyfile
+             assert_nothing_raised { BigqueryClient.new(least_task, schema).client }
+           end
+
+           def test_p12_keyfile
+             # pending
+           end
+         end
+
+         sub_test_case "create_dataset" do
+           def test_create_dataset
+             assert_nothing_raised { client.create_dataset }
+           end
+
+           def test_create_dataset_with_reference
+             response = client.get_dataset
+             any_instance_of(BigqueryClient) do |obj|
+               mock(obj).get_dataset('your_dataset_name') { response }
+             end
+             assert_nothing_raised do
+               client.create_dataset('your_dataset_name_old', reference: 'your_dataset_name')
+             end
+           end
+         end
+
+         sub_test_case "get_dataset" do
+           def test_get_dataset
+             assert_nothing_raised { client.create_dataset }
+             assert_nothing_raised { client.get_dataset }
+           end
+
+           def test_get_dataset_not_found
+             assert_raise(NotFoundError) {
+               client.get_dataset('something_does_not_exist')
+             }
+           end
+         end
+
+         sub_test_case "create_table" do
+           def test_create_table
+             client.delete_table('your_table_name')
+             assert_nothing_raised { client.create_table('your_table_name') }
+           end
+
+           def test_create_table_already_exists
+             assert_nothing_raised { client.create_table('your_table_name') }
+           end
+         end
+
+         sub_test_case "delete_table" do
+           def test_delete_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.delete_table('your_table_name') }
+           end
+
+           def test_delete_table_not_found
+             assert_nothing_raised { client.delete_table('your_table_name') }
+           end
+         end
+
+         sub_test_case "get_table" do
+           def test_get_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.get_table('your_table_name') }
+           end
+
+           def test_get_table_not_found
+             client.delete_table('your_table_name')
+             assert_raise(NotFoundError) {
+               client.get_table('your_table_name')
+             }
+           end
+         end
+
+         sub_test_case "fields" do
+           def test_fields_from_table
+             client.create_table('your_table_name')
+             fields = client.fields_from_table('your_table_name')
+             expected = [
+               {:type=>"BOOLEAN", :name=>"boolean"},
+               {:type=>"INTEGER", :name=>"long"},
+               {:type=>"FLOAT", :name=>"double"},
+               {:type=>"STRING", :name=>"string"},
+               {:type=>"TIMESTAMP", :name=>"timestamp"},
+               {:type=>"STRING", :name=>"json"},
+             ]
+             assert_equal expected, fields
+           end
+         end
+
+         sub_test_case "copy" do
+           def test_create_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.copy('your_table_name', 'your_table_name_old') }
+           end
+         end
+
+         sub_test_case "load" do
+           def test_load
+             client.create_table('your_table_name')
+             File.write("tmp/your_file_name.csv", record.to_csv)
+             assert_nothing_raised { client.load("tmp/your_file_name.csv", 'your_table_name') }
+           end
+         end
+       end
+     end
+   end
+ end
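
For orientation, a hedged sketch (hypothetical, not part of the test suite) of running a whole schema through ValueConverterFactory.create_converters to convert one record's values; the plain hashes stand in for Embulk's Schema/Column objects, and the default format/timezone keys are passed explicitly here rather than coming from the plugin's config defaults:

    # Hypothetical illustration; assumes an Embulk runtime is loaded as in test/helper.rb.
    require 'embulk/output/bigquery/value_converter_factory'

    task = {
      'default_timestamp_format' => '%Y-%m-%d %H:%M:%S.%6N',
      'default_timezone'         => 'UTC',
    }
    schema = [
      {name: 'boolean', type: :boolean},
      {name: 'long',    type: :long},
      {name: 'string',  type: :string},
      {name: 'json',    type: :json},
    ]
    converters = Embulk::Output::Bigquery::ValueConverterFactory.create_converters(task, schema)

    record = [true, 1, '1', {'foo' => 'bar'}]
    record.each_with_index.map {|val, i| converters[i].call(val) }
    # => [true, 1, "1", "{\"foo\":\"bar\"}"]  # the json column is serialized to a STRING by default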