embulk-output-bigquery 0.2.3 → 0.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -12
  3. data/CHANGELOG.md +18 -0
  4. data/Gemfile +8 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.md +165 -39
  7. data/Rakefile +11 -0
  8. data/embulk-output-bigquery.gemspec +20 -0
  9. data/example/config_client_options.yml +33 -0
  10. data/example/config_csv.yml +30 -0
  11. data/example/config_delete_in_advance.yml +29 -0
  12. data/example/config_expose_errors.yml +30 -0
  13. data/example/config_guess_from_embulk_schema.yml +29 -0
  14. data/example/config_guess_with_column_options.yml +40 -0
  15. data/example/config_gzip.yml +30 -0
  16. data/example/config_jsonl.yml +30 -0
  17. data/example/config_mode_append.yml +30 -0
  18. data/example/config_mode_append_direct.yml +30 -0
  19. data/example/config_payload_column.yml +20 -0
  20. data/example/config_payload_column_index.yml +20 -0
  21. data/example/config_prevent_duplicate_insert.yml +30 -0
  22. data/example/config_replace.yml +30 -0
  23. data/example/config_replace_backup.yml +32 -0
  24. data/example/config_skip_file_generation.yml +32 -0
  25. data/example/config_table_strftime.yml +30 -0
  26. data/example/config_template_table.yml +21 -0
  27. data/example/config_uncompressed.yml +30 -0
  28. data/example/config_with_rehearsal.yml +32 -0
  29. data/example/example.csv +17 -0
  30. data/example/example.jsonl +16 -0
  31. data/example/example.yml +30 -0
  32. data/example/json_key.json +12 -0
  33. data/example/nested_example.jsonl +16 -0
  34. data/example/schema.json +30 -0
  35. data/example/schema_expose_errors.json +30 -0
  36. data/lib/embulk/output/bigquery.rb +388 -3
  37. data/lib/embulk/output/bigquery/bigquery_client.rb +396 -0
  38. data/lib/embulk/output/bigquery/file_writer.rb +103 -0
  39. data/lib/embulk/output/bigquery/helper.rb +78 -0
  40. data/lib/embulk/output/bigquery/value_converter_factory.rb +292 -0
  41. data/test/helper.rb +13 -0
  42. data/test/test_bigquery_client.rb +166 -0
  43. data/test/test_configure.rb +254 -0
  44. data/test/test_example.rb +34 -0
  45. data/test/test_file_writer.rb +129 -0
  46. data/test/test_helper.rb +103 -0
  47. data/test/test_transaction.rb +129 -0
  48. data/test/test_value_converter_factory.rb +316 -0
  49. metadata +114 -45
  50. data/build.gradle +0 -80
  51. data/config/checkstyle/checkstyle.xml +0 -128
  52. data/config/checkstyle/default.xml +0 -108
  53. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  54. data/gradle/wrapper/gradle-wrapper.properties +0 -6
  55. data/gradlew +0 -164
  56. data/gradlew.bat +0 -90
  57. data/settings.gradle +0 -2
  58. data/src/main/java/org/embulk/output/BigqueryAuthentication.java +0 -117
  59. data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +0 -508
  60. data/src/main/java/org/embulk/output/BigqueryWriter.java +0 -575
  61. data/src/test/java/org/embulk/output/TestBigqueryAuthentication.java +0 -5
  62. data/src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java +0 -5
  63. data/src/test/java/org/embulk/output/TestBigqueryWriter.java +0 -5
data/lib/embulk/output/bigquery/bigquery_client.rb
@@ -0,0 +1,396 @@
+require 'google/apis/bigquery_v2'
+require 'google/api_client/auth/key_utils'
+require 'json'
+require 'thwait'
+require_relative 'helper'
+
+module Embulk
+  module Output
+    class Bigquery < OutputPlugin
+      class Error < StandardError; end
+      class JobTimeoutError < Error; end
+      class NotFoundError < Error; end
+
+      class BigqueryClient
+        def initialize(task, schema, fields = nil)
+          @task = task
+          @schema = schema
+
+          @auth_method = task['auth_method']
+          @private_key_path = task['p12_keyfile']
+          @private_key_passphrase = 'notasecret'
+          @json_key = task['json_keyfile']
+
+          @project = task['project']
+          @dataset = task['dataset']
+
+          reset_fields(fields) if fields
+        end
+
+        def client
+          return @cached_client if @cached_client && @cached_client_expiration > Time.now
+
+          client = Google::Apis::BigqueryV2::BigqueryService.new
+          client.client_options.application_name = @task['application_name']
+          client.request_options.retries = @task['retries']
+          client.request_options.timeout_sec = @task['timeout_sec']
+          client.request_options.open_timeout_sec = @task['open_timeout_sec']
+          Embulk.logger.debug { "embulk-output-bigquery: client_options: #{client.client_options.to_h}" }
+          Embulk.logger.debug { "embulk-output-bigquery: request_options: #{client.request_options.to_h}" }
+
+          scope = "https://www.googleapis.com/auth/bigquery"
+
+          case @auth_method
+          when 'private_key'
+            key = Google::APIClient::KeyUtils.load_from_pkcs12(@private_key_path, @private_key_passphrase)
+            auth = Signet::OAuth2::Client.new(
+              token_credential_uri: "https://accounts.google.com/o/oauth2/token",
+              audience: "https://accounts.google.com/o/oauth2/token",
+              scope: scope,
+              issuer: @email,
+              signing_key: key)
+
+          when 'compute_engine'
+            auth = Google::Auth::GCECredentials.new
+
+          when 'json_key'
+            if File.exist?(@json_key)
+              auth = File.open(@json_key) do |f|
+                Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: scope)
+              end
+            else
+              key = StringIO.new(@json_key)
+              auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
+            end
+
+          when 'application_default'
+            auth = Google::Auth.get_application_default([scope])
+
+          else
+            raise ConfigError, "Unknown auth method: #{@auth_method}"
+          end
+
+          client.authorization = auth
+
+          @cached_client_expiration = Time.now + 1800
+          @cached_client = client
+        end
+
+        def fields
+          return @fields if @fields
+          if @task['schema_file']
+            @fields = Helper.deep_symbolize_keys(JSON.parse(File.read(@task['schema_file'])))
+          elsif @task['template_table']
+            @fields = fields_from_table(@task['template_table'])
+          else
+            @fields = Helper.fields_from_embulk_schema(@task, @schema)
+          end
+        end
+
+        def fields_from_table(table)
+          response = get_table(table)
+          response.schema.fields.map {|field| field.to_h }
+        end
+
+        def reset_fields(fields = nil)
+          @fields = fields
+          self.fields
+        end
+
+        def load_in_parallel(paths, table)
+          return [] if paths.empty?
+          # A load job runs in the background, so one might expect that sending the requests
+          # in parallel would not improve performance. In actual experiments, however, parallel
+          # loading drastically shortened the waiting time; a single jobs.insert appears to take about 50 sec.
+          # NOTICE: uploading files in parallel consumes network bandwidth. With 24 concurrent
+          # uploads of 100MB files, the experimental environment peaked at about 500Mbps.
+          #
+          # We previously had a `max_load_parallels` option, but it was not extensible to the MapReduce
+          # executor, so we dropped it. See https://github.com/embulk/embulk-output-bigquery/pull/35
+          max_load_parallels = paths.size # @task['max_load_parallels'] || paths.size
+          responses = []
+          paths.each_with_index.each_slice(max_load_parallels) do |paths_group|
+            Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths_group}" }
+            threads = []
+            paths_group.each do |path, idx|
+              threads << Thread.new do
+                # It is not clear whether google-api-ruby-client is thread-safe,
+                # so create a new client instance for each thread to be safe.
+                bigquery = self.class.new(@task, @schema, fields)
+                response = bigquery.load(path, table)
+                [idx, response]
+              end
+            end
+            ThreadsWait.all_waits(*threads) do |th|
+              idx, response = th.value # re-raises errors that occurred in the thread
+              responses[idx] = response
+            end
+          end
+          responses
+        end
+
+        def load(path, table)
+          begin
+            if File.exist?(path)
+              Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} => #{@project}:#{@dataset}.#{table}" }
+            else
+              Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
+              return
+            end
+
+            if @task['prevent_duplicate_insert']
+              job_reference = {
+                job_reference: {
+                  project_id: @project,
+                  job_id: Helper.create_job_id(@task, path, table, fields),
+                }
+              }
+            else
+              job_reference = {}
+            end
+
+            body = {
+              configuration: {
+                load: {
+                  destination_table: {
+                    project_id: @project,
+                    dataset_id: @dataset,
+                    table_id: table,
+                  },
+                  schema: {
+                    fields: fields,
+                  },
+                  write_disposition: 'WRITE_APPEND',
+                  source_format: @task['source_format'],
+                  max_bad_records: @task['max_bad_records'],
+                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                  encoding: @task['encoding'],
+                  ignore_unknown_values: @task['ignore_unknown_values'],
+                  allow_quoted_newlines: @task['allow_quoted_newlines'],
+                }
+              }.merge!(job_reference)
+            }
+            opts = {
+              upload_source: path,
+              content_type: "application/octet-stream",
+              # options: {
+              #   retries: @task['retries'],
+              #   timeout_sec: @task['timeout_sec'],
+              #   open_timeout_sec: @task['open_timeout_sec']
+              # },
+            }
+            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+            response = client.insert_job(@project, body, opts)
+            unless @task['is_skip_job_result_check']
+              wait_load('Load', response)
+            end
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
+          end
+        end
+
+        def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
+          begin
+            destination_dataset ||= @dataset
+            Embulk.logger.info {
+              "embulk-output-bigquery: Copy job starting... " \
+              "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
+            }
+            body = {
+              configuration: {
+                copy: {
+                  create_deposition: 'CREATE_IF_NEEDED',
+                  write_disposition: write_disposition,
+                  source_table: {
+                    project_id: @project,
+                    dataset_id: @dataset,
+                    table_id: source_table,
+                  },
+                  destination_table: {
+                    project_id: @project,
+                    dataset_id: destination_dataset,
+                    table_id: destination_table,
+                  },
+                }
+              }
+            }
+            opts = {}
+            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+            response = client.insert_job(@project, body, opts)
+            wait_load('Copy', response)
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
+              "to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
+          end
+        end
+
+        def wait_load(kind, response)
+          started = Time.now
+
+          wait_interval = @task['job_status_polling_interval']
+          max_polling_time = @task['job_status_max_polling_time']
+          _response = response
+
+          while true
+            job_id = _response.job_reference.job_id
+            elapsed = Time.now - started
+            status = _response.status.state
+            if status == "DONE"
+              Embulk.logger.info {
+                "embulk-output-bigquery: #{kind} job completed successfully... " \
+                "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
+              }
+              break
+            elsif elapsed.to_i > max_polling_time
+              message = "embulk-output-bigquery: Checking #{kind} job status... " \
+                "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
+              Embulk.logger.info { message }
+              raise JobTimeoutError.new(message)
+            else
+              Embulk.logger.info {
+                "embulk-output-bigquery: Checking #{kind} job status... " \
+                "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
+              }
+              sleep wait_interval
+              _response = client.get_job(@project, job_id)
+            end
+          end
+
+          # cf. http://www.rubydoc.info/github/google/google-api-ruby-client/Google/Apis/BigqueryV2/JobStatus#errors-instance_method
+          # `errors` returns Array<Google::Apis::BigqueryV2::ErrorProto> if any error exists.
+          # Otherwise, this returns nil.
+          if _errors = _response.status.errors
+            Embulk.logger.error {
+              "embulk-output-bigquery: get_job(#{@project}, #{job_id}), " \
+              "errors:#{_errors.map(&:to_h)}"
+            }
+            raise Error, "failed during waiting a job, errors:#{_errors.map(&:to_h)}"
+          end
+
+          _response
+        end
+
+        def create_dataset(dataset = nil, reference: nil)
+          dataset ||= @dataset
+          begin
+            Embulk.logger.info { "embulk-output-bigquery: Create dataset... #{@project}:#{dataset}" }
+            hint = {}
+            if reference
+              response = get_dataset(reference)
+              hint = { access: response.access }
+            end
+            body = {
+              dataset_reference: {
+                project_id: @project,
+                dataset_id: dataset,
+              },
+            }.merge(hint)
+            opts = {}
+            Embulk.logger.debug { "embulk-output-bigquery: insert_dataset(#{@project}, #{dataset}, #{body}, #{opts})" }
+            client.insert_dataset(@project, body, opts)
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            if e.status_code == 409 && /Already Exists:/ =~ e.message
+              # ignore 'Already Exists' error
+              return
+            end
+
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_dataset(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to create dataset #{@project}:#{dataset}, response:#{response}"
+          end
+        end
+
+        def get_dataset(dataset = nil)
+          dataset ||= @dataset
+          begin
+            Embulk.logger.info { "embulk-output-bigquery: Get dataset... #{@project}:#{@dataset}" }
+            client.get_dataset(@project, dataset)
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            if e.status_code == 404
+              raise NotFoundError, "Dataset #{@project}:#{dataset} is not found"
+            end
+
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: get_dataset(#{@project}, #{dataset}), response:#{response}"
+            }
+            raise Error, "failed to get dataset #{@project}:#{dataset}, response:#{response}"
+          end
+        end
+
+        def create_table(table)
+          begin
+            Embulk.logger.info { "embulk-output-bigquery: Create table... #{@project}:#{@dataset}.#{table}" }
+            body = {
+              table_reference: {
+                table_id: table,
+              },
+              schema: {
+                fields: fields,
+              }
+            }
+            opts = {}
+            Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{@dataset}, #{body}, #{opts})" }
+            client.insert_table(@project, @dataset, body, opts)
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            if e.status_code == 409 && /Already Exists:/ =~ e.message
+              # ignore 'Already Exists' error
+              return
+            end
+
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_table(#{@project}, #{@dataset}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to create table #{@project}:#{@dataset}.#{table}, response:#{response}"
+          end
+        end
+
+        def delete_table(table)
+          begin
+            Embulk.logger.info { "embulk-output-bigquery: Delete table... #{@project}:#{@dataset}.#{table}" }
+            client.delete_table(@project, @dataset, table)
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            if e.status_code == 404 && /Not found:/ =~ e.message
+              # ignore 'Not Found' error
+              return
+            end
+
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: delete_table(#{@project}, #{@dataset}, #{table}), response:#{response}"
+            }
+            raise Error, "failed to delete table #{@project}:#{@dataset}.#{table}, response:#{response}"
+          end
+        end
+
+        def get_table(table)
+          begin
+            Embulk.logger.info { "embulk-output-bigquery: Get table... #{@project}:#{@dataset}.#{table}" }
+            client.get_table(@project, @dataset, table)
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            if e.status_code == 404
+              raise NotFoundError, "Table #{@project}:#{@dataset}.#{table} is not found"
+            end
+
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: get_table(#{@project}, #{@dataset}, #{table}), response:#{response}"
+            }
+            raise Error, "failed to get table #{@project}:#{@dataset}.#{table}, response:#{response}"
+          end
+        end
+      end
+    end
+  end
+end
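
BigqueryClient is a thin wrapper around google-api-ruby-client, so its calls can also be sketched outside a full Embulk run. The snippet below is illustrative only and not part of the gem: the task hash is hand-built, its key names mirror the ones BigqueryClient reads above, the key file and schema file reuse the bundled example paths, and every other value (project, dataset, table, file paths) is made up. It also assumes the Embulk runtime and the google-api-client gems are already loaded so that Embulk.logger is available.

# Hypothetical usage sketch; all values are illustrative.
require_relative 'bigquery_client'

task = {
  'auth_method'                 => 'json_key',
  'json_keyfile'                => 'example/json_key.json',
  'project'                     => 'your-project-000',
  'dataset'                     => 'your_dataset_name',
  'application_name'            => 'Embulk BigQuery plugin',
  'retries'                     => 3,
  'timeout_sec'                 => 300,
  'open_timeout_sec'            => 300,
  'schema_file'                 => 'example/schema.json',
  'source_format'               => 'NEWLINE_DELIMITED_JSON',
  'max_bad_records'             => 0,
  'encoding'                    => 'UTF-8',
  'ignore_unknown_values'       => false,
  'allow_quoted_newlines'       => false,
  'prevent_duplicate_insert'    => false,
  'is_skip_job_result_check'    => false,
  'job_status_polling_interval' => 10,
  'job_status_max_polling_time' => 3600,
}

# schema may be nil here because the fields are taken from schema_file.
bigquery = Embulk::Output::Bigquery::BigqueryClient.new(task, nil)
bigquery.create_dataset                # a 409 'Already Exists' error is ignored
bigquery.create_table('example_table') # likewise idempotent
bigquery.load_in_parallel(['/tmp/example.0000.jsonl', '/tmp/example.0001.jsonl'], 'example_table')

Because the load configuration uses write_disposition: 'WRITE_APPEND', re-running the last call appends the rows again; prevent_duplicate_insert instead derives a deterministic job_id from the task, path, table, and fields (via Helper.create_job_id) so that a retried load is submitted as the same job.
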
data/lib/embulk/output/bigquery/file_writer.rb
@@ -0,0 +1,103 @@
+require 'zlib'
+require 'json'
+require 'csv'
+require_relative 'value_converter_factory'
+
+module Embulk
+  module Output
+    class Bigquery < OutputPlugin
+      class FileWriter
+        attr_reader :path
+
+        def initialize(task, schema, index, converters = nil)
+          @task = task
+          @schema = schema
+          @index = index
+          @converters = converters || ValueConverterFactory.create_converters(task, schema)
+
+          @num_input_rows = 0
+          @progress_log_timer = Time.now
+          @previous_num_input_rows = 0
+
+          if @task['payload_column_index']
+            @payload_column_index = @task['payload_column_index']
+            @formatter_proc = self.method(:to_payload)
+          else
+            case @task['source_format'].downcase
+            when 'csv'
+              @formatter_proc = self.method(:to_csv)
+            else
+              @formatter_proc = self.method(:to_jsonl)
+            end
+          end
+
+          @path = sprintf("#{@task['path_prefix']}#{@task['sequence_format']}#{@task['file_ext']}", Process.pid, index)
+          Embulk.logger.info { "embulk-output-bigquery: will create #{@path}" }
+          if File.exist?(@path)
+            Embulk.logger.warn { "embulk-output-bigquery: unlink already existing #{@path}" }
+            File.unlink(@path) rescue nil
+          end
+          @file_io = File.open(@path, 'w')
+
+          case @task['compression'].downcase
+          when 'gzip'
+            @io = Zlib::GzipWriter.new(@file_io)
+          else
+            @io = @file_io
+          end
+        end
+
+        def to_payload(record)
+          "#{record[@payload_column_index]}\n"
+        end
+
+        def to_csv(record)
+          record.map.with_index do |value, column_index|
+            @converters[column_index].call(value)
+          end.to_csv
+        end
+
+        def to_jsonl(record)
+          hash = {}
+          column_names = @schema.names
+          record.each_with_index do |value, column_index|
+            column_name = column_names[column_index]
+            hash[column_name] = @converters[column_index].call(value)
+          end
+          "#{hash.to_json}\n"
+        end
+
+        def num_format(number)
+          number.to_s.gsub(/(\d)(?=(\d{3})+(?!\d))/, '\1,')
+        end
+
+        def add(page)
+          # I once tried to offload the IO writing to a separate thread using SizedQueue,
+          # but it resulted in worse performance, so that code was removed.
+          page.each do |record|
+            Embulk.logger.trace { "embulk-output-bigquery: record #{record}" }
+            formatted_record = @formatter_proc.call(record)
+            Embulk.logger.trace { "embulk-output-bigquery: formatted_record #{formatted_record.chomp}" }
+            @io.write formatted_record
+            @num_input_rows += 1
+          end
+          now = Time.now
+          if @progress_log_timer < now - 10 # at most once every 10 seconds
+            speed = ((@num_input_rows - @previous_num_input_rows) / (now - @progress_log_timer).to_f).round(1)
+            @progress_log_timer = now
+            @previous_num_input_rows = @num_input_rows
+            Embulk.logger.info { "embulk-output-bigquery: num_input_rows #{num_format(@num_input_rows)} (#{num_format(speed)} rows/sec)" }
+          end
+        end
+
+        def commit
+          @io.close rescue nil
+          task_report = {
+            'num_input_rows' => @num_input_rows,
+            'path' => @path,
+          }
+        end
+      end
+    end
+  end
+end
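
FileWriter is the producer side of the same pipeline: each output task formats its pages into one local file, and the resulting paths are what BigqueryClient#load_in_parallel later uploads. The sketch below is illustrative only and not part of the gem; in a real run the task, schema, and page objects come from the Embulk runtime, and the path shown in the comment is a made-up instance of the path_prefix/sequence_format/file_ext template.

# Hypothetical per-task flow; in a real run Embulk drives these calls.
file_writer = Embulk::Output::Bigquery::FileWriter.new(task, schema, 0)

# Each add() formats one page of records with the formatter chosen in
# initialize (payload column, CSV, or JSONL) and appends it to the file.
file_writer.add(page)

task_report = file_writer.commit
# e.g. { 'num_input_rows' => 16, 'path' => '/tmp/embulk_output.12345.0.jsonl.gz' }

# The transaction step (in bigquery.rb, not shown here) then gathers the
# per-task paths and loads them in one parallel pass.
bigquery = Embulk::Output::Bigquery::BigqueryClient.new(task, schema)
bigquery.load_in_parallel([task_report['path']], 'example_table')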