embulk-output-bigquery 0.3.0 → 0.3.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 06dd5ff0d084e46e4e6d3c5b4428b5a0500a69e1
-  data.tar.gz: 974f71b43c073e5f324e27232cac48100ce5c9bf
+  metadata.gz: f21e4f5989b1aa631de606560ee75591a113c6f5
+  data.tar.gz: da801735b3ad2871a5d78bdde79d4f8e5e87ca30
 SHA512:
-  metadata.gz: 55fc22719752768be1f4a45b1e7a4c011f01a75dfd68840f1deadbed8e2aa4cd88071ddbeaf9e3ea63ac0cdbe0b875756a51f9ef024c08f5722545833da6b5e3
-  data.tar.gz: a5d30638f60ad162950219fbc58a525dd11662cc86b05d94cb02b8fd7dbf089a644a20f5a35a06e51e85f8685f5a1adb1fb07e24c46cc7d240f7453355c77953
+  metadata.gz: 582b300dacd9a45e39b424c3d0c0c3a887f5edc860430b2f0341df945ee723c0c8c5458619f28f18b2f028fe214fc3dbf58afd2751735bd2c143addb5ba164b3
+  data.tar.gz: 593d02fb4ec66bff1e3095e7e65f4d9b2adc3cb471ec3e998007ccc0fef73cfb48ad1bee6b0ee232d45ad41031affacb941e1d89097520052de007310d769465
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+## 0.3.1 - 2016-04-15
+
+* [new feature] Add `sdk_log_level` option to show logs of google-api-client
+* [maintenance] Fix `prevent_duplicate_insert` not working correctly
+* [maintenance] Get `num_output_rows` of `transaction_report` from the `get_table` API
+* [maintenance] Log response.statistics of load jobs
+* [maintenance] Always create job_id on the client side, as [Google recommends](https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs), so that duplication does not occur
+* [maintenance] Fix a possibility that rehearsal would load a 0-row file
+
 ## 0.3.0 - 2016-04-08
 
 Big change is introduced. Now, embulk-output-bigquery is written in JRuby.
data/README.md CHANGED
@@ -39,7 +39,7 @@ OAuth flow for installed applications.
 | auto_create_table | boolean | optional | false | [See below](#dynamic-table-creating) |
 | schema_file | string | optional | | /path/to/schema.json |
 | template_table | string | optional | | template table name [See below](#dynamic-table-creating) |
-| prevent_duplicate_insert | boolean | optional | false | [See below](#data-consistency) |
+| prevent_duplicate_insert | boolean | optional | false | [See below](#prevent-duplication) |
 | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
 | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
 | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
@@ -59,6 +59,7 @@ Client or request options
 | open_timeout_sec | integer | optional | 300 | Seconds to wait for the connection to open |
 | retries | integer | optional | 5 | Number of retries |
 | application_name | string | optional | "Embulk BigQuery plugin" | User-Agent |
+| sdk_log_level | string | optional | nil (WARN) | Log level of the google-api-client library |
 
 Options for intermediate local files
 
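The new `sdk_log_level` option slots in next to the existing client options. A minimal sketch of an `out:` section that surfaces google-api-client logs at DEBUG level (the other keys are taken from the bundled example config):

```yaml
out:
  type: bigquery
  mode: append
  auth_method: json_key
  json_keyfile: example/your-project-000.json
  dataset: your_dataset_name
  table: your_table_name
  sdk_log_level: DEBUG   # leaving it unset (nil) keeps the SDK at its default WARN level
```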
@@ -317,15 +318,15 @@ out:
   payload_column_index: 0 # or, payload_column: payload
 ```
 
-### Data Consistency
+### Prevent Duplication
 
-When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options to prevent duplicate data insertion.
+The `prevent_duplicate_insert` option is used to prevent inserting the same data twice in `append` or `append_direct` mode.
 
-`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
+When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generates the job ID from the md5 hash of the file and other options:
 
-[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency). So same data can't insert with same settings.
+`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
 
-In other words, you can retry as many times as you like, in case something bad error(like network error) happens before job insertion.
+[The job ID must be unique (including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency), so the same data can't be inserted repeatedly with the same settings.
 
 ```yaml
 out:
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.0"
+  spec.version = "0.3.1"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -18,7 +18,7 @@ in:
     - {name: boolean, type: boolean}
 out:
   type: bigquery
-  mode: replace
+  mode: append
   auth_method: json_key
   json_keyfile: example/your-project-000.json
   dataset: your_dataset_name
@@ -44,6 +44,7 @@ module Embulk
         'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
         'schema_file' => config.param('schema_file', :string, :default => nil),
         'template_table' => config.param('template_table', :string, :default => nil),
+
         'delete_from_local_when_job_end' => config.param('delete_from_local_when_job_end', :bool, :default => true),
         'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
         'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
@@ -62,6 +63,7 @@ module Embulk
         'open_timeout_sec' => config.param('open_timeout_sec', :integer, :default => 300),
         'retries' => config.param('retries', :integer, :default => 5),
         'application_name' => config.param('application_name', :string, :default => 'Embulk BigQuery plugin'),
+        'sdk_log_level' => config.param('sdk_log_level', :string, :default => nil),
 
         'path_prefix' => config.param('path_prefix', :string, :default => nil),
         'sequence_format' => config.param('sequence_format', :string, :default => '.%d.%d'),
@@ -201,6 +203,10 @@ module Embulk
          task['rehearsal_table'] ||= "LOAD_REHEARSAL_#{unique_name}_#{task['table']}"
        end
 
+        if task['sdk_log_level']
+          Google::Apis.logger.level = eval("::Logger::#{task['sdk_log_level'].upcase}")
+        end
+
        task
      end
 
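For context, the added lines map the configured level name onto the google-api-client module logger. A minimal sketch of the same mapping using `Logger.const_get` instead of `eval`, assuming the value is one of DEBUG, INFO, WARN, ERROR, or FATAL:

```ruby
require 'logger'
require 'google/apis'

# Sketch: resolve a configured level name (e.g. "debug") to a Logger constant
# and apply it to the google-api-client module logger.
sdk_log_level = 'debug'
Google::Apis.logger.level = Logger.const_get(sdk_log_level.upcase)
```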
@@ -220,16 +226,16 @@ module Embulk
        @rehearsal_thread = rehearsal_thread
      end
 
-      def self.transaction_report(task_reports, responses)
-        num_input_rows = task_reports.inject(0) do |sum, task_report|
-          sum + task_report['num_input_rows']
-        end
-        num_output_rows = responses.inject(0) do |sum, response|
+      def self.transaction_report(file_writers, responses, target_table)
+        num_input_rows = file_writers.empty? ? 0 : file_writers.map(&:num_rows).inject(:+)
+        num_response_rows = responses.inject(0) do |sum, response|
          sum + (response ? response.statistics.load.output_rows.to_i : 0)
        end
+        num_output_rows = bigquery.get_table(target_table).num_rows.to_i
        num_rejected_rows = num_input_rows - num_output_rows
        transaction_report = {
          'num_input_rows' => num_input_rows,
+          'num_response_rows' => num_response_rows,
          'num_output_rows' => num_output_rows,
          'num_rejected_rows' => num_rejected_rows,
        }
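With this change, `num_output_rows` comes from the `get_table` API rather than from the load job responses, and the response-derived count is kept under a new key. For a run in which every input row loads, the resulting report would look roughly like this (values illustrative):

```ruby
{
  'num_input_rows'    => 1000,  # rows written by the file writers
  'num_response_rows' => 1000,  # sum of output_rows over the load job responses
  'num_output_rows'   => 1000,  # num_rows reported by get_table on the target table
  'num_rejected_rows' => 0,     # num_input_rows - num_output_rows
}
```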
@@ -278,12 +284,12 @@ module Embulk
          path_pattern = "#{task['path_prefix']}*#{task['file_ext']}"
          Embulk.logger.info { "embulk-output-bigquery: Skip file generation. Get paths from `#{path_pattern}`" }
          paths = Dir.glob(path_pattern)
-          task_reports = paths.map {|path| { 'num_input_rows' => 0 } }
        else
          task_reports = yield(task) # generates local files
-          Embulk.logger.info { "embulk-output-bigquery: task_reports: #{task_reports.to_json}" }
-          paths = FileWriter.paths
-          FileWriter.ios.values.each do |io|
+
+          ios = file_writers.map(&:io)
+          paths = ios.map(&:path)
+          ios.each do |io|
            Embulk.logger.debug { "close #{io.path}" }
            io.close rescue nil
          end
@@ -298,7 +304,7 @@ module Embulk
        else
          target_table = task['temp_table'] ? task['temp_table'] : task['table']
          responses = bigquery.load_in_parallel(paths, target_table)
-          transaction_report = self.transaction_report(task_reports, responses)
+          transaction_report = self.transaction_report(file_writers, responses, target_table)
          Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
 
          if task['mode'] == 'replace_backup'
@@ -339,37 +345,63 @@ module Embulk
        return next_config_diff
      end
 
-      # instance is created on each thread
+      @file_writers_mutex = Mutex.new
+      @file_writers = Array.new
+
+      def self.reset_file_writers
+        @file_writers = Array.new
+      end
+
+      def self.file_writers
+        @file_writers
+      end
+
+      def self.add_file_writer(file_writer)
+        @file_writers_mutex.synchronize do
+          @file_writers << file_writer
+        end
+      end
+
+      FILE_WRITER_KEY = :embulk_output_bigquery_file_writer
+
+      # Create one FileWriter object per output thread, that is, share it among tasks.
+      # These shared objects are closed in the transaction.
+      # This is mainly to keep the number of files small (or controllable via -X max_threads),
+      # since that number equals the concurrency of the parallel load, even when there are many input tasks.
+      #
+      # #file_writer must be called only from #add because the threads in other methods
+      # are different (called from non-output threads). Note also that the #add method
+      # of the same task instance may be called from different output threads.
+      def file_writer
+        return Thread.current[FILE_WRITER_KEY] if Thread.current[FILE_WRITER_KEY]
+        file_writer = FileWriter.new(@task, @schema, @index, self.class.converters)
+        self.class.add_file_writer(file_writer)
+        Thread.current[FILE_WRITER_KEY] = file_writer
+      end
+
+      # instance is created on each task
      def initialize(task, schema, index)
        super
 
        if task['with_rehearsal'] and @index == 0
          @rehearsaled = false
-          @num_rows = 0
-        end
-
-        unless task['skip_file_generation']
-          @file_writer = FileWriter.new(task, schema, index, self.class.converters)
        end
      end
 
-      # called for each page in each thread
+      # called for each page in each task
      def close
      end
 
-      # called for each page in each thread
+      # called for each page in each task
      def add(page)
+        return if task['skip_file_generation']
+        num_rows = file_writer.add(page)
+
        if task['with_rehearsal'] and @index == 0 and !@rehearsaled
-          page = page.to_a # to avoid https://github.com/embulk/embulk/issues/403
-          if @num_rows >= task['rehearsal_counts']
+          if num_rows >= task['rehearsal_counts']
            load_rehearsal
            @rehearsaled = true
          end
-          @num_rows += page.to_a.size
-        end
-
-        unless task['skip_file_generation']
-          @file_writer.add(page)
        end
      end
 
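The class-level registry plus thread-local lookup added above is the core of this change: each output thread lazily creates exactly one FileWriter, memoizes it in `Thread.current`, and records it in a Mutex-guarded array so the transaction can enumerate and close every writer afterwards. A stripped-down sketch of the same pattern (names here are hypothetical, not the plugin's):

```ruby
require 'stringio'

# Sketch: one writer object per output thread, registered globally so a
# coordinating thread can close them all later.
class WriterRegistry
  KEY = :example_writer

  @writers = []
  @mutex = Mutex.new

  class << self
    attr_reader :writers

    def register(writer)
      @mutex.synchronize { @writers << writer }
    end

    # Memoized per thread; registration happens only on first creation.
    def current
      Thread.current[KEY] ||= StringIO.new.tap { |w| register(w) }
    end
  end
end

# Each output thread gets its own writer; the registry sees every one of them.
threads = 4.times.map { Thread.new { WriterRegistry.current.write("row\n") } }
threads.each(&:join)
WriterRegistry.writers.each(&:close)
```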
@@ -377,11 +409,11 @@ module Embulk
        bigquery = self.class.bigquery
        Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
 
-        io = @file_writer.close # need to close once for gzip
+        io = file_writer.close # need to close once for gzip
        rehearsal_path = "#{io.path}.rehearsal"
        Embulk.logger.debug { "embulk_output_bigquery: cp #{io.path} #{rehearsal_path}" }
        FileUtils.cp(io.path, rehearsal_path)
-        @file_writer.reopen
+        file_writer.reopen
 
        self.class.rehearsal_thread = Thread.new do
          begin
@@ -403,13 +435,9 @@ module Embulk
      def abort
      end
 
-      # called after processing all pages in each thread, returns a task_report
+      # called after processing all pages in each task, returns a task_report
      def commit
-        unless task['skip_file_generation']
-          @file_writer.commit
-        else
-          {}
-        end
+        {}
      end
    end
  end
@@ -107,49 +107,46 @@ module Embulk
      #
      # We before had a `max_load_parallels` option, but this was not extensible for map reduce executor
      # So, we dropped it. See https://github.com/embulk/embulk-output-bigquery/pull/35
-      max_load_parallels = paths.size # @task['max_load_parallels'] || paths.size
      responses = []
-      paths.each_with_index.each_slice(max_load_parallels) do |paths_group|
-        Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths_group}" }
-        threads = []
-        paths_group.each do |path, idx|
-          threads << Thread.new do
-            # I am not sure whether google-api-ruby-client is thread-safe,
-            # so let me create new instances for each thread for safe
-            bigquery = self.class.new(@task, @schema, fields)
-            response = bigquery.load(path, table)
-            [idx, response]
-          end
-        end
-        ThreadsWait.all_waits(*threads) do |th|
-          idx, response = th.value # raise errors occurred in threads
-          responses[idx] = response
+      threads = []
+      Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" }
+      paths.each_with_index do |path, idx|
+        threads << Thread.new do
+          # I am not sure whether google-api-ruby-client is thread-safe,
+          # so let me create new instances for each thread for safe
+          bigquery = self.class.new(@task, @schema, fields)
+          response = bigquery.load(path, table)
+          [idx, response]
        end
      end
+      ThreadsWait.all_waits(*threads) do |th|
+        idx, response = th.value # raise errors occurred in threads
+        responses[idx] = response
+      end
      responses
    end
 
    def load(path, table)
      begin
        if File.exist?(path)
-          Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} => #{@project}:#{@dataset}.#{table}" }
+          # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+          # we should generate job_id in client code, otherwise, retrying would cause duplication
+          if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+            job_id = Helper.create_load_job_id(@task, path, fields)
+          else
+            job_id = "embulk_load_job_#{SecureRandom.uuid}"
+          end
+          Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
        else
          Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
          return
        end
 
-        if @task['prevent_duplicate_insert']
-          job_reference = {
-            job_reference: {
-              project_id: @project,
-              job_id: Helper.create_job_id(@task, path, table, fields),
-            }
-          }
-        else
-          job_reference = {}
-        end
-
        body = {
+          job_reference: {
+            project_id: @project,
+            job_id: job_id,
+          },
          configuration: {
            load: {
              destination_table: {
@@ -168,8 +165,9 @@ module Embulk
              ignore_unknown_values: @task['ignore_unknown_values'],
              allow_quoted_newlines: @task['allow_quoted_newlines'],
            }
-          }.merge!(job_reference)
+          }
        }
+
        opts = {
          upload_source: path,
          content_type: "application/octet-stream",
@@ -182,7 +180,7 @@
        Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
        response = client.insert_job(@project, body, opts)
        unless @task['is_skip_job_result_check']
-          wait_load('Load', response)
+          response = wait_load('Load', response)
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        response = {status_code: e.status_code, message: e.message, error_class: e.class}
@@ -196,11 +194,18 @@ module Embulk
    def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
      begin
        destination_dataset ||= @dataset
+        job_id = "embulk_copy_job_#{SecureRandom.uuid}"
+
        Embulk.logger.info {
-          "embulk-output-bigquery: Copy job starting... " \
+          "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
          "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
        }
+
        body = {
+          job_reference: {
+            project_id: @project,
+            job_id: job_id,
+          },
          configuration: {
            copy: {
              create_deposition: 'CREATE_IF_NEEDED',
@@ -218,6 +223,7 @@
            }
          }
        }
+
        opts = {}
        Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
        response = client.insert_job(@project, body, opts)
@@ -246,18 +252,18 @@ module Embulk
        if status == "DONE"
          Embulk.logger.info {
            "embulk-output-bigquery: #{kind} job completed... " \
-            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
          }
          break
        elsif elapsed.to_i > max_polling_time
-          message = "embulk-output-bigquery: Checking #{kind} job status... " \
-            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
+          message = "embulk-output-bigquery: #{kind} job checking... " \
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
          Embulk.logger.info { message }
          raise JobTimeoutError.new(message)
        else
          Embulk.logger.info {
-            "embulk-output-bigquery: Checking #{kind} job status... " \
-            "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
+            "embulk-output-bigquery: #{kind} job checking... " \
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
          }
          sleep wait_interval
          _response = client.get_job(@project, job_id)
@@ -275,6 +281,8 @@
        raise Error, "failed during waiting a #{kind} job, errors:#{_errors.map(&:to_h)}"
      end
 
+      Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
+
      _response
    end
 
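The polling loop above keeps checking the job until it reaches DONE, raising `JobTimeoutError` once `job_status_max_polling_time` is exceeded and sleeping `job_status_polling_interval` between checks; the new log line surfaces `response.statistics` once the job finishes. A condensed sketch of that control flow, assuming the job resource exposes `status.state` as the BigQuery API does (method and variable names here are illustrative, not the plugin's):

```ruby
# Condensed sketch of the polling control flow (not the plugin's exact code).
def wait_for_job(client, project_id, job_id, max_polling_time:, wait_interval:)
  started = Time.now
  response = client.get_job(project_id, job_id)
  loop do
    state = response.status.state            # "PENDING", "RUNNING", or "DONE"
    break if state == 'DONE'
    elapsed = Time.now - started
    raise "job #{job_id} timed out" if elapsed.to_i > max_polling_time
    sleep wait_interval
    response = client.get_job(project_id, job_id)   # refresh job status
  end
  response
end
```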
@@ -7,15 +7,17 @@ module Embulk
  module Output
    class Bigquery < OutputPlugin
      class FileWriter
+        attr_reader :num_rows
+
        def initialize(task, schema, index, converters = nil)
          @task = task
          @schema = schema
          @index = index
          @converters = converters || ValueConverterFactory.create_converters(task, schema)
 
-          @num_input_rows = 0
+          @num_rows = 0
          @progress_log_timer = Time.now
-          @previous_num_input_rows = 0
+          @previous_num_rows = 0
 
          if @task['payload_column_index']
            @payload_column_index = @task['payload_column_index']
@@ -30,35 +32,8 @@ module Embulk
          end
        end
 
-        @mutex = Mutex.new
-        @ios = Hash.new
-
-        def self.mutex
-          @mutex
-        end
-
-        def self.reset_ios
-          @ios = Hash.new
-        end
-
-        def self.ios
-          @ios
-        end
-
-        def self.paths
-          @ios.keys
-        end
-
-        THREAD_LOCAL_IO_KEY = :embulk_output_bigquery_file_writer_io
-
-        # Create one io object for one output thread, that is, share among tasks
-        # Close theses shared io objects in transaction
-        #
-        # Thread IO must be created at #add because threads in #initialize or #commit
-        # are different (called from non-output threads). Note also that #add of the
-        # same instance would be called in different output threads
-        def thread_io
-          return Thread.current[THREAD_LOCAL_IO_KEY] if Thread.current[THREAD_LOCAL_IO_KEY]
+        def io
+          return @io if @io
 
          path = sprintf(
            "#{@task['path_prefix']}#{@task['sequence_format']}#{@task['file_ext']}",
@@ -70,7 +45,7 @@ module Embulk
          end
          Embulk.logger.info { "embulk-output-bigquery: create #{path}" }
 
-          open(path, 'w')
+          @io = open(path, 'w')
        end
 
        def open(path, mode = 'w')
@@ -81,21 +56,16 @@ module Embulk
          else
            io = file_io
          end
-          self.class.mutex.synchronize do
-            self.class.ios[path] = io
-          end
-          Thread.current[THREAD_LOCAL_IO_KEY] = io
+          io
        end
 
        def close
-          io = thread_io
          io.close rescue nil
          io
        end
 
        def reopen
-          io = thread_io
-          open(io.path, 'a')
+          @io = open(io.path, 'a')
        end
 
        def to_payload(record)
@@ -123,29 +93,24 @@ module Embulk
          end
        end
 
        def add(page)
-          io = thread_io
+          _io = io
          # I once tried to split IO writing into another IO thread using SizedQueue
          # However, it resulted in worse performance, so I removed the codes.
          page.each do |record|
            Embulk.logger.trace { "embulk-output-bigquery: record #{record}" }
            formatted_record = @formatter_proc.call(record)
            Embulk.logger.trace { "embulk-output-bigquery: formatted_record #{formatted_record.chomp}" }
-            io.write formatted_record
-            @num_input_rows += 1
+            _io.write formatted_record
+            @num_rows += 1
          end
          now = Time.now
          if @progress_log_timer < now - 10 # once in 10 seconds
-            speed = ((@num_input_rows - @previous_num_input_rows) / (now - @progress_log_timer).to_f).round(1)
+            speed = ((@num_rows - @previous_num_rows) / (now - @progress_log_timer).to_f).round(1)
            @progress_log_timer = now
-            @previous_num_input_rows = @num_input_rows
-            Embulk.logger.info { "embulk-output-bigquery: num_input_rows #{num_format(@num_input_rows)} (#{num_format(speed)} rows/sec)" }
+            @previous_num_rows = @num_rows
+            Embulk.logger.info { "embulk-output-bigquery: num_rows #{num_format(@num_rows)} (#{num_format(speed)} rows/sec)" }
          end
-        end
-
-        def commit
-          task_report = {
-            'num_input_rows' => @num_input_rows,
-          }
+          @num_rows
        end
      end
    end
@@ -1,4 +1,5 @@
 require 'digest/md5'
+require 'securerandom'
 
 module Embulk
   module Output
@@ -52,11 +53,11 @@ module Embulk
        end
      end
 
-      def self.create_job_id(task, path, table, fields)
+      def self.create_load_job_id(task, path, fields)
        elements = [
          Digest::MD5.file(path).hexdigest,
          task['dataset'],
-          table,
+          task['table'],
          fields,
          task['source_format'],
          task['max_bad_records'],
@@ -68,9 +69,7 @@ module Embulk
 
        str = elements.map(&:to_s).join('')
        md5 = Digest::MD5.hexdigest(str)
-        job_id = "embulk_job_#{md5}"
-        Embulk.logger.debug { "embulk-output-bigquery: create_job_id(#{path}, #{table}) #=> #{job_id}" }
-        job_id
+        "embulk_load_job_#{md5}"
      end
    end
  end
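The renamed `Helper.create_load_job_id` is split across the two hunks above; pulled together, the deterministic ID described in the README's Prevent Duplication section amounts to roughly the following sketch (the option keys match those used in the test below; the method name here is illustrative):

```ruby
require 'digest/md5'

# Sketch: deterministic load job ID derived from the file's md5 and the load settings.
def example_load_job_id(task, path, fields)
  elements = [
    Digest::MD5.file(path).hexdigest,   # md5(file)
    task['dataset'],
    task['table'],
    fields,                             # schema
    task['source_format'],
    task['max_bad_records'],
    task['field_delimiter'],
    task['encoding'],
    task['ignore_unknown_values'],
    task['allow_quoted_newlines'],
  ]
  "embulk_load_job_#{Digest::MD5.hexdigest(elements.map(&:to_s).join)}"
end
```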
@@ -16,11 +16,6 @@ module Embulk
        end
      end
 
-      def setup
-        Thread.current[FileWriter::THREAD_LOCAL_IO_KEY] = nil
-        FileWriter.reset_ios
-      end
-
      def default_task
        {
          'compression' => 'GZIP',
@@ -65,7 +60,7 @@ module Embulk
      ensure
        io.close rescue nil
      end
-      path = FileWriter.paths.first
+      path = file_writer.io.path
      assert_equal 'tmp/foo.1', path
    end
  end
@@ -108,12 +103,12 @@ module Embulk
 
      begin
        file_writer.add(page)
-        io = FileWriter.ios.values.first
+        io = file_writer.io
        assert_equal Zlib::GzipWriter, io.class
      ensure
        io.close rescue nil
      end
-      path = FileWriter.paths.first
+      path = file_writer.io.path
      assert_true File.exist?(path)
      assert_nothing_raised { Zlib::GzipReader.open(path) {|gz| } }
    end
@@ -124,12 +119,12 @@ module Embulk
 
      begin
        file_writer.add(page)
-        io = FileWriter.ios.values.first
+        io = file_writer.io
        assert_equal File, io.class
      ensure
        io.close rescue nil
      end
-      path = FileWriter.paths.first
+      path = file_writer.io.path
      assert_true File.exist?(path)
      assert_raise { Zlib::GzipReader.open(path) {|gz| } }
    end
data/test/test_helper.rb CHANGED
@@ -81,9 +81,10 @@ module Embulk
      end
    end
 
-    def test_create_job_id
+    def test_create_load_job_id
      task = {
        'dataset' => 'your_dataset_name',
+        'table' => 'your_table_name',
        'source_format' => 'CSV',
        'max_bad_records' => nil,
        'field_delimiter' => ',',
@@ -95,7 +96,7 @@ module Embulk
        name: 'a', type: 'STRING',
      }
      File.write("tmp/your_file_name", "foobarbaz")
-      job_id = Helper.create_job_id(task, 'tmp/your_file_name', 'your_table_name', fields)
+      job_id = Helper.create_load_job_id(task, 'tmp/your_file_name', fields)
      assert job_id.is_a?(String)
    end
  end
@@ -40,6 +40,7 @@ module Embulk
        mock(obj).get_dataset(config['dataset'])
        mock(obj).create_table(config['temp_table'])
        mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+        mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
        mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
        mock(obj).delete_table(config['temp_table'])
      end
@@ -53,6 +54,7 @@ module Embulk
        mock(obj).get_dataset(config['dataset'])
        mock(obj).get_table(config['table'])
        mock(obj).load_in_parallel(anything, config['table']) { [] }
+        mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
      end
      Bigquery.transaction(config, schema, processor_count, &control)
    end
@@ -63,6 +65,7 @@ module Embulk
        mock(obj).create_dataset(config['dataset'])
        mock(obj).create_table(config['table'])
        mock(obj).load_in_parallel(anything, config['table']) { [] }
+        mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
      end
      Bigquery.transaction(config, schema, processor_count, &control)
    end
@@ -75,6 +78,7 @@ module Embulk
        mock(obj).delete_table(config['table'])
        mock(obj).create_table(config['table'])
        mock(obj).load_in_parallel(anything, config['table']) { [] }
+        mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
      end
      Bigquery.transaction(config, schema, processor_count, &control)
    end
@@ -85,6 +89,7 @@ module Embulk
        mock(obj).get_dataset(config['dataset'])
        mock(obj).create_table(config['temp_table'])
        mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+        mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
        mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
        mock(obj).delete_table(config['temp_table'])
      end
@@ -99,6 +104,7 @@ module Embulk
        mock(obj).get_dataset(config['dataset_old'])
        mock(obj).create_table(config['temp_table'])
        mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+        mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
 
        mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
 
@@ -115,6 +121,7 @@ module Embulk
        mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
        mock(obj).create_table(config['temp_table'])
        mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+        mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
 
        mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
 
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-08 00:00:00.000000000 Z
+date: 2016-04-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client