embulk-output-bigquery 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 06dd5ff0d084e46e4e6d3c5b4428b5a0500a69e1
- data.tar.gz: 974f71b43c073e5f324e27232cac48100ce5c9bf
+ metadata.gz: f21e4f5989b1aa631de606560ee75591a113c6f5
+ data.tar.gz: da801735b3ad2871a5d78bdde79d4f8e5e87ca30
  SHA512:
- metadata.gz: 55fc22719752768be1f4a45b1e7a4c011f01a75dfd68840f1deadbed8e2aa4cd88071ddbeaf9e3ea63ac0cdbe0b875756a51f9ef024c08f5722545833da6b5e3
- data.tar.gz: a5d30638f60ad162950219fbc58a525dd11662cc86b05d94cb02b8fd7dbf089a644a20f5a35a06e51e85f8685f5a1adb1fb07e24c46cc7d240f7453355c77953
+ metadata.gz: 582b300dacd9a45e39b424c3d0c0c3a887f5edc860430b2f0341df945ee723c0c8c5458619f28f18b2f028fe214fc3dbf58afd2751735bd2c143addb5ba164b3
+ data.tar.gz: 593d02fb4ec66bff1e3095e7e65f4d9b2adc3cb471ec3e998007ccc0fef73cfb48ad1bee6b0ee232d45ad41031affacb941e1d89097520052de007310d769465
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+ ## 0.3.1 - 2016-04-15
+
+ * [new feature] Add `sdk_log_level` option to show logs of google-api-client
+ * [maintenance] Fix `prevent_duplicate_insert`, which was not working correctly
+ * [maintenance] Get `num_output_rows` of `transaction_report` from the `get_table` API
+ * [maintenance] Log response.statistics of load jobs
+ * [maintenance] Always create job_id on the client side, as [google recommends](https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs), so that duplication does not occur
+ * [maintenance] Fix a possibility that rehearsal would load a 0-row file
+
  ## 0.3.0 - 2016-04-08

  Big change is introduced. Now, embulk-output-bigquery is written in JRuby.
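The new `sdk_log_level` option listed above controls the verbosity of the underlying google-api-client library. A minimal sketch of the effect, assuming google-api-client is installed; the plugin performs this mapping from the config value for you:

```ruby
require 'google/apis/bigquery_v2'
require 'logger'

# Roughly what `sdk_log_level: debug` amounts to: the configured string is
# upcased and mapped to a ::Logger severity on the shared client logger.
Google::Apis.logger.level = Logger::DEBUG
```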
data/README.md CHANGED
@@ -39,7 +39,7 @@ OAuth flow for installed applications.
  | auto_create_table | boolean | optional | false | [See below](#dynamic-table-creating) |
  | schema_file | string | optional | | /path/to/schema.json |
  | template_table | string | optional | | template table name [See below](#dynamic-table-creating) |
- | prevent_duplicate_insert | boolean | optional | false | [See below](#data-consistency) |
+ | prevent_duplicate_insert | boolean | optional | false | [See below](#prevent-duplication) |
  | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
  | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
  | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
@@ -59,6 +59,7 @@ Client or request options
  | open_timeout_sec | integer | optional | 300 | Seconds to wait for the connection to open |
  | retries | integer | optional | 5 | Number of retries |
  | application_name | string | optional | "Embulk BigQuery plugin" | User-Agent |
+ | sdk_log_level | string | optional | nil (WARN) | Log level of the google-api-client library |

  Options for intermediate local files

@@ -317,15 +318,15 @@ out:
  payload_column_index: 0 # or, payload_column: payload
  ```

- ### Data Consistency
+ ### Prevent Duplication

- When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options to prevent duplicate data insertion.
+ The `prevent_duplicate_insert` option prevents inserting the same data twice in `append` or `append_direct` mode.

- `job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
+ When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generates a job ID from the md5 hash of the file and other options.

- [job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency). So same data can't insert with same settings.
+ `job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`

- In other words, you can retry as many times as you like, in case something bad error(like network error) happens before job insertion.
+ [A job ID must be unique (including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency), so the same data cannot be inserted repeatedly with the same settings.

  ```yaml
  out:
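The job-ID formula documented above can be read as plain Ruby. A minimal sketch with hypothetical values; the real elements come from the task configuration and the generated schema:

```ruby
require 'digest/md5'

elements = [
  Digest::MD5.file('data.00.0000.csv.gz').hexdigest, # md5(file); hypothetical path
  'your_dataset_name',                               # dataset
  'your_table_name',                                 # table
  '[{"name":"id","type":"INTEGER"}]',                # schema (fields)
  'CSV',                                             # source_format
  ',',                                               # file_delimiter
  0,                                                 # max_bad_records
  'UTF-8',                                           # encoding
  false,                                             # ignore_unknown_values
  false,                                             # allow_quoted_newlines
]
job_id = "embulk_load_job_#{Digest::MD5.hexdigest(elements.map(&:to_s).join)}"
# The same file with the same settings always yields the same job_id, so
# BigQuery rejects a second insertion of identical data.
```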
@@ -1,6 +1,6 @@
  Gem::Specification.new do |spec|
  spec.name = "embulk-output-bigquery"
- spec.version = "0.3.0"
+ spec.version = "0.3.1"
  spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
  spec.summary = "Google BigQuery output plugin for Embulk"
  spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -18,7 +18,7 @@ in:
  - {name: boolean, type: boolean}
  out:
  type: bigquery
- mode: replace
+ mode: append
  auth_method: json_key
  json_keyfile: example/your-project-000.json
  dataset: your_dataset_name
@@ -44,6 +44,7 @@ module Embulk
  'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
  'schema_file' => config.param('schema_file', :string, :default => nil),
  'template_table' => config.param('template_table', :string, :default => nil),
+
  'delete_from_local_when_job_end' => config.param('delete_from_local_when_job_end', :bool, :default => true),
  'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
  'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
@@ -62,6 +63,7 @@ module Embulk
  'open_timeout_sec' => config.param('open_timeout_sec', :integer, :default => 300),
  'retries' => config.param('retries', :integer, :default => 5),
  'application_name' => config.param('application_name', :string, :default => 'Embulk BigQuery plugin'),
+ 'sdk_log_level' => config.param('sdk_log_level', :string, :default => nil),

  'path_prefix' => config.param('path_prefix', :string, :default => nil),
  'sequence_format' => config.param('sequence_format', :string, :default => '.%d.%d'),
@@ -201,6 +203,10 @@ module Embulk
  task['rehearsal_table'] ||= "LOAD_REHEARSAL_#{unique_name}_#{task['table']}"
  end

+ if task['sdk_log_level']
+ Google::Apis.logger.level = eval("::Logger::#{task['sdk_log_level'].upcase}")
+ end
+
  task
  end

@@ -220,16 +226,16 @@ module Embulk
  @rehearsal_thread = rehearsal_thread
  end

- def self.transaction_report(task_reports, responses)
- num_input_rows = task_reports.inject(0) do |sum, task_report|
- sum + task_report['num_input_rows']
- end
- num_output_rows = responses.inject(0) do |sum, response|
+ def self.transaction_report(file_writers, responses, target_table)
+ num_input_rows = file_writers.empty? ? 0 : file_writers.map(&:num_rows).inject(:+)
+ num_response_rows = responses.inject(0) do |sum, response|
  sum + (response ? response.statistics.load.output_rows.to_i : 0)
  end
+ num_output_rows = bigquery.get_table(target_table).num_rows.to_i
  num_rejected_rows = num_input_rows - num_output_rows
  transaction_report = {
  'num_input_rows' => num_input_rows,
+ 'num_response_rows' => num_response_rows,
  'num_output_rows' => num_output_rows,
  'num_rejected_rows' => num_rejected_rows,
  }
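The reworked report distinguishes the row count claimed by the load-job responses from the rows actually counted in the destination table via `get_table`. A worked sketch with hypothetical numbers:

```ruby
# Suppose three local files were written with 10 rows each, the load jobs
# reported 30 output rows, and the destination table ends up with 28 rows
# (hypothetical numbers, for illustration only).
num_input_rows    = [10, 10, 10].inject(:+)   # from file_writers.map(&:num_rows)
num_response_rows = 30                        # from response.statistics.load.output_rows
num_output_rows   = 28                        # from bigquery.get_table(target_table).num_rows
report = {
  'num_input_rows'    => num_input_rows,
  'num_response_rows' => num_response_rows,
  'num_output_rows'   => num_output_rows,
  'num_rejected_rows' => num_input_rows - num_output_rows, # => 2
}
```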
@@ -278,12 +284,12 @@ module Embulk
  path_pattern = "#{task['path_prefix']}*#{task['file_ext']}"
  Embulk.logger.info { "embulk-output-bigquery: Skip file generation. Get paths from `#{path_pattern}`" }
  paths = Dir.glob(path_pattern)
- task_reports = paths.map {|path| { 'num_input_rows' => 0 } }
  else
  task_reports = yield(task) # generates local files
- Embulk.logger.info { "embulk-output-bigquery: task_reports: #{task_reports.to_json}" }
- paths = FileWriter.paths
- FileWriter.ios.values.each do |io|
+
+ ios = file_writers.map(&:io)
+ paths = ios.map(&:path)
+ ios.each do |io|
  Embulk.logger.debug { "close #{io.path}" }
  io.close rescue nil
  end
@@ -298,7 +304,7 @@ module Embulk
  else
  target_table = task['temp_table'] ? task['temp_table'] : task['table']
  responses = bigquery.load_in_parallel(paths, target_table)
- transaction_report = self.transaction_report(task_reports, responses)
+ transaction_report = self.transaction_report(file_writers, responses, target_table)
  Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }

  if task['mode'] == 'replace_backup'
@@ -339,37 +345,63 @@ module Embulk
  return next_config_diff
  end

- # instance is created on each thread
+ @file_writers_mutex = Mutex.new
+ @file_writers = Array.new
+
+ def self.reset_file_writers
+ @file_writers = Array.new
+ end
+
+ def self.file_writers
+ @file_writers
+ end
+
+ def self.add_file_writer(file_writer)
+ @file_writers_mutex.synchronize do
+ @file_writers << file_writer
+ end
+ end
+
+ FILE_WRITER_KEY = :embulk_output_bigquery_file_writer
+
+ # Create one FileWriter object for one output thread, that is, share among tasks.
+ # Close theses shared objects in transaction.
+ # This is mainly to suppress (or control by -X max_threads) number of files, which
+ # equals to number of concurrency to load in parallel, when number of input tasks is many
+ #
+ # #file_writer must be called at only #add because threads in other methods
+ # are different (called from non-output threads). Note also that #add method
+ # of the same task instance would be called in different output threads
+ def file_writer
+ return Thread.current[FILE_WRITER_KEY] if Thread.current[FILE_WRITER_KEY]
+ file_writer = FileWriter.new(@task, @schema, @index, self.class.converters)
+ self.class.add_file_writer(file_writer)
+ Thread.current[FILE_WRITER_KEY] = file_writer
+ end
+
+ # instance is created on each task
  def initialize(task, schema, index)
  super

  if task['with_rehearsal'] and @index == 0
  @rehearsaled = false
- @num_rows = 0
- end
-
- unless task['skip_file_generation']
- @file_writer = FileWriter.new(task, schema, index, self.class.converters)
  end
  end

- # called for each page in each thread
+ # called for each page in each task
  def close
  end

- # called for each page in each thread
+ # called for each page in each task
  def add(page)
+ return if task['skip_file_generation']
+ num_rows = file_writer.add(page)
+
  if task['with_rehearsal'] and @index == 0 and !@rehearsaled
- page = page.to_a # to avoid https://github.com/embulk/embulk/issues/403
- if @num_rows >= task['rehearsal_counts']
+ if num_rows >= task['rehearsal_counts']
  load_rehearsal
  @rehearsaled = true
  end
- @num_rows += page.to_a.size
- end
-
- unless task['skip_file_generation']
- @file_writer.add(page)
  end
  end
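A toy illustration of the sharing pattern introduced above (not plugin code): each output thread memoizes one writer in `Thread.current`, and a mutex-guarded registry lets the transaction close them all later, so the number of local files tracks the number of output threads rather than the number of tasks. Names here are hypothetical stand-ins.

```ruby
WRITER_KEY = :writer_demo # stand-in for FILE_WRITER_KEY above

def writer_for_current_thread(registry, mutex)
  Thread.current[WRITER_KEY] ||= begin
    writer = Object.new # stand-in for FileWriter.new(task, schema, index, converters)
    mutex.synchronize { registry << writer }
    writer
  end
end

registry = []
mutex = Mutex.new
threads = 2.times.map do
  # each thread handles five "tasks" but still creates only one writer
  Thread.new { 5.times.map { writer_for_current_thread(registry, mutex) }.uniq.size }
end
puts threads.map(&:value).inspect # => [1, 1]
puts registry.size                # => 2, one writer per output thread
```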

@@ -377,11 +409,11 @@ module Embulk
  bigquery = self.class.bigquery
  Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }

- io = @file_writer.close # need to close once for gzip
+ io = file_writer.close # need to close once for gzip
  rehearsal_path = "#{io.path}.rehearsal"
  Embulk.logger.debug { "embulk_output_bigquery: cp #{io.path} #{rehearsal_path}" }
  FileUtils.cp(io.path, rehearsal_path)
- @file_writer.reopen
+ file_writer.reopen

  self.class.rehearsal_thread = Thread.new do
  begin
@@ -403,13 +435,9 @@ module Embulk
  def abort
  end

- # called after processing all pages in each thread, returns a task_report
+ # called after processing all pages in each task, returns a task_report
  def commit
- unless task['skip_file_generation']
- @file_writer.commit
- else
- {}
- end
+ {}
  end
  end
  end
@@ -107,49 +107,46 @@ module Embulk
  #
  # We before had a `max_load_parallels` option, but this was not extensible for map reduce executor
  # So, we dropped it. See https://github.com/embulk/embulk-output-bigquery/pull/35
- max_load_parallels = paths.size # @task['max_load_parallels'] || paths.size
  responses = []
- paths.each_with_index.each_slice(max_load_parallels) do |paths_group|
- Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths_group}" }
- threads = []
- paths_group.each do |path, idx|
- threads << Thread.new do
- # I am not sure whether google-api-ruby-client is thread-safe,
- # so let me create new instances for each thread for safe
- bigquery = self.class.new(@task, @schema, fields)
- response = bigquery.load(path, table)
- [idx, response]
- end
- end
- ThreadsWait.all_waits(*threads) do |th|
- idx, response = th.value # raise errors occurred in threads
- responses[idx] = response
+ threads = []
+ Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" }
+ paths.each_with_index do |path, idx|
+ threads << Thread.new do
+ # I am not sure whether google-api-ruby-client is thread-safe,
+ # so let me create new instances for each thread for safe
+ bigquery = self.class.new(@task, @schema, fields)
+ response = bigquery.load(path, table)
+ [idx, response]
  end
  end
+ ThreadsWait.all_waits(*threads) do |th|
+ idx, response = th.value # raise errors occurred in threads
+ responses[idx] = response
+ end
  responses
  end
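A toy sketch of the simplified fan-out above (not plugin code): one thread per local file, results collected in input order, with `th.value` re-raising any exception from the thread. The paths are hypothetical.

```ruby
require 'thwait'

paths = ['part.0.csv', 'part.1.csv', 'part.2.csv'] # hypothetical local files
responses = []
threads = paths.each_with_index.map do |path, idx|
  Thread.new do
    # in the plugin each thread builds its own client and calls load(path, table)
    [idx, "response for #{path}"]
  end
end
ThreadsWait.all_waits(*threads) do |th|
  idx, response = th.value # raises here if the thread raised
  responses[idx] = response
end
p responses # ordered by the original path index
```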

  def load(path, table)
  begin
  if File.exist?(path)
- Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} => #{@project}:#{@dataset}.#{table}" }
+ # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+ # we should generate job_id in client code, otherwise, retrying would cause duplication
+ if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+ job_id = Helper.create_load_job_id(@task, path, fields)
+ else
+ job_id = "embulk_load_job_#{SecureRandom.uuid}"
+ end
+ Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
  else
  Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
  return
  end

- if @task['prevent_duplicate_insert']
- job_reference = {
- job_reference: {
- project_id: @project,
- job_id: Helper.create_job_id(@task, path, table, fields),
- }
- }
- else
- job_reference = {}
- end
-
  body = {
+ job_reference: {
+ project_id: @project,
+ job_id: job_id,
+ },
  configuration: {
  load: {
  destination_table: {
@@ -168,8 +165,9 @@ module Embulk
  ignore_unknown_values: @task['ignore_unknown_values'],
  allow_quoted_newlines: @task['allow_quoted_newlines'],
  }
- }.merge!(job_reference)
+ }
  }
+
  opts = {
  upload_source: path,
  content_type: "application/octet-stream",
@@ -182,7 +180,7 @@ module Embulk
  Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
  response = client.insert_job(@project, body, opts)
  unless @task['is_skip_job_result_check']
- wait_load('Load', response)
+ response = wait_load('Load', response)
  end
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
  response = {status_code: e.status_code, message: e.message, error_class: e.class}
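With this change the request body always carries a client-generated `job_reference`. A sketch of its shape with hypothetical identifiers; the nested load fields are abbreviated, and the deterministic id is only used for `append`/`append_direct` with `prevent_duplicate_insert`:

```ruby
require 'securerandom'

job_id = "embulk_load_job_#{SecureRandom.uuid}" # or Helper.create_load_job_id(task, path, fields)
body = {
  job_reference: {
    project_id: 'your-project-000', # hypothetical project
    job_id: job_id,
  },
  configuration: {
    load: {
      # destination_table, source_format, etc. as in the diff above
    }
  }
}
# client.insert_job(project, body, upload_source: path, content_type: 'application/octet-stream')
```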
@@ -196,11 +194,18 @@ module Embulk
  def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
  begin
  destination_dataset ||= @dataset
+ job_id = "embulk_copy_job_#{SecureRandom.uuid}"
+
  Embulk.logger.info {
- "embulk-output-bigquery: Copy job starting... " \
+ "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
  "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
  }
+
  body = {
+ job_reference: {
+ project_id: @project,
+ job_id: job_id,
+ },
  configuration: {
  copy: {
  create_deposition: 'CREATE_IF_NEEDED',
@@ -218,6 +223,7 @@ module Embulk
  }
  }
  }
+
  opts = {}
  Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
  response = client.insert_job(@project, body, opts)
@@ -246,18 +252,18 @@ module Embulk
  if status == "DONE"
  Embulk.logger.info {
  "embulk-output-bigquery: #{kind} job completed... " \
- "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
+ "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
  }
  break
  elsif elapsed.to_i > max_polling_time
- message = "embulk-output-bigquery: Checking #{kind} job status... " \
- "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
+ message = "embulk-output-bigquery: #{kind} job checking... " \
+ "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
  Embulk.logger.info { message }
  raise JobTimeoutError.new(message)
  else
  Embulk.logger.info {
- "embulk-output-bigquery: Checking #{kind} job status... " \
- "job id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
+ "embulk-output-bigquery: #{kind} job checking... " \
+ "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
  }
  sleep wait_interval
  _response = client.get_job(@project, job_id)
@@ -275,6 +281,8 @@ module Embulk
  raise Error, "failed during waiting a #{kind} job, errors:#{_errors.map(&:to_h)}"
  end

+ Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
+
  _response
  end
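A toy sketch of the wait loop shown above (not plugin code): poll until the job is DONE, raise on timeout, then the job statistics can be logged, as the new log line does with `response.statistics.to_h`. The `get_state` lambda is a hypothetical stand-in for `client.get_job`.

```ruby
get_state = -> { 'DONE' } # stand-in for client.get_job(project, job_id).status.state

started = Time.now
max_polling_time = 3600 # seconds, as job_status_max_polling_time
wait_interval = 10      # seconds, as job_status_polling_interval

loop do
  elapsed = Time.now - started
  state = get_state.call
  break if state == 'DONE'
  raise "TIMEOUT after #{elapsed.to_i}s" if elapsed > max_polling_time
  sleep wait_interval
end
```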

@@ -7,15 +7,17 @@ module Embulk
  module Output
  class Bigquery < OutputPlugin
  class FileWriter
+ attr_reader :num_rows
+
  def initialize(task, schema, index, converters = nil)
  @task = task
  @schema = schema
  @index = index
  @converters = converters || ValueConverterFactory.create_converters(task, schema)

- @num_input_rows = 0
+ @num_rows = 0
  @progress_log_timer = Time.now
- @previous_num_input_rows = 0
+ @previous_num_rows = 0

  if @task['payload_column_index']
  @payload_column_index = @task['payload_column_index']
@@ -30,35 +32,8 @@ module Embulk
  end
  end

- @mutex = Mutex.new
- @ios = Hash.new
-
- def self.mutex
- @mutex
- end
-
- def self.reset_ios
- @ios = Hash.new
- end
-
- def self.ios
- @ios
- end
-
- def self.paths
- @ios.keys
- end
-
- THREAD_LOCAL_IO_KEY = :embulk_output_bigquery_file_writer_io
-
- # Create one io object for one output thread, that is, share among tasks
- # Close theses shared io objects in transaction
-
- # Thread IO must be created at #add because threads in #initialize or #commit
- # are different (called from non-output threads). Note also that #add of the
- # same instance would be called in different output threads
- def thread_io
- return Thread.current[THREAD_LOCAL_IO_KEY] if Thread.current[THREAD_LOCAL_IO_KEY]
+ def io
+ return @io if @io

  path = sprintf(
  "#{@task['path_prefix']}#{@task['sequence_format']}#{@task['file_ext']}",
@@ -70,7 +45,7 @@ module Embulk
  end
  Embulk.logger.info { "embulk-output-bigquery: create #{path}" }

- open(path, 'w')
+ @io = open(path, 'w')
  end

  def open(path, mode = 'w')
@@ -81,21 +56,16 @@ module Embulk
  else
  io = file_io
  end
- self.class.mutex.synchronize do
- self.class.ios[path] = io
- end
- Thread.current[THREAD_LOCAL_IO_KEY] = io
+ io
  end

  def close
- io = thread_io
  io.close rescue nil
  io
  end

  def reopen
- io = thread_io
- open(io.path, 'a')
+ @io = open(io.path, 'a')
  end

  def to_payload(record)
@@ -123,29 +93,24 @@ module Embulk
  end

  def add(page)
- io = thread_io
+ _io = io
  # I once tried to split IO writing into another IO thread using SizedQueue
  # However, it resulted in worse performance, so I removed the codes.
  page.each do |record|
  Embulk.logger.trace { "embulk-output-bigquery: record #{record}" }
  formatted_record = @formatter_proc.call(record)
  Embulk.logger.trace { "embulk-output-bigquery: formatted_record #{formatted_record.chomp}" }
- io.write formatted_record
- @num_input_rows += 1
+ _io.write formatted_record
+ @num_rows += 1
  end
  now = Time.now
  if @progress_log_timer < now - 10 # once in 10 seconds
- speed = ((@num_input_rows - @previous_num_input_rows) / (now - @progress_log_timer).to_f).round(1)
+ speed = ((@num_rows - @previous_num_rows) / (now - @progress_log_timer).to_f).round(1)
  @progress_log_timer = now
- @previous_num_input_rows = @num_input_rows
- Embulk.logger.info { "embulk-output-bigquery: num_input_rows #{num_format(@num_input_rows)} (#{num_format(speed)} rows/sec)" }
+ @previous_num_rows = @num_rows
+ Embulk.logger.info { "embulk-output-bigquery: num_rows #{num_format(@num_rows)} (#{num_format(speed)} rows/sec)" }
  end
- end
-
- def commit
- task_report = {
- 'num_input_rows' => @num_input_rows,
- }
+ @num_rows
  end
  end
  end
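A toy sketch of the simplified writer (not plugin code): the IO handle is memoized per instance and opened lazily on first use, and `add` returns the running row count that the rehearsal logic and `transaction_report` now consume. Class and path names are hypothetical.

```ruby
class TinyWriter
  attr_reader :num_rows

  def initialize(path)
    @path = path
    @num_rows = 0
  end

  def io
    @io ||= File.open(@path, 'w') # opened lazily, like FileWriter#io above
  end

  def add(records)
    records.each { |r| io.write("#{r}\n") }
    @num_rows += records.size
  end

  def close
    @io.close if @io
    @io
  end
end

w = TinyWriter.new('tmp_writer_demo.txt') # hypothetical path
w.add(%w[a b c]) # => 3
w.close
```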
@@ -1,4 +1,5 @@
  require 'digest/md5'
+ require 'securerandom'

  module Embulk
  module Output
@@ -52,11 +53,11 @@ module Embulk
  end
  end

- def self.create_job_id(task, path, table, fields)
+ def self.create_load_job_id(task, path, fields)
  elements = [
  Digest::MD5.file(path).hexdigest,
  task['dataset'],
- table,
+ task['table'],
  fields,
  task['source_format'],
  task['max_bad_records'],
@@ -68,9 +69,7 @@ module Embulk

  str = elements.map(&:to_s).join('')
  md5 = Digest::MD5.hexdigest(str)
- job_id = "embulk_job_#{md5}"
- Embulk.logger.debug { "embulk-output-bigquery: create_job_id(#{path}, #{table}) #=> #{job_id}" }
- job_id
+ "embulk_load_job_#{md5}"
  end
  end
  end
@@ -16,11 +16,6 @@ module Embulk
  end
  end

- def setup
- Thread.current[FileWriter::THREAD_LOCAL_IO_KEY] = nil
- FileWriter.reset_ios
- end
-
  def default_task
  {
  'compression' => 'GZIP',
@@ -65,7 +60,7 @@ module Embulk
  ensure
  io.close rescue nil
  end
- path = FileWriter.paths.first
+ path = file_writer.io.path
  assert_equal 'tmp/foo.1', path
  end
  end
@@ -108,12 +103,12 @@ module Embulk

  begin
  file_writer.add(page)
- io = FileWriter.ios.values.first
+ io = file_writer.io
  assert_equal Zlib::GzipWriter, io.class
  ensure
  io.close rescue nil
  end
- path = FileWriter.paths.first
+ path = file_writer.io.path
  assert_true File.exist?(path)
  assert_nothing_raised { Zlib::GzipReader.open(path) {|gz| } }
  end
@@ -124,12 +119,12 @@ module Embulk

  begin
  file_writer.add(page)
- io = FileWriter.ios.values.first
+ io = file_writer.io
  assert_equal File, io.class
  ensure
  io.close rescue nil
  end
- path = FileWriter.paths.first
+ path = file_writer.io.path
  assert_true File.exist?(path)
  assert_raise { Zlib::GzipReader.open(path) {|gz| } }
  end
data/test/test_helper.rb CHANGED
@@ -81,9 +81,10 @@ module Embulk
  end
  end

- def test_create_job_id
+ def test_create_load_job_id
  task = {
  'dataset' => 'your_dataset_name',
+ 'table' => 'your_table_name',
  'source_format' => 'CSV',
  'max_bad_records' => nil,
  'field_delimiter' => ',',
@@ -95,7 +96,7 @@ module Embulk
  name: 'a', type: 'STRING',
  }
  File.write("tmp/your_file_name", "foobarbaz")
- job_id = Helper.create_job_id(task, 'tmp/your_file_name', 'your_table_name', fields)
+ job_id = Helper.create_load_job_id(task, 'tmp/your_file_name', fields)
  assert job_id.is_a?(String)
  end
  end
@@ -40,6 +40,7 @@ module Embulk
  mock(obj).get_dataset(config['dataset'])
  mock(obj).create_table(config['temp_table'])
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+ mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
  mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
  mock(obj).delete_table(config['temp_table'])
  end
@@ -53,6 +54,7 @@ module Embulk
  mock(obj).get_dataset(config['dataset'])
  mock(obj).get_table(config['table'])
  mock(obj).load_in_parallel(anything, config['table']) { [] }
+ mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
  end
  Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -63,6 +65,7 @@ module Embulk
  mock(obj).create_dataset(config['dataset'])
  mock(obj).create_table(config['table'])
  mock(obj).load_in_parallel(anything, config['table']) { [] }
+ mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
  end
  Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -75,6 +78,7 @@ module Embulk
  mock(obj).delete_table(config['table'])
  mock(obj).create_table(config['table'])
  mock(obj).load_in_parallel(anything, config['table']) { [] }
+ mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
  end
  Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -85,6 +89,7 @@ module Embulk
  mock(obj).get_dataset(config['dataset'])
  mock(obj).create_table(config['temp_table'])
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+ mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
  mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
  mock(obj).delete_table(config['temp_table'])
  end
@@ -99,6 +104,7 @@ module Embulk
  mock(obj).get_dataset(config['dataset_old'])
  mock(obj).create_table(config['temp_table'])
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+ mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }

  mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])

@@ -115,6 +121,7 @@ module Embulk
  mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
  mock(obj).create_table(config['temp_table'])
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+ mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }

  mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-bigquery
  version: !ruby/object:Gem::Version
- version: 0.3.0
+ version: 0.3.1
  platform: ruby
  authors:
  - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-04-08 00:00:00.000000000 Z
+ date: 2016-04-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: google-api-client