embulk-output-bigquery 0.3.0.pre5 → 0.3.0.pre6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 278ad1e9acdd3c7ae22ef63b9304e4c5fec6871e
4
- data.tar.gz: a310f2abd73d6cd07808fedc21a11b649a4339ac
3
+ metadata.gz: ce1897b1eda7e46719b031b5f7e960cdce0ac014
4
+ data.tar.gz: 1bab2f7575b470df798c6aad4ecaa26d00e43b8c
5
5
  SHA512:
6
- metadata.gz: 008bc08aaa1b48718b4e0cd6d79b24def61eaa350274560c42212571fe99c7f67dfbd1644186a8d5c3f87de367ccb0d6346b091a994e87bfb31f8d03b42919f3
7
- data.tar.gz: f0f87a847877b5c70e95088424e657796112d35b044f6b069733446ade00206308ca39d106751750a4a47229c552c3ac91e487dc25b2c5415127c2aeec7ee6a5
6
+ metadata.gz: 772f2b087ede71e6edb9d3ccae7b870b389db66f2b21ef0e559a34c8923eaf2900fe1dc8532bea2bde8505c97434c81289e5f0dff1e1141c9eb31bf5df9773ce
7
+ data.tar.gz: f6037ccfbfbd3a0c440cf3d65b3bb5b1e62976267ed3299e7581542deb04f79308c6e377b4e53f61a8461a8195b938e417d7b69ac227654d41be8d13ae4a28df
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-output-bigquery"
3
- spec.version = "0.3.0.pre5"
3
+ spec.version = "0.3.0.pre6"
4
4
  spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
5
5
  spec.summary = "Google BigQuery output plugin for Embulk"
6
6
  spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -30,3 +30,4 @@ out:
30
30
  with_rehearsal: true
31
31
  rehearsal_counts: 1
32
32
  skip_load: true # for debug
33
+ compression: GZIP
@@ -1,5 +1,6 @@
1
1
  require 'json'
2
2
  require 'tempfile'
3
+ require 'fileutils'
3
4
  require_relative 'bigquery/bigquery_client'
4
5
  require_relative 'bigquery/file_writer'
5
6
  require_relative 'bigquery/value_converter_factory'
@@ -211,23 +212,27 @@ module Embulk
211
212
  @converters
212
213
  end
213
214
 
215
+ def self.rehearsal_thread
216
+ @rehearsal_thread
217
+ end
218
+
219
+ def self.rehearsal_thread=(rehearsal_thread)
220
+ @rehearsal_thread = rehearsal_thread
221
+ end
222
+
214
223
  def self.transaction_report(task_reports, responses)
224
+ num_input_rows = task_reports.inject(0) do |sum, task_report|
225
+ sum + task_report['num_input_rows']
226
+ end
227
+ num_output_rows = responses.inject(0) do |sum, response|
228
+ sum + (response ? response.statistics.load.output_rows.to_i : 0)
229
+ end
230
+ num_rejected_rows = num_input_rows - num_output_rows
215
231
  transaction_report = {
216
- 'num_input_rows' => 0,
217
- 'num_output_rows' => 0,
218
- 'num_rejected_rows' => 0,
232
+ 'num_input_rows' => num_input_rows,
233
+ 'num_output_rows' => num_output_rows,
234
+ 'num_rejected_rows' => num_rejected_rows,
219
235
  }
220
- (0...task_reports.size).each do |idx|
221
- task_report = task_reports[idx]
222
- response = responses[idx]
223
- num_input_rows = task_report['num_input_rows']
224
- num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
225
- num_rejected_rows = num_input_rows - num_output_rows
226
- transaction_report['num_input_rows'] += num_input_rows
227
- transaction_report['num_output_rows'] += num_output_rows
228
- transaction_report['num_rejected_rows'] += num_rejected_rows
229
- end
230
- transaction_report
231
236
  end
232
237
 
233
238
  def self.transaction(config, schema, task_count, &control)
@@ -278,7 +283,14 @@ module Embulk
278
283
  task_reports = yield(task) # generates local files
279
284
  Embulk.logger.info { "embulk-output-bigquery: task_reports: #{task_reports.to_json}" }
280
285
  paths = FileWriter.paths
281
- FileWriter.ios.each {|io| io.close rescue nil }
286
+ FileWriter.ios.values.each do |io|
287
+ Embulk.logger.debug { "close #{io.path}" }
288
+ io.close rescue nil
289
+ end
290
+ end
291
+
292
+ if rehearsal_thread
293
+ rehearsal_thread.join
282
294
  end
283
295
 
284
296
  if task['skip_load'] # only for debug
@@ -332,7 +344,6 @@ module Embulk
332
344
  super
333
345
 
334
346
  if task['with_rehearsal'] and @index == 0
335
- @bigquery = self.class.bigquery
336
347
  @rehearsaled = false
337
348
  @num_rows = 0
338
349
  end
@@ -351,13 +362,7 @@ module Embulk
351
362
  if task['with_rehearsal'] and @index == 0 and !@rehearsaled
352
363
  page = page.to_a # to avoid https://github.com/embulk/embulk/issues/403
353
364
  if @num_rows >= task['rehearsal_counts']
354
- Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
355
- begin
356
- @bigquery.create_table(task['rehearsal_table'])
357
- @bigquery.load(FileWriter.paths.first, task['rehearsal_table'])
358
- ensure
359
- @bigquery.delete_table(task['rehearsal_table'])
360
- end
365
+ load_rehearsal
361
366
  @rehearsaled = true
362
367
  end
363
368
  @num_rows += page.to_a.size
@@ -368,6 +373,30 @@ module Embulk
368
373
  end
369
374
  end
370
375
 
376
+ def load_rehearsal
377
+ bigquery = self.class.bigquery
378
+ Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
379
+
380
+ io = @file_writer.close # need to close once for gzip
381
+ rehearsal_path = "#{io.path}.rehearsal"
382
+ Embulk.logger.debug { "embulk_output_bigquery: cp #{io.path} #{rehearsal_path}" }
383
+ FileUtils.cp(io.path, rehearsal_path)
384
+ @file_writer.reopen
385
+
386
+ self.class.rehearsal_thread = Thread.new do
387
+ begin
388
+ bigquery.create_table(task['rehearsal_table'])
389
+ response = bigquery.load(rehearsal_path, task['rehearsal_table'])
390
+ num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
391
+ Embulk.logger.info { "embulk-output-bigquery: Loaded rehearsal #{num_output_rows}" }
392
+ ensure
393
+ Embulk.logger.debug { "embulk_output_bigquery: delete #{rehearsal_path}" }
394
+ File.unlink(rehearsal_path) rescue nil
395
+ bigquery.delete_table(task['rehearsal_table'])
396
+ end
397
+ end
398
+ end
399
+
371
400
  def finish
372
401
  end
373
402
 
@@ -31,14 +31,14 @@ module Embulk
31
31
  end
32
32
 
33
33
  @mutex = Mutex.new
34
- @ios = Set.new
34
+ @ios = Hash.new
35
35
 
36
36
  def self.mutex
37
37
  @mutex
38
38
  end
39
39
 
40
40
  def self.reset_ios
41
- @ios = Set.new
41
+ @ios = Hash.new
42
42
  end
43
43
 
44
44
  def self.ios
@@ -46,7 +46,7 @@ module Embulk
46
46
  end
47
47
 
48
48
  def self.paths
49
- ios.map {|io| io.path }
49
+ @ios.keys
50
50
  end
51
51
 
52
52
  THREAD_LOCAL_IO_KEY = :embulk_output_bigquery_file_writer_io
@@ -69,22 +69,35 @@ module Embulk
69
69
  File.unlink(path) rescue nil
70
70
  end
71
71
  Embulk.logger.info { "embulk-output-bigquery: create #{path}" }
72
- file_io = File.open(path, 'w')
73
72
 
73
+ open(path, 'w')
74
+ end
75
+
76
+ def open(path, mode = 'w')
77
+ file_io = File.open(path, mode)
74
78
  case @task['compression'].downcase
75
79
  when 'gzip'
76
80
  io = Zlib::GzipWriter.new(file_io)
77
81
  else
78
82
  io = file_io
79
83
  end
80
-
81
84
  self.class.mutex.synchronize do
82
- self.class.ios.add(io)
85
+ self.class.ios[path] = io
83
86
  end
84
-
85
87
  Thread.current[THREAD_LOCAL_IO_KEY] = io
86
88
  end
87
89
 
90
+ def close
91
+ io = thread_io
92
+ io.close rescue nil
93
+ io
94
+ end
95
+
96
+ def reopen
97
+ io = thread_io
98
+ open(io.path, 'a')
99
+ end
100
+
88
101
  def to_payload(record)
89
102
  "#{record[@payload_column_index]}\n"
90
103
  end
@@ -108,7 +108,7 @@ module Embulk
108
108
 
109
109
  begin
110
110
  file_writer.add(page)
111
- io = FileWriter.ios.first
111
+ io = FileWriter.ios.values.first
112
112
  assert_equal Zlib::GzipWriter, io.class
113
113
  ensure
114
114
  io.close rescue nil
@@ -124,7 +124,7 @@ module Embulk
124
124
 
125
125
  begin
126
126
  file_writer.add(page)
127
- io = FileWriter.ios.first
127
+ io = FileWriter.ios.values.first
128
128
  assert_equal File, io.class
129
129
  ensure
130
130
  io.close rescue nil
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.pre5
4
+ version: 0.3.0.pre6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-03-24 00:00:00.000000000 Z
12
+ date: 2016-03-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: google-api-client