embulk-output-bigquery 0.3.0.pre5 → 0.3.0.pre6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 278ad1e9acdd3c7ae22ef63b9304e4c5fec6871e
4
- data.tar.gz: a310f2abd73d6cd07808fedc21a11b649a4339ac
3
+ metadata.gz: ce1897b1eda7e46719b031b5f7e960cdce0ac014
4
+ data.tar.gz: 1bab2f7575b470df798c6aad4ecaa26d00e43b8c
5
5
  SHA512:
6
- metadata.gz: 008bc08aaa1b48718b4e0cd6d79b24def61eaa350274560c42212571fe99c7f67dfbd1644186a8d5c3f87de367ccb0d6346b091a994e87bfb31f8d03b42919f3
7
- data.tar.gz: f0f87a847877b5c70e95088424e657796112d35b044f6b069733446ade00206308ca39d106751750a4a47229c552c3ac91e487dc25b2c5415127c2aeec7ee6a5
6
+ metadata.gz: 772f2b087ede71e6edb9d3ccae7b870b389db66f2b21ef0e559a34c8923eaf2900fe1dc8532bea2bde8505c97434c81289e5f0dff1e1141c9eb31bf5df9773ce
7
+ data.tar.gz: f6037ccfbfbd3a0c440cf3d65b3bb5b1e62976267ed3299e7581542deb04f79308c6e377b4e53f61a8461a8195b938e417d7b69ac227654d41be8d13ae4a28df
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-output-bigquery"
3
- spec.version = "0.3.0.pre5"
3
+ spec.version = "0.3.0.pre6"
4
4
  spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
5
5
  spec.summary = "Google BigQuery output plugin for Embulk"
6
6
  spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -30,3 +30,4 @@ out:
30
30
  with_rehearsal: true
31
31
  rehearsal_counts: 1
32
32
  skip_load: true # for debug
33
+ compression: GZIP
@@ -1,5 +1,6 @@
1
1
  require 'json'
2
2
  require 'tempfile'
3
+ require 'fileutils'
3
4
  require_relative 'bigquery/bigquery_client'
4
5
  require_relative 'bigquery/file_writer'
5
6
  require_relative 'bigquery/value_converter_factory'
@@ -211,23 +212,27 @@ module Embulk
211
212
  @converters
212
213
  end
213
214
 
215
+ def self.rehearsal_thread
216
+ @rehearsal_thread
217
+ end
218
+
219
+ def self.rehearsal_thread=(rehearsal_thread)
220
+ @rehearsal_thread = rehearsal_thread
221
+ end
222
+
214
223
  def self.transaction_report(task_reports, responses)
224
+ num_input_rows = task_reports.inject(0) do |sum, task_report|
225
+ sum + task_report['num_input_rows']
226
+ end
227
+ num_output_rows = responses.inject(0) do |sum, response|
228
+ sum + (response ? response.statistics.load.output_rows.to_i : 0)
229
+ end
230
+ num_rejected_rows = num_input_rows - num_output_rows
215
231
  transaction_report = {
216
- 'num_input_rows' => 0,
217
- 'num_output_rows' => 0,
218
- 'num_rejected_rows' => 0,
232
+ 'num_input_rows' => num_input_rows,
233
+ 'num_output_rows' => num_output_rows,
234
+ 'num_rejected_rows' => num_rejected_rows,
219
235
  }
220
- (0...task_reports.size).each do |idx|
221
- task_report = task_reports[idx]
222
- response = responses[idx]
223
- num_input_rows = task_report['num_input_rows']
224
- num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
225
- num_rejected_rows = num_input_rows - num_output_rows
226
- transaction_report['num_input_rows'] += num_input_rows
227
- transaction_report['num_output_rows'] += num_output_rows
228
- transaction_report['num_rejected_rows'] += num_rejected_rows
229
- end
230
- transaction_report
231
236
  end
232
237
 
233
238
  def self.transaction(config, schema, task_count, &control)
@@ -278,7 +283,14 @@ module Embulk
278
283
  task_reports = yield(task) # generates local files
279
284
  Embulk.logger.info { "embulk-output-bigquery: task_reports: #{task_reports.to_json}" }
280
285
  paths = FileWriter.paths
281
- FileWriter.ios.each {|io| io.close rescue nil }
286
+ FileWriter.ios.values.each do |io|
287
+ Embulk.logger.debug { "close #{io.path}" }
288
+ io.close rescue nil
289
+ end
290
+ end
291
+
292
+ if rehearsal_thread
293
+ rehearsal_thread.join
282
294
  end
283
295
 
284
296
  if task['skip_load'] # only for debug
@@ -332,7 +344,6 @@ module Embulk
332
344
  super
333
345
 
334
346
  if task['with_rehearsal'] and @index == 0
335
- @bigquery = self.class.bigquery
336
347
  @rehearsaled = false
337
348
  @num_rows = 0
338
349
  end
@@ -351,13 +362,7 @@ module Embulk
351
362
  if task['with_rehearsal'] and @index == 0 and !@rehearsaled
352
363
  page = page.to_a # to avoid https://github.com/embulk/embulk/issues/403
353
364
  if @num_rows >= task['rehearsal_counts']
354
- Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
355
- begin
356
- @bigquery.create_table(task['rehearsal_table'])
357
- @bigquery.load(FileWriter.paths.first, task['rehearsal_table'])
358
- ensure
359
- @bigquery.delete_table(task['rehearsal_table'])
360
- end
365
+ load_rehearsal
361
366
  @rehearsaled = true
362
367
  end
363
368
  @num_rows += page.to_a.size
@@ -368,6 +373,30 @@ module Embulk
368
373
  end
369
374
  end
370
375
 
376
+ def load_rehearsal
377
+ bigquery = self.class.bigquery
378
+ Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
379
+
380
+ io = @file_writer.close # need to close once for gzip
381
+ rehearsal_path = "#{io.path}.rehearsal"
382
+ Embulk.logger.debug { "embulk_output_bigquery: cp #{io.path} #{rehearsal_path}" }
383
+ FileUtils.cp(io.path, rehearsal_path)
384
+ @file_writer.reopen
385
+
386
+ self.class.rehearsal_thread = Thread.new do
387
+ begin
388
+ bigquery.create_table(task['rehearsal_table'])
389
+ response = bigquery.load(rehearsal_path, task['rehearsal_table'])
390
+ num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
391
+ Embulk.logger.info { "embulk-output-bigquery: Loaded rehearsal #{num_output_rows}" }
392
+ ensure
393
+ Embulk.logger.debug { "embulk_output_bigquery: delete #{rehearsal_path}" }
394
+ File.unlink(rehearsal_path) rescue nil
395
+ bigquery.delete_table(task['rehearsal_table'])
396
+ end
397
+ end
398
+ end
399
+
371
400
  def finish
372
401
  end
373
402
 
@@ -31,14 +31,14 @@ module Embulk
31
31
  end
32
32
 
33
33
  @mutex = Mutex.new
34
- @ios = Set.new
34
+ @ios = Hash.new
35
35
 
36
36
  def self.mutex
37
37
  @mutex
38
38
  end
39
39
 
40
40
  def self.reset_ios
41
- @ios = Set.new
41
+ @ios = Hash.new
42
42
  end
43
43
 
44
44
  def self.ios
@@ -46,7 +46,7 @@ module Embulk
46
46
  end
47
47
 
48
48
  def self.paths
49
- ios.map {|io| io.path }
49
+ @ios.keys
50
50
  end
51
51
 
52
52
  THREAD_LOCAL_IO_KEY = :embulk_output_bigquery_file_writer_io
@@ -69,22 +69,35 @@ module Embulk
69
69
  File.unlink(path) rescue nil
70
70
  end
71
71
  Embulk.logger.info { "embulk-output-bigquery: create #{path}" }
72
- file_io = File.open(path, 'w')
73
72
 
73
+ open(path, 'w')
74
+ end
75
+
76
+ def open(path, mode = 'w')
77
+ file_io = File.open(path, mode)
74
78
  case @task['compression'].downcase
75
79
  when 'gzip'
76
80
  io = Zlib::GzipWriter.new(file_io)
77
81
  else
78
82
  io = file_io
79
83
  end
80
-
81
84
  self.class.mutex.synchronize do
82
- self.class.ios.add(io)
85
+ self.class.ios[path] = io
83
86
  end
84
-
85
87
  Thread.current[THREAD_LOCAL_IO_KEY] = io
86
88
  end
87
89
 
90
+ def close
91
+ io = thread_io
92
+ io.close rescue nil
93
+ io
94
+ end
95
+
96
+ def reopen
97
+ io = thread_io
98
+ open(io.path, 'a')
99
+ end
100
+
88
101
  def to_payload(record)
89
102
  "#{record[@payload_column_index]}\n"
90
103
  end
@@ -108,7 +108,7 @@ module Embulk
108
108
 
109
109
  begin
110
110
  file_writer.add(page)
111
- io = FileWriter.ios.first
111
+ io = FileWriter.ios.values.first
112
112
  assert_equal Zlib::GzipWriter, io.class
113
113
  ensure
114
114
  io.close rescue nil
@@ -124,7 +124,7 @@ module Embulk
124
124
 
125
125
  begin
126
126
  file_writer.add(page)
127
- io = FileWriter.ios.first
127
+ io = FileWriter.ios.values.first
128
128
  assert_equal File, io.class
129
129
  ensure
130
130
  io.close rescue nil
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.pre5
4
+ version: 0.3.0.pre6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-03-24 00:00:00.000000000 Z
12
+ date: 2016-03-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: google-api-client