embulk-output-bigquery 0.3.0.pre5 → 0.3.0.pre6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/embulk-output-bigquery.gemspec +1 -1
- data/example/config_with_rehearsal.yml +1 -0
- data/lib/embulk/output/bigquery.rb +52 -23
- data/lib/embulk/output/bigquery/file_writer.rb +20 -7
- data/test/test_file_writer.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ce1897b1eda7e46719b031b5f7e960cdce0ac014
|
4
|
+
data.tar.gz: 1bab2f7575b470df798c6aad4ecaa26d00e43b8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 772f2b087ede71e6edb9d3ccae7b870b389db66f2b21ef0e559a34c8923eaf2900fe1dc8532bea2bde8505c97434c81289e5f0dff1e1141c9eb31bf5df9773ce
|
7
|
+
data.tar.gz: f6037ccfbfbd3a0c440cf3d65b3bb5b1e62976267ed3299e7581542deb04f79308c6e377b4e53f61a8461a8195b938e417d7b69ac227654d41be8d13ae4a28df
|
data/embulk-output-bigquery.gemspec
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.3.0.pre5"
|
3
|
+
spec.version = "0.3.0.pre6"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
data/lib/embulk/output/bigquery.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'tempfile'
|
3
|
+
require 'fileutils'
|
3
4
|
require_relative 'bigquery/bigquery_client'
|
4
5
|
require_relative 'bigquery/file_writer'
|
5
6
|
require_relative 'bigquery/value_converter_factory'
|
@@ -211,23 +212,27 @@ module Embulk
|
|
211
212
|
@converters
|
212
213
|
end
|
213
214
|
|
215
|
+
def self.rehearsal_thread
|
216
|
+
@rehearsal_thread
|
217
|
+
end
|
218
|
+
|
219
|
+
def self.rehearsal_thread=(rehearsal_thread)
|
220
|
+
@rehearsal_thread = rehearsal_thread
|
221
|
+
end
|
222
|
+
|
214
223
|
def self.transaction_report(task_reports, responses)
|
224
|
+
num_input_rows = task_reports.inject(0) do |sum, task_report|
|
225
|
+
sum + task_report['num_input_rows']
|
226
|
+
end
|
227
|
+
num_output_rows = responses.inject(0) do |sum, response|
|
228
|
+
sum + (response ? response.statistics.load.output_rows.to_i : 0)
|
229
|
+
end
|
230
|
+
num_rejected_rows = num_input_rows - num_output_rows
|
215
231
|
transaction_report = {
|
216
|
-
'num_input_rows' => 0,
|
217
|
-
'num_output_rows' => 0,
|
218
|
-
'num_rejected_rows' => 0,
|
232
|
+
'num_input_rows' => num_input_rows,
|
233
|
+
'num_output_rows' => num_output_rows,
|
234
|
+
'num_rejected_rows' => num_rejected_rows,
|
219
235
|
}
|
220
|
-
(0...task_reports.size).each do |idx|
|
221
|
-
task_report = task_reports[idx]
|
222
|
-
response = responses[idx]
|
223
|
-
num_input_rows = task_report['num_input_rows']
|
224
|
-
num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
|
225
|
-
num_rejected_rows = num_input_rows - num_output_rows
|
226
|
-
transaction_report['num_input_rows'] += num_input_rows
|
227
|
-
transaction_report['num_output_rows'] += num_output_rows
|
228
|
-
transaction_report['num_rejected_rows'] += num_rejected_rows
|
229
|
-
end
|
230
|
-
transaction_report
|
231
236
|
end
|
232
237
|
|
233
238
|
def self.transaction(config, schema, task_count, &control)
|
@@ -278,7 +283,14 @@ module Embulk
|
|
278
283
|
task_reports = yield(task) # generates local files
|
279
284
|
Embulk.logger.info { "embulk-output-bigquery: task_reports: #{task_reports.to_json}" }
|
280
285
|
paths = FileWriter.paths
|
281
|
-
FileWriter.ios.each
|
286
|
+
FileWriter.ios.values.each do |io|
|
287
|
+
Embulk.logger.debug { "close #{io.path}" }
|
288
|
+
io.close rescue nil
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
if rehearsal_thread
|
293
|
+
rehearsal_thread.join
|
282
294
|
end
|
283
295
|
|
284
296
|
if task['skip_load'] # only for debug
|
@@ -332,7 +344,6 @@ module Embulk
|
|
332
344
|
super
|
333
345
|
|
334
346
|
if task['with_rehearsal'] and @index == 0
|
335
|
-
@bigquery = self.class.bigquery
|
336
347
|
@rehearsaled = false
|
337
348
|
@num_rows = 0
|
338
349
|
end
|
@@ -351,13 +362,7 @@ module Embulk
|
|
351
362
|
if task['with_rehearsal'] and @index == 0 and !@rehearsaled
|
352
363
|
page = page.to_a # to avoid https://github.com/embulk/embulk/issues/403
|
353
364
|
if @num_rows >= task['rehearsal_counts']
|
354
|
-
|
355
|
-
begin
|
356
|
-
@bigquery.create_table(task['rehearsal_table'])
|
357
|
-
@bigquery.load(FileWriter.paths.first, task['rehearsal_table'])
|
358
|
-
ensure
|
359
|
-
@bigquery.delete_table(task['rehearsal_table'])
|
360
|
-
end
|
365
|
+
load_rehearsal
|
361
366
|
@rehearsaled = true
|
362
367
|
end
|
363
368
|
@num_rows += page.to_a.size
|
@@ -368,6 +373,30 @@ module Embulk
|
|
368
373
|
end
|
369
374
|
end
|
370
375
|
|
376
|
+
def load_rehearsal
|
377
|
+
bigquery = self.class.bigquery
|
378
|
+
Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
|
379
|
+
|
380
|
+
io = @file_writer.close # need to close once for gzip
|
381
|
+
rehearsal_path = "#{io.path}.rehearsal"
|
382
|
+
Embulk.logger.debug { "embulk_output_bigquery: cp #{io.path} #{rehearsal_path}" }
|
383
|
+
FileUtils.cp(io.path, rehearsal_path)
|
384
|
+
@file_writer.reopen
|
385
|
+
|
386
|
+
self.class.rehearsal_thread = Thread.new do
|
387
|
+
begin
|
388
|
+
bigquery.create_table(task['rehearsal_table'])
|
389
|
+
response = bigquery.load(rehearsal_path, task['rehearsal_table'])
|
390
|
+
num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
|
391
|
+
Embulk.logger.info { "embulk-output-bigquery: Loaded rehearsal #{num_output_rows}" }
|
392
|
+
ensure
|
393
|
+
Embulk.logger.debug { "embulk_output_bigquery: delete #{rehearsal_path}" }
|
394
|
+
File.unlink(rehearsal_path) rescue nil
|
395
|
+
bigquery.delete_table(task['rehearsal_table'])
|
396
|
+
end
|
397
|
+
end
|
398
|
+
end
|
399
|
+
|
371
400
|
def finish
|
372
401
|
end
|
373
402
|
|
data/lib/embulk/output/bigquery/file_writer.rb
CHANGED
@@ -31,14 +31,14 @@ module Embulk
|
|
31
31
|
end
|
32
32
|
|
33
33
|
@mutex = Mutex.new
|
34
|
-
@ios =
|
34
|
+
@ios = Hash.new
|
35
35
|
|
36
36
|
def self.mutex
|
37
37
|
@mutex
|
38
38
|
end
|
39
39
|
|
40
40
|
def self.reset_ios
|
41
|
-
@ios =
|
41
|
+
@ios = Hash.new
|
42
42
|
end
|
43
43
|
|
44
44
|
def self.ios
|
@@ -46,7 +46,7 @@ module Embulk
|
|
46
46
|
end
|
47
47
|
|
48
48
|
def self.paths
|
49
|
-
ios.
|
49
|
+
@ios.keys
|
50
50
|
end
|
51
51
|
|
52
52
|
THREAD_LOCAL_IO_KEY = :embulk_output_bigquery_file_writer_io
|
@@ -69,22 +69,35 @@ module Embulk
|
|
69
69
|
File.unlink(path) rescue nil
|
70
70
|
end
|
71
71
|
Embulk.logger.info { "embulk-output-bigquery: create #{path}" }
|
72
|
-
file_io = File.open(path, 'w')
|
73
72
|
|
73
|
+
open(path, 'w')
|
74
|
+
end
|
75
|
+
|
76
|
+
def open(path, mode = 'w')
|
77
|
+
file_io = File.open(path, mode)
|
74
78
|
case @task['compression'].downcase
|
75
79
|
when 'gzip'
|
76
80
|
io = Zlib::GzipWriter.new(file_io)
|
77
81
|
else
|
78
82
|
io = file_io
|
79
83
|
end
|
80
|
-
|
81
84
|
self.class.mutex.synchronize do
|
82
|
-
self.class.ios
|
85
|
+
self.class.ios[path] = io
|
83
86
|
end
|
84
|
-
|
85
87
|
Thread.current[THREAD_LOCAL_IO_KEY] = io
|
86
88
|
end
|
87
89
|
|
90
|
+
def close
|
91
|
+
io = thread_io
|
92
|
+
io.close rescue nil
|
93
|
+
io
|
94
|
+
end
|
95
|
+
|
96
|
+
def reopen
|
97
|
+
io = thread_io
|
98
|
+
open(io.path, 'a')
|
99
|
+
end
|
100
|
+
|
88
101
|
def to_payload(record)
|
89
102
|
"#{record[@payload_column_index]}\n"
|
90
103
|
end
|
data/test/test_file_writer.rb
CHANGED
@@ -108,7 +108,7 @@ module Embulk
|
|
108
108
|
|
109
109
|
begin
|
110
110
|
file_writer.add(page)
|
111
|
-
io = FileWriter.ios.first
|
111
|
+
io = FileWriter.ios.values.first
|
112
112
|
assert_equal Zlib::GzipWriter, io.class
|
113
113
|
ensure
|
114
114
|
io.close rescue nil
|
@@ -124,7 +124,7 @@ module Embulk
|
|
124
124
|
|
125
125
|
begin
|
126
126
|
file_writer.add(page)
|
127
|
-
io = FileWriter.ios.first
|
127
|
+
io = FileWriter.ios.values.first
|
128
128
|
assert_equal File, io.class
|
129
129
|
ensure
|
130
130
|
io.close rescue nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0.pre5
|
4
|
+
version: 0.3.0.pre6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-03-
|
12
|
+
date: 2016-03-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: google-api-client
|