embulk-output-bigquery 0.3.0.pre5 → 0.3.0.pre6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/embulk-output-bigquery.gemspec +1 -1
- data/example/config_with_rehearsal.yml +1 -0
- data/lib/embulk/output/bigquery.rb +52 -23
- data/lib/embulk/output/bigquery/file_writer.rb +20 -7
- data/test/test_file_writer.rb +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ce1897b1eda7e46719b031b5f7e960cdce0ac014
|
4
|
+
data.tar.gz: 1bab2f7575b470df798c6aad4ecaa26d00e43b8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 772f2b087ede71e6edb9d3ccae7b870b389db66f2b21ef0e559a34c8923eaf2900fe1dc8532bea2bde8505c97434c81289e5f0dff1e1141c9eb31bf5df9773ce
|
7
|
+
data.tar.gz: f6037ccfbfbd3a0c440cf3d65b3bb5b1e62976267ed3299e7581542deb04f79308c6e377b4e53f61a8461a8195b938e417d7b69ac227654d41be8d13ae4a28df
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.3.0.
|
3
|
+
spec.version = "0.3.0.pre6"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'tempfile'
|
3
|
+
require 'fileutils'
|
3
4
|
require_relative 'bigquery/bigquery_client'
|
4
5
|
require_relative 'bigquery/file_writer'
|
5
6
|
require_relative 'bigquery/value_converter_factory'
|
@@ -211,23 +212,27 @@ module Embulk
|
|
211
212
|
@converters
|
212
213
|
end
|
213
214
|
|
215
|
+
def self.rehearsal_thread
|
216
|
+
@rehearsal_thread
|
217
|
+
end
|
218
|
+
|
219
|
+
def self.rehearsal_thread=(rehearsal_thread)
|
220
|
+
@rehearsal_thread = rehearsal_thread
|
221
|
+
end
|
222
|
+
|
214
223
|
def self.transaction_report(task_reports, responses)
|
224
|
+
num_input_rows = task_reports.inject(0) do |sum, task_report|
|
225
|
+
sum + task_report['num_input_rows']
|
226
|
+
end
|
227
|
+
num_output_rows = responses.inject(0) do |sum, response|
|
228
|
+
sum + (response ? response.statistics.load.output_rows.to_i : 0)
|
229
|
+
end
|
230
|
+
num_rejected_rows = num_input_rows - num_output_rows
|
215
231
|
transaction_report = {
|
216
|
-
'num_input_rows' =>
|
217
|
-
'num_output_rows' =>
|
218
|
-
'num_rejected_rows' =>
|
232
|
+
'num_input_rows' => num_input_rows,
|
233
|
+
'num_output_rows' => num_output_rows,
|
234
|
+
'num_rejected_rows' => num_rejected_rows,
|
219
235
|
}
|
220
|
-
(0...task_reports.size).each do |idx|
|
221
|
-
task_report = task_reports[idx]
|
222
|
-
response = responses[idx]
|
223
|
-
num_input_rows = task_report['num_input_rows']
|
224
|
-
num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
|
225
|
-
num_rejected_rows = num_input_rows - num_output_rows
|
226
|
-
transaction_report['num_input_rows'] += num_input_rows
|
227
|
-
transaction_report['num_output_rows'] += num_output_rows
|
228
|
-
transaction_report['num_rejected_rows'] += num_rejected_rows
|
229
|
-
end
|
230
|
-
transaction_report
|
231
236
|
end
|
232
237
|
|
233
238
|
def self.transaction(config, schema, task_count, &control)
|
@@ -278,7 +283,14 @@ module Embulk
|
|
278
283
|
task_reports = yield(task) # generates local files
|
279
284
|
Embulk.logger.info { "embulk-output-bigquery: task_reports: #{task_reports.to_json}" }
|
280
285
|
paths = FileWriter.paths
|
281
|
-
FileWriter.ios.each
|
286
|
+
FileWriter.ios.values.each do |io|
|
287
|
+
Embulk.logger.debug { "close #{io.path}" }
|
288
|
+
io.close rescue nil
|
289
|
+
end
|
290
|
+
end
|
291
|
+
|
292
|
+
if rehearsal_thread
|
293
|
+
rehearsal_thread.join
|
282
294
|
end
|
283
295
|
|
284
296
|
if task['skip_load'] # only for debug
|
@@ -332,7 +344,6 @@ module Embulk
|
|
332
344
|
super
|
333
345
|
|
334
346
|
if task['with_rehearsal'] and @index == 0
|
335
|
-
@bigquery = self.class.bigquery
|
336
347
|
@rehearsaled = false
|
337
348
|
@num_rows = 0
|
338
349
|
end
|
@@ -351,13 +362,7 @@ module Embulk
|
|
351
362
|
if task['with_rehearsal'] and @index == 0 and !@rehearsaled
|
352
363
|
page = page.to_a # to avoid https://github.com/embulk/embulk/issues/403
|
353
364
|
if @num_rows >= task['rehearsal_counts']
|
354
|
-
|
355
|
-
begin
|
356
|
-
@bigquery.create_table(task['rehearsal_table'])
|
357
|
-
@bigquery.load(FileWriter.paths.first, task['rehearsal_table'])
|
358
|
-
ensure
|
359
|
-
@bigquery.delete_table(task['rehearsal_table'])
|
360
|
-
end
|
365
|
+
load_rehearsal
|
361
366
|
@rehearsaled = true
|
362
367
|
end
|
363
368
|
@num_rows += page.to_a.size
|
@@ -368,6 +373,30 @@ module Embulk
|
|
368
373
|
end
|
369
374
|
end
|
370
375
|
|
376
|
+
def load_rehearsal
|
377
|
+
bigquery = self.class.bigquery
|
378
|
+
Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
|
379
|
+
|
380
|
+
io = @file_writer.close # need to close once for gzip
|
381
|
+
rehearsal_path = "#{io.path}.rehearsal"
|
382
|
+
Embulk.logger.debug { "embulk_output_bigquery: cp #{io.path} #{rehearsal_path}" }
|
383
|
+
FileUtils.cp(io.path, rehearsal_path)
|
384
|
+
@file_writer.reopen
|
385
|
+
|
386
|
+
self.class.rehearsal_thread = Thread.new do
|
387
|
+
begin
|
388
|
+
bigquery.create_table(task['rehearsal_table'])
|
389
|
+
response = bigquery.load(rehearsal_path, task['rehearsal_table'])
|
390
|
+
num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
|
391
|
+
Embulk.logger.info { "embulk-output-bigquery: Loaded rehearsal #{num_output_rows}" }
|
392
|
+
ensure
|
393
|
+
Embulk.logger.debug { "embulk_output_bigquery: delete #{rehearsal_path}" }
|
394
|
+
File.unlink(rehearsal_path) rescue nil
|
395
|
+
bigquery.delete_table(task['rehearsal_table'])
|
396
|
+
end
|
397
|
+
end
|
398
|
+
end
|
399
|
+
|
371
400
|
def finish
|
372
401
|
end
|
373
402
|
|
@@ -31,14 +31,14 @@ module Embulk
|
|
31
31
|
end
|
32
32
|
|
33
33
|
@mutex = Mutex.new
|
34
|
-
@ios =
|
34
|
+
@ios = Hash.new
|
35
35
|
|
36
36
|
def self.mutex
|
37
37
|
@mutex
|
38
38
|
end
|
39
39
|
|
40
40
|
def self.reset_ios
|
41
|
-
@ios =
|
41
|
+
@ios = Hash.new
|
42
42
|
end
|
43
43
|
|
44
44
|
def self.ios
|
@@ -46,7 +46,7 @@ module Embulk
|
|
46
46
|
end
|
47
47
|
|
48
48
|
def self.paths
|
49
|
-
ios.
|
49
|
+
@ios.keys
|
50
50
|
end
|
51
51
|
|
52
52
|
THREAD_LOCAL_IO_KEY = :embulk_output_bigquery_file_writer_io
|
@@ -69,22 +69,35 @@ module Embulk
|
|
69
69
|
File.unlink(path) rescue nil
|
70
70
|
end
|
71
71
|
Embulk.logger.info { "embulk-output-bigquery: create #{path}" }
|
72
|
-
file_io = File.open(path, 'w')
|
73
72
|
|
73
|
+
open(path, 'w')
|
74
|
+
end
|
75
|
+
|
76
|
+
def open(path, mode = 'w')
|
77
|
+
file_io = File.open(path, mode)
|
74
78
|
case @task['compression'].downcase
|
75
79
|
when 'gzip'
|
76
80
|
io = Zlib::GzipWriter.new(file_io)
|
77
81
|
else
|
78
82
|
io = file_io
|
79
83
|
end
|
80
|
-
|
81
84
|
self.class.mutex.synchronize do
|
82
|
-
self.class.ios
|
85
|
+
self.class.ios[path] = io
|
83
86
|
end
|
84
|
-
|
85
87
|
Thread.current[THREAD_LOCAL_IO_KEY] = io
|
86
88
|
end
|
87
89
|
|
90
|
+
def close
|
91
|
+
io = thread_io
|
92
|
+
io.close rescue nil
|
93
|
+
io
|
94
|
+
end
|
95
|
+
|
96
|
+
def reopen
|
97
|
+
io = thread_io
|
98
|
+
open(io.path, 'a')
|
99
|
+
end
|
100
|
+
|
88
101
|
def to_payload(record)
|
89
102
|
"#{record[@payload_column_index]}\n"
|
90
103
|
end
|
data/test/test_file_writer.rb
CHANGED
@@ -108,7 +108,7 @@ module Embulk
|
|
108
108
|
|
109
109
|
begin
|
110
110
|
file_writer.add(page)
|
111
|
-
io = FileWriter.ios.first
|
111
|
+
io = FileWriter.ios.values.first
|
112
112
|
assert_equal Zlib::GzipWriter, io.class
|
113
113
|
ensure
|
114
114
|
io.close rescue nil
|
@@ -124,7 +124,7 @@ module Embulk
|
|
124
124
|
|
125
125
|
begin
|
126
126
|
file_writer.add(page)
|
127
|
-
io = FileWriter.ios.first
|
127
|
+
io = FileWriter.ios.values.first
|
128
128
|
assert_equal File, io.class
|
129
129
|
ensure
|
130
130
|
io.close rescue nil
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0.
|
4
|
+
version: 0.3.0.pre6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-03-
|
12
|
+
date: 2016-03-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: google-api-client
|