embulk-output-bigquery 0.3.0 → 0.3.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +7 -6
- data/embulk-output-bigquery.gemspec +1 -1
- data/example/config_prevent_duplicate_insert.yml +1 -1
- data/lib/embulk/output/bigquery.rb +61 -33
- data/lib/embulk/output/bigquery/bigquery_client.rb +44 -36
- data/lib/embulk/output/bigquery/file_writer.rb +16 -51
- data/lib/embulk/output/bigquery/helper.rb +4 -5
- data/test/test_file_writer.rb +5 -10
- data/test/test_helper.rb +3 -2
- data/test/test_transaction.rb +7 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f21e4f5989b1aa631de606560ee75591a113c6f5
+  data.tar.gz: da801735b3ad2871a5d78bdde79d4f8e5e87ca30
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 582b300dacd9a45e39b424c3d0c0c3a887f5edc860430b2f0341df945ee723c0c8c5458619f28f18b2f028fe214fc3dbf58afd2751735bd2c143addb5ba164b3
+  data.tar.gz: 593d02fb4ec66bff1e3095e7e65f4d9b2adc3cb471ec3e998007ccc0fef73cfb48ad1bee6b0ee232d45ad41031affacb941e1d89097520052de007310d769465
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+## 0.3.1 - 2016-04-15
+
+* [new feature] Add `sdk_log_level` option to show log of google-api-client
+* [maintenance] Fix `prevent_duplicate_insert` was not working correctly
+* [maintenance] Change to get `num_output_rows` of `transaction_report` from `get_table` API
+* [maintenance] Log response.statistics of load jobs
+* [maintenance] Always create job_id on client side as [google recommends](https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs) so that duplication not to be occurred
+* [maintenance] Fix a possibility which rehearsal would load 0 rows file
+
 ## 0.3.0 - 2016-04-08
 
 Big change is introduced. Now, embulk-output-bigquery is written in JRuby.
data/README.md
CHANGED
@@ -39,7 +39,7 @@ OAuth flow for installed applications.
 | auto_create_table | boolean | optional | false | [See below](#dynamic-table-creating) |
 | schema_file | string | optional | | /path/to/schema.json |
 | template_table | string | optional | | template table name [See below](#dynamic-table-creating) |
-| prevent_duplicate_insert | boolean | optional | false | [See below](#
+| prevent_duplicate_insert | boolean | optional | false | [See below](#prevent-duplication) |
 | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
 | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
 | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
@@ -59,6 +59,7 @@ Client or request options
 | open_timeout_sec | integer | optional | 300 | Seconds to wait for the connection to open |
 | retries | integer | optional | 5 | Number of retries |
 | application_name | string | optional | "Embulk BigQuery plugin" | User-Agent |
+| sdk_log_level | string | optional | nil (WARN) | Log level of google api client library |
 
 Options for intermediate local files
 
@@ -317,15 +318,15 @@ out:
   payload_column_index: 0 # or, payload_column: payload
 ```
 
-###
+### Prevent Duplication
 
-
+`prevent_duplicate_insert` option is used to prevent inserting same data for modes `append` or `append_direct`.
 
-`
+When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options.
 
-
+`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
 
-
+[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency) so that same data can't be inserted with same settings repeatedly.
 
 ```yaml
 out:
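To make the formula above concrete, here is a minimal standalone Ruby sketch of the deterministic job ID; it mirrors the `Helper.create_load_job_id` method shown later in this diff, with made-up file and option values (all names and values below are illustrative only):

    require 'digest/md5'

    # Illustrative stand-ins for the local file and the task options.
    path = 'example_load_file.csv'
    File.write(path, "a,b,c\n")

    elements = [
      Digest::MD5.file(path).hexdigest,          # md5(file)
      'your_dataset', 'your_table',              # dataset, table
      '[{"name":"a","type":"STRING"}]',          # schema
      'CSV', ',', 0, 'UTF-8',                    # source_format, file_delimiter, max_bad_records, encoding
      false, false                               # ignore_unknown_values, allow_quoted_newlines
    ]
    job_id = "embulk_load_job_#{Digest::MD5.hexdigest(elements.map(&:to_s).join(''))}"
    # Loading the same file with the same settings always produces the same job_id,
    # and BigQuery refuses to rerun a job ID it has already seen, so the data is not inserted twice.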
data/embulk-output-bigquery.gemspec
CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.0"
+  spec.version = "0.3.1"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
data/lib/embulk/output/bigquery.rb
CHANGED
@@ -44,6 +44,7 @@ module Embulk
         'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
         'schema_file' => config.param('schema_file', :string, :default => nil),
         'template_table' => config.param('template_table', :string, :default => nil),
+
         'delete_from_local_when_job_end' => config.param('delete_from_local_when_job_end', :bool, :default => true),
         'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
         'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
@@ -62,6 +63,7 @@ module Embulk
         'open_timeout_sec' => config.param('open_timeout_sec', :integer, :default => 300),
         'retries' => config.param('retries', :integer, :default => 5),
         'application_name' => config.param('application_name', :string, :default => 'Embulk BigQuery plugin'),
+        'sdk_log_level' => config.param('sdk_log_level', :string, :default => nil),
 
         'path_prefix' => config.param('path_prefix', :string, :default => nil),
         'sequence_format' => config.param('sequence_format', :string, :default => '.%d.%d'),
@@ -201,6 +203,10 @@ module Embulk
         task['rehearsal_table'] ||= "LOAD_REHEARSAL_#{unique_name}_#{task['table']}"
       end
 
+      if task['sdk_log_level']
+        Google::Apis.logger.level = eval("::Logger::#{task['sdk_log_level'].upcase}")
+      end
+
       task
     end
 
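For reference, the `eval` above only turns the configured level name (for example `debug` or `warn`) into the matching `::Logger` severity constant before assigning it to the google-api-client logger. A roughly equivalent lookup, sketched with a hypothetical option value:

    require 'logger'
    require 'google/apis'

    sdk_log_level = 'debug'                                             # hypothetical value of the new option
    Google::Apis.logger.level = Logger.const_get(sdk_log_level.upcase)  # same effect as the eval above
    Google::Apis.logger.level                                           # => 0, i.e. Logger::DEBUG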
@@ -220,16 +226,16 @@ module Embulk
       @rehearsal_thread = rehearsal_thread
     end
 
-    def self.transaction_report(
-      num_input_rows =
-
-      end
-      num_output_rows = responses.inject(0) do |sum, response|
+    def self.transaction_report(file_writers, responses, target_table)
+      num_input_rows = file_writers.empty? ? 0 : file_writers.map(&:num_rows).inject(:+)
+      num_response_rows = responses.inject(0) do |sum, response|
         sum + (response ? response.statistics.load.output_rows.to_i : 0)
       end
+      num_output_rows = bigquery.get_table(target_table).num_rows.to_i
       num_rejected_rows = num_input_rows - num_output_rows
       transaction_report = {
         'num_input_rows' => num_input_rows,
+        'num_response_rows' => num_response_rows,
         'num_output_rows' => num_output_rows,
         'num_rejected_rows' => num_rejected_rows,
       }
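A small worked example with made-up numbers may help read the new report: `num_input_rows` is summed from the per-thread file writers, `num_response_rows` from the load-job statistics, `num_output_rows` now comes from the `get_table` API, and the rejected count is the difference:

    # Illustrative numbers only.
    num_input_rows    = 1_000                              # rows written by all FileWriters
    num_response_rows = 1_000                              # sum of statistics.load.output_rows over load jobs
    num_output_rows   = 997                                # num_rows reported by get_table on the target table
    num_rejected_rows = num_input_rows - num_output_rows   # => 3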
@@ -278,12 +284,12 @@ module Embulk
         path_pattern = "#{task['path_prefix']}*#{task['file_ext']}"
         Embulk.logger.info { "embulk-output-bigquery: Skip file generation. Get paths from `#{path_pattern}`" }
         paths = Dir.glob(path_pattern)
-        task_reports = paths.map {|path| { 'num_input_rows' => 0 } }
       else
         task_reports = yield(task) # generates local files
-
-
-
+
+        ios = file_writers.map(&:io)
+        paths = ios.map(&:path)
+        ios.each do |io|
           Embulk.logger.debug { "close #{io.path}" }
           io.close rescue nil
         end
@@ -298,7 +304,7 @@ module Embulk
       else
         target_table = task['temp_table'] ? task['temp_table'] : task['table']
         responses = bigquery.load_in_parallel(paths, target_table)
-        transaction_report = self.transaction_report(
+        transaction_report = self.transaction_report(file_writers, responses, target_table)
         Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
 
         if task['mode'] == 'replace_backup'
@@ -339,37 +345,63 @@ module Embulk
       return next_config_diff
     end
 
-
+    @file_writers_mutex = Mutex.new
+    @file_writers = Array.new
+
+    def self.reset_file_writers
+      @file_writers = Array.new
+    end
+
+    def self.file_writers
+      @file_writers
+    end
+
+    def self.add_file_writer(file_writer)
+      @file_writers_mutex.synchronize do
+        @file_writers << file_writer
+      end
+    end
+
+    FILE_WRITER_KEY = :embulk_output_bigquery_file_writer
+
+    # Create one FileWriter object for one output thread, that is, share among tasks.
+    # Close theses shared objects in transaction.
+    # This is mainly to suppress (or control by -X max_threads) number of files, which
+    # equals to number of concurrency to load in parallel, when number of input tasks is many
+    #
+    # #file_writer must be called at only #add because threads in other methods
+    # are different (called from non-output threads). Note also that #add method
+    # of the same task instance would be called in different output threads
+    def file_writer
+      return Thread.current[FILE_WRITER_KEY] if Thread.current[FILE_WRITER_KEY]
+      file_writer = FileWriter.new(@task, @schema, @index, self.class.converters)
+      self.class.add_file_writer(file_writer)
+      Thread.current[FILE_WRITER_KEY] = file_writer
+    end
+
+    # instance is created on each task
     def initialize(task, schema, index)
       super
 
       if task['with_rehearsal'] and @index == 0
         @rehearsaled = false
-        @num_rows = 0
-      end
-
-      unless task['skip_file_generation']
-        @file_writer = FileWriter.new(task, schema, index, self.class.converters)
       end
     end
 
-    # called for each page in each
+    # called for each page in each task
     def close
     end
 
-    # called for each page in each
+    # called for each page in each task
    def add(page)
+      return if task['skip_file_generation']
+      num_rows = file_writer.add(page)
+
       if task['with_rehearsal'] and @index == 0 and !@rehearsaled
-
-        if @num_rows >= task['rehearsal_counts']
+        if num_rows >= task['rehearsal_counts']
           load_rehearsal
           @rehearsaled = true
         end
-        @num_rows += page.to_a.size
-      end
-
-      unless task['skip_file_generation']
-        @file_writer.add(page)
       end
     end
 
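The comment block in the hunk above describes the design: each output thread lazily builds one `FileWriter`, memoizes it in `Thread.current`, and registers it in a class-level array behind a mutex so the transaction can close every writer at the end. A stripped-down, generic sketch of that pattern (the class and names below are illustrative, not the plugin's API):

    require 'stringio'

    class WriterRegistry
      KEY = :per_thread_writer
      @mutex   = Mutex.new
      @writers = []

      class << self
        attr_reader :writers
      end

      # One writer per output thread, created lazily and shared across tasks on that thread.
      def self.for_current_thread
        Thread.current[KEY] ||= begin
          writer = StringIO.new                       # stands in for a FileWriter
          @mutex.synchronize { @writers << writer }   # registered so cleanup code can reach it later
          writer
        end
      end
    end

    threads = 4.times.map { Thread.new { WriterRegistry.for_current_thread } }
    threads.each(&:join)
    WriterRegistry.writers.size  # => 4, one writer per thread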
@@ -377,11 +409,11 @@ module Embulk
       bigquery = self.class.bigquery
       Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
 
-      io =
+      io = file_writer.close # need to close once for gzip
       rehearsal_path = "#{io.path}.rehearsal"
       Embulk.logger.debug { "embulk_output_bigquery: cp #{io.path} #{rehearsal_path}" }
       FileUtils.cp(io.path, rehearsal_path)
-
+      file_writer.reopen
 
       self.class.rehearsal_thread = Thread.new do
         begin
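The `# need to close once for gzip` comment above exists because `Zlib::GzipWriter` only writes the gzip trailer when it is closed, so copying a still-open compressed file for the rehearsal load would yield a truncated archive. A small standalone illustration (the file name is arbitrary):

    require 'zlib'

    Zlib::GzipWriter.open('rehearsal_example.gz') { |gz| gz.write("a,b,c\n") }  # block form closes the writer
    Zlib::GzipReader.open('rehearsal_example.gz') { |gz| gz.read }              # => "a,b,c\n", readable only after close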
@@ -403,13 +435,9 @@ module Embulk
     def abort
     end
 
-    # called after processing all pages in each
+    # called after processing all pages in each task, returns a task_report
     def commit
-
-        @file_writer.commit
-      else
-        {}
-      end
+      {}
     end
   end
 end
data/lib/embulk/output/bigquery/bigquery_client.rb
CHANGED
@@ -107,49 +107,46 @@ module Embulk
       #
       # We before had a `max_load_parallels` option, but this was not extensible for map reduce executor
       # So, we dropped it. See https://github.com/embulk/embulk-output-bigquery/pull/35
-      max_load_parallels = paths.size # @task['max_load_parallels'] || paths.size
       responses = []
-
-
-
-
-
-
-
-
-
-          [idx, response]
-        end
-      end
-      ThreadsWait.all_waits(*threads) do |th|
-        idx, response = th.value # raise errors occurred in threads
-        responses[idx] = response
+      threads = []
+      Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" }
+      paths.each_with_index do |path, idx|
+        threads << Thread.new do
+          # I am not sure whether google-api-ruby-client is thread-safe,
+          # so let me create new instances for each thread for safe
+          bigquery = self.class.new(@task, @schema, fields)
+          response = bigquery.load(path, table)
+          [idx, response]
         end
       end
+      ThreadsWait.all_waits(*threads) do |th|
+        idx, response = th.value # raise errors occurred in threads
+        responses[idx] = response
+      end
       responses
     end
 
     def load(path, table)
       begin
         if File.exist?(path)
-
+          # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+          # we should generate job_id in client code, otherwise, retrying would cause duplication
+          if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+            job_id = Helper.create_load_job_id(@task, path, fields)
+          else
+            job_id = "embulk_load_job_#{SecureRandom.uuid}"
+          end
+          Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
         else
           Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
           return
         end
 
-        if @task['prevent_duplicate_insert']
-          job_reference = {
-            job_reference: {
-              project_id: @project,
-              job_id: Helper.create_job_id(@task, path, table, fields),
-            }
-          }
-        else
-          job_reference = {}
-        end
-
         body = {
+          job_reference: {
+            project_id: @project,
+            job_id: job_id,
+          },
           configuration: {
             load: {
               destination_table: {
@@ -168,8 +165,9 @@ module Embulk
               ignore_unknown_values: @task['ignore_unknown_values'],
               allow_quoted_newlines: @task['allow_quoted_newlines'],
             }
-        }
+          }
         }
+
         opts = {
           upload_source: path,
           content_type: "application/octet-stream",
@@ -182,7 +180,7 @@ module Embulk
         Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
         response = client.insert_job(@project, body, opts)
         unless @task['is_skip_job_result_check']
-          wait_load('Load', response)
+          response = wait_load('Load', response)
         end
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         response = {status_code: e.status_code, message: e.message, error_class: e.class}
@@ -196,11 +194,18 @@ module Embulk
     def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
       begin
         destination_dataset ||= @dataset
+        job_id = "embulk_copy_job_#{SecureRandom.uuid}"
+
         Embulk.logger.info {
-          "embulk-output-bigquery: Copy job starting... " \
+          "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
           "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
         }
+
         body = {
+          job_reference: {
+            project_id: @project,
+            job_id: job_id,
+          },
           configuration: {
             copy: {
               create_deposition: 'CREATE_IF_NEEDED',
@@ -218,6 +223,7 @@ module Embulk
             }
           }
         }
+
         opts = {}
         Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
         response = client.insert_job(@project, body, opts)
@@ -246,18 +252,18 @@ module Embulk
         if status == "DONE"
           Embulk.logger.info {
             "embulk-output-bigquery: #{kind} job completed... " \
-            "
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
           }
           break
         elsif elapsed.to_i > max_polling_time
-          message = "embulk-output-bigquery:
-          "
+          message = "embulk-output-bigquery: #{kind} job checking... " \
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
           Embulk.logger.info { message }
           raise JobTimeoutError.new(message)
         else
           Embulk.logger.info {
-            "embulk-output-bigquery:
-            "
+            "embulk-output-bigquery: #{kind} job checking... " \
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
           }
           sleep wait_interval
           _response = client.get_job(@project, job_id)
@@ -275,6 +281,8 @@ module Embulk
           raise Error, "failed during waiting a #{kind} job, errors:#{_errors.map(&:to_h)}"
         end
 
+        Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
+
         _response
       end
 
data/lib/embulk/output/bigquery/file_writer.rb
CHANGED
@@ -7,15 +7,17 @@
   module Output
     class Bigquery < OutputPlugin
       class FileWriter
+        attr_reader :num_rows
+
         def initialize(task, schema, index, converters = nil)
           @task = task
           @schema = schema
           @index = index
           @converters = converters || ValueConverterFactory.create_converters(task, schema)
 
-          @
+          @num_rows = 0
           @progress_log_timer = Time.now
-          @
+          @previous_num_rows = 0
 
           if @task['payload_column_index']
             @payload_column_index = @task['payload_column_index']
@@ -30,35 +32,8 @@ module Embulk
           end
         end
 
-
-
-
-        def self.mutex
-          @mutex
-        end
-
-        def self.reset_ios
-          @ios = Hash.new
-        end
-
-        def self.ios
-          @ios
-        end
-
-        def self.paths
-          @ios.keys
-        end
-
-        THREAD_LOCAL_IO_KEY = :embulk_output_bigquery_file_writer_io
-
-        # Create one io object for one output thread, that is, share among tasks
-        # Close theses shared io objects in transaction
-        #
-        # Thread IO must be created at #add because threads in #initialize or #commit
-        # are different (called from non-output threads). Note also that #add of the
-        # same instance would be called in different output threads
-        def thread_io
-          return Thread.current[THREAD_LOCAL_IO_KEY] if Thread.current[THREAD_LOCAL_IO_KEY]
+        def io
+          return @io if @io
 
           path = sprintf(
             "#{@task['path_prefix']}#{@task['sequence_format']}#{@task['file_ext']}",
@@ -70,7 +45,7 @@ module Embulk
           end
           Embulk.logger.info { "embulk-output-bigquery: create #{path}" }
 
-          open(path, 'w')
+          @io = open(path, 'w')
         end
 
         def open(path, mode = 'w')
@@ -81,21 +56,16 @@ module Embulk
           else
             io = file_io
           end
-
-          self.class.ios[path] = io
-          end
-          Thread.current[THREAD_LOCAL_IO_KEY] = io
+          io
         end
 
         def close
-          io = thread_io
           io.close rescue nil
           io
         end
 
         def reopen
-          io =
-          open(io.path, 'a')
+          @io = open(io.path, 'a')
         end
 
         def to_payload(record)
@@ -123,29 +93,24 @@ module Embulk
         end
 
         def add(page)
-
+          _io = io
           # I once tried to split IO writing into another IO thread using SizedQueue
           # However, it resulted in worse performance, so I removed the codes.
           page.each do |record|
             Embulk.logger.trace { "embulk-output-bigquery: record #{record}" }
             formatted_record = @formatter_proc.call(record)
             Embulk.logger.trace { "embulk-output-bigquery: formatted_record #{formatted_record.chomp}" }
-
-            @
+            _io.write formatted_record
+            @num_rows += 1
           end
           now = Time.now
           if @progress_log_timer < now - 10 # once in 10 seconds
-            speed = ((@
+            speed = ((@num_rows - @previous_num_rows) / (now - @progress_log_timer).to_f).round(1)
             @progress_log_timer = now
-            @
-            Embulk.logger.info { "embulk-output-bigquery:
+            @previous_num_rows = @num_rows
+            Embulk.logger.info { "embulk-output-bigquery: num_rows #{num_format(@num_rows)} (#{num_format(speed)} rows/sec)" }
           end
-
-
-        def commit
-          task_report = {
-            'num_input_rows' => @num_input_rows,
-          }
+          @num_rows
         end
       end
     end
data/lib/embulk/output/bigquery/helper.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'digest/md5'
+require 'securerandom'
 
 module Embulk
   module Output
@@ -52,11 +53,11 @@ module Embulk
         end
       end
 
-      def self.
+      def self.create_load_job_id(task, path, fields)
        elements = [
          Digest::MD5.file(path).hexdigest,
          task['dataset'],
-          table,
+          task['table'],
          fields,
          task['source_format'],
          task['max_bad_records'],
@@ -68,9 +69,7 @@ module Embulk
 
        str = elements.map(&:to_s).join('')
        md5 = Digest::MD5.hexdigest(str)
-
-        Embulk.logger.debug { "embulk-output-bigquery: create_job_id(#{path}, #{table}) #=> #{job_id}" }
-        job_id
+        "embulk_load_job_#{md5}"
      end
    end
  end
data/test/test_file_writer.rb
CHANGED
@@ -16,11 +16,6 @@ module Embulk
       end
     end
 
-    def setup
-      Thread.current[FileWriter::THREAD_LOCAL_IO_KEY] = nil
-      FileWriter.reset_ios
-    end
-
     def default_task
       {
         'compression' => 'GZIP',
@@ -65,7 +60,7 @@ module Embulk
       ensure
         io.close rescue nil
       end
-      path =
+      path = file_writer.io.path
       assert_equal 'tmp/foo.1', path
     end
   end
@@ -108,12 +103,12 @@ module Embulk
 
       begin
         file_writer.add(page)
-        io =
+        io = file_writer.io
         assert_equal Zlib::GzipWriter, io.class
       ensure
         io.close rescue nil
       end
-      path =
+      path = file_writer.io.path
       assert_true File.exist?(path)
       assert_nothing_raised { Zlib::GzipReader.open(path) {|gz| } }
     end
@@ -124,12 +119,12 @@ module Embulk
 
       begin
         file_writer.add(page)
-        io =
+        io = file_writer.io
         assert_equal File, io.class
       ensure
         io.close rescue nil
       end
-      path =
+      path = file_writer.io.path
       assert_true File.exist?(path)
       assert_raise { Zlib::GzipReader.open(path) {|gz| } }
     end
data/test/test_helper.rb
CHANGED
@@ -81,9 +81,10 @@ module Embulk
       end
     end
 
-    def
+    def test_create_load_job_id
       task = {
         'dataset' => 'your_dataset_name',
+        'table' => 'your_table_name',
         'source_format' => 'CSV',
         'max_bad_records' => nil,
         'field_delimiter' => ',',
@@ -95,7 +96,7 @@ module Embulk
         name: 'a', type: 'STRING',
       }
       File.write("tmp/your_file_name", "foobarbaz")
-      job_id = Helper.
+      job_id = Helper.create_load_job_id(task, 'tmp/your_file_name', fields)
       assert job_id.is_a?(String)
     end
   end
data/test/test_transaction.rb
CHANGED
@@ -40,6 +40,7 @@ module Embulk
       mock(obj).get_dataset(config['dataset'])
       mock(obj).create_table(config['temp_table'])
       mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+      mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
       mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
       mock(obj).delete_table(config['temp_table'])
     end
@@ -53,6 +54,7 @@ module Embulk
       mock(obj).get_dataset(config['dataset'])
       mock(obj).get_table(config['table'])
       mock(obj).load_in_parallel(anything, config['table']) { [] }
+      mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
     end
     Bigquery.transaction(config, schema, processor_count, &control)
   end
@@ -63,6 +65,7 @@ module Embulk
       mock(obj).create_dataset(config['dataset'])
       mock(obj).create_table(config['table'])
       mock(obj).load_in_parallel(anything, config['table']) { [] }
+      mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
     end
     Bigquery.transaction(config, schema, processor_count, &control)
   end
@@ -75,6 +78,7 @@ module Embulk
       mock(obj).delete_table(config['table'])
       mock(obj).create_table(config['table'])
       mock(obj).load_in_parallel(anything, config['table']) { [] }
+      mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
     end
     Bigquery.transaction(config, schema, processor_count, &control)
   end
@@ -85,6 +89,7 @@ module Embulk
       mock(obj).get_dataset(config['dataset'])
       mock(obj).create_table(config['temp_table'])
       mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+      mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
       mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
       mock(obj).delete_table(config['temp_table'])
     end
@@ -99,6 +104,7 @@ module Embulk
       mock(obj).get_dataset(config['dataset_old'])
       mock(obj).create_table(config['temp_table'])
       mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+      mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
 
       mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
 
@@ -115,6 +121,7 @@ module Embulk
       mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
       mock(obj).create_table(config['temp_table'])
       mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+      mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
 
       mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
 
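The `OpenStruct.new(num_rows: 1)` stubs added throughout these tests just give the mocked `get_table` call an object that responds to `num_rows`, which is all the new `transaction_report` reads; for example:

    require 'ostruct'

    table = OpenStruct.new(num_rows: 1)
    table.num_rows.to_i  # => 1, the value transaction_report uses as num_output_rows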
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-08 00:00:00.000000000 Z
+date: 2016-04-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client