embulk-output-bigquery 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/README.md +7 -6
- data/embulk-output-bigquery.gemspec +1 -1
- data/example/config_prevent_duplicate_insert.yml +1 -1
- data/lib/embulk/output/bigquery.rb +61 -33
- data/lib/embulk/output/bigquery/bigquery_client.rb +44 -36
- data/lib/embulk/output/bigquery/file_writer.rb +16 -51
- data/lib/embulk/output/bigquery/helper.rb +4 -5
- data/test/test_file_writer.rb +5 -10
- data/test/test_helper.rb +3 -2
- data/test/test_transaction.rb +7 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f21e4f5989b1aa631de606560ee75591a113c6f5
+  data.tar.gz: da801735b3ad2871a5d78bdde79d4f8e5e87ca30
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 582b300dacd9a45e39b424c3d0c0c3a887f5edc860430b2f0341df945ee723c0c8c5458619f28f18b2f028fe214fc3dbf58afd2751735bd2c143addb5ba164b3
+  data.tar.gz: 593d02fb4ec66bff1e3095e7e65f4d9b2adc3cb471ec3e998007ccc0fef73cfb48ad1bee6b0ee232d45ad41031affacb941e1d89097520052de007310d769465
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
+## 0.3.1 - 2016-04-15
+
+* [new feature] Add `sdk_log_level` option to show log of google-api-client
+* [maintenance] Fix `prevent_duplicate_insert` was not working correctly
+* [maintenance] Change to get `num_output_rows` of `transaction_report` from `get_table` API
+* [maintenance] Log response.statistics of load jobs
+* [maintenance] Always create job_id on client side as [google recommends](https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs) so that duplication not to be occurred
+* [maintenance] Fix a possibility which rehearsal would load 0 rows file
+
 ## 0.3.0 - 2016-04-08
 
 Big change is introduced. Now, embulk-output-bigquery is written in JRuby.
data/README.md
CHANGED
@@ -39,7 +39,7 @@ OAuth flow for installed applications.
 | auto_create_table | boolean | optional | false | [See below](#dynamic-table-creating) |
 | schema_file | string | optional | | /path/to/schema.json |
 | template_table | string | optional | | template table name [See below](#dynamic-table-creating) |
-| prevent_duplicate_insert | boolean | optional | false | [See below](#
+| prevent_duplicate_insert | boolean | optional | false | [See below](#prevent-duplication) |
 | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
 | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
 | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |

@@ -59,6 +59,7 @@ Client or request options
 | open_timeout_sec | integer | optional | 300 | Seconds to wait for the connection to open |
 | retries | integer | optional | 5 | Number of retries |
 | application_name | string | optional | "Embulk BigQuery plugin" | User-Agent |
+| sdk_log_level | string | optional | nil (WARN) | Log level of google api client library |
 
 Options for intermediate local files
 
@@ -317,15 +318,15 @@ out:
   payload_column_index: 0 # or, payload_column: payload
 ```
 
-###
+### Prevent Duplication
 
-
+`prevent_duplicate_insert` option is used to prevent inserting same data for modes `append` or `append_direct`.
 
-`
+When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options.
 
-
+`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
 
-
+[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency) so that same data can't be inserted with same settings repeatedly.
 
 ```yaml
 out:
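The deterministic job ID described in the Prevent Duplication section above can be reproduced with a few lines of plain Ruby. The sketch below only illustrates the documented formula; the file name, option values, and hash keys are placeholders, and the plugin's real implementation is `Helper.create_load_job_id`, which appears in the `helper.rb` diff further down.

```ruby
require 'digest/md5'

# Placeholder input file and load options (not taken from any real config).
path = 'example.jsonl'
File.write(path, %Q({"a":"foo"}\n))

options = {
  'dataset' => 'my_dataset', 'table' => 'my_table', 'schema' => [{ name: 'a', type: 'STRING' }],
  'source_format' => 'NEWLINE_DELIMITED_JSON', 'field_delimiter' => ',',
  'max_bad_records' => 0, 'encoding' => 'UTF-8',
  'ignore_unknown_values' => false, 'allow_quoted_newlines' => false,
}

# md5(md5(file) + dataset + table + schema + source_format + ...), as in the README formula.
elements = [Digest::MD5.file(path).hexdigest] + options.values
job_id = "embulk_load_job_#{Digest::MD5.hexdigest(elements.map(&:to_s).join)}"
puts job_id
```

Because the ID is derived from the file content and settings rather than generated randomly, re-running the same load with the same settings produces the same job ID, which BigQuery rejects as a duplicate instead of appending the rows a second time.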
data/embulk-output-bigquery.gemspec
CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.0"
+  spec.version = "0.3.1"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
data/lib/embulk/output/bigquery.rb
CHANGED
@@ -44,6 +44,7 @@ module Embulk
           'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
           'schema_file' => config.param('schema_file', :string, :default => nil),
           'template_table' => config.param('template_table', :string, :default => nil),
+
           'delete_from_local_when_job_end' => config.param('delete_from_local_when_job_end', :bool, :default => true),
           'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
           'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),

@@ -62,6 +63,7 @@ module Embulk
           'open_timeout_sec' => config.param('open_timeout_sec', :integer, :default => 300),
           'retries' => config.param('retries', :integer, :default => 5),
           'application_name' => config.param('application_name', :string, :default => 'Embulk BigQuery plugin'),
+          'sdk_log_level' => config.param('sdk_log_level', :string, :default => nil),
 
           'path_prefix' => config.param('path_prefix', :string, :default => nil),
           'sequence_format' => config.param('sequence_format', :string, :default => '.%d.%d'),

@@ -201,6 +203,10 @@ module Embulk
           task['rehearsal_table'] ||= "LOAD_REHEARSAL_#{unique_name}_#{task['table']}"
         end
 
+        if task['sdk_log_level']
+          Google::Apis.logger.level = eval("::Logger::#{task['sdk_log_level'].upcase}")
+        end
+
         task
       end
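The hunk above wires the new `sdk_log_level` option straight into the google-api-client logger by `eval`-ing a constant name. Purely as a point of reference (this is not the plugin's code), the same string-to-severity lookup can be written without `eval`, since the Ruby stdlib `Logger` exposes its levels as integer constants:

```ruby
require 'logger'

# Hypothetical helper: map a config string such as "debug" or "WARN"
# onto the corresponding Logger severity constant.
def sdk_log_severity(level_name)
  Logger.const_get(level_name.upcase) # raises NameError for an unknown level name
end

sdk_log_severity('debug') #=> 0 (Logger::DEBUG)
sdk_log_severity('WARN')  #=> 2 (Logger::WARN)
```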
@@ -220,16 +226,16 @@ module Embulk
         @rehearsal_thread = rehearsal_thread
       end
 
-      def self.transaction_report(
-        num_input_rows =
-
-        end
-        num_output_rows = responses.inject(0) do |sum, response|
+      def self.transaction_report(file_writers, responses, target_table)
+        num_input_rows = file_writers.empty? ? 0 : file_writers.map(&:num_rows).inject(:+)
+        num_response_rows = responses.inject(0) do |sum, response|
           sum + (response ? response.statistics.load.output_rows.to_i : 0)
         end
+        num_output_rows = bigquery.get_table(target_table).num_rows.to_i
         num_rejected_rows = num_input_rows - num_output_rows
         transaction_report = {
          'num_input_rows' => num_input_rows,
+         'num_response_rows' => num_response_rows,
          'num_output_rows' => num_output_rows,
          'num_rejected_rows' => num_rejected_rows,
        }
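After this change the report separates three counts: rows written to the local intermediate files (`num_input_rows`, summed over the per-thread FileWriters), rows reported by the load-job responses (`num_response_rows`), and rows actually present in the destination table according to the `get_table` API (`num_output_rows`). A tiny worked example with made-up numbers shows how the fields relate:

```ruby
# Three FileWriters wrote 40 + 35 + 25 = 100 rows locally.
num_input_rows    = [40, 35, 25].inject(:+)                     #=> 100
# The load-job statistics report 99 rows loaded across all jobs.
num_response_rows = [40, 35, 24].inject(0) { |sum, n| sum + n } #=> 99
# get_table on the target table also reports 99 rows.
num_output_rows   = 99
num_rejected_rows = num_input_rows - num_output_rows            #=> 1 row rejected
```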
@@ -278,12 +284,12 @@ module Embulk
           path_pattern = "#{task['path_prefix']}*#{task['file_ext']}"
           Embulk.logger.info { "embulk-output-bigquery: Skip file generation. Get paths from `#{path_pattern}`" }
           paths = Dir.glob(path_pattern)
-          task_reports = paths.map {|path| { 'num_input_rows' => 0 } }
         else
           task_reports = yield(task) # generates local files
-
-
-
+
+          ios = file_writers.map(&:io)
+          paths = ios.map(&:path)
+          ios.each do |io|
             Embulk.logger.debug { "close #{io.path}" }
             io.close rescue nil
           end

@@ -298,7 +304,7 @@ module Embulk
         else
           target_table = task['temp_table'] ? task['temp_table'] : task['table']
           responses = bigquery.load_in_parallel(paths, target_table)
-          transaction_report = self.transaction_report(
+          transaction_report = self.transaction_report(file_writers, responses, target_table)
           Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
 
           if task['mode'] == 'replace_backup'
@@ -339,37 +345,63 @@ module Embulk
         return next_config_diff
       end
 
-
+      @file_writers_mutex = Mutex.new
+      @file_writers = Array.new
+
+      def self.reset_file_writers
+        @file_writers = Array.new
+      end
+
+      def self.file_writers
+        @file_writers
+      end
+
+      def self.add_file_writer(file_writer)
+        @file_writers_mutex.synchronize do
+          @file_writers << file_writer
+        end
+      end
+
+      FILE_WRITER_KEY = :embulk_output_bigquery_file_writer
+
+      # Create one FileWriter object for one output thread, that is, share among tasks.
+      # Close theses shared objects in transaction.
+      # This is mainly to suppress (or control by -X max_threads) number of files, which
+      # equals to number of concurrency to load in parallel, when number of input tasks is many
+      #
+      # #file_writer must be called at only #add because threads in other methods
+      # are different (called from non-output threads). Note also that #add method
+      # of the same task instance would be called in different output threads
+      def file_writer
+        return Thread.current[FILE_WRITER_KEY] if Thread.current[FILE_WRITER_KEY]
+        file_writer = FileWriter.new(@task, @schema, @index, self.class.converters)
+        self.class.add_file_writer(file_writer)
+        Thread.current[FILE_WRITER_KEY] = file_writer
+      end
+
+      # instance is created on each task
       def initialize(task, schema, index)
         super
 
         if task['with_rehearsal'] and @index == 0
           @rehearsaled = false
-          @num_rows = 0
-        end
-
-        unless task['skip_file_generation']
-          @file_writer = FileWriter.new(task, schema, index, self.class.converters)
         end
       end
 
-      # called for each page in each
+      # called for each page in each task
       def close
       end
 
-      # called for each page in each
+      # called for each page in each task
       def add(page)
+        return if task['skip_file_generation']
+        num_rows = file_writer.add(page)
+
         if task['with_rehearsal'] and @index == 0 and !@rehearsaled
-
-          if @num_rows >= task['rehearsal_counts']
+          if num_rows >= task['rehearsal_counts']
             load_rehearsal
             @rehearsaled = true
           end
-          @num_rows += page.to_a.size
-        end
-
-        unless task['skip_file_generation']
-          @file_writer.add(page)
         end
       end
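The new `file_writer` accessor above implements a writer-per-output-thread pattern: the first `#add` call on a given output thread builds a `FileWriter`, stores it under a thread-local key, and registers it in a mutex-guarded class-level array so the transaction can later collect, close, and sum all writers. A stripped-down sketch of that pattern (class and method names here are illustrative, not the plugin's API):

```ruby
# Minimal sketch of "one shared object per output thread, all visible to the class".
class WriterRegistry
  KEY = :example_file_writer # thread-local key, analogous to FILE_WRITER_KEY
  @writers = []
  @mutex = Mutex.new

  class << self
    attr_reader :writers

    def register(writer)
      @mutex.synchronize { @writers << writer }
    end

    # Reuse the current thread's writer if it exists; otherwise create and register one.
    def for_current_thread
      Thread.current[KEY] ||= Object.new.tap { |w| register(w) }
    end
  end
end

threads = 3.times.map { Thread.new { WriterRegistry.for_current_thread } }
threads.each(&:join)
puts WriterRegistry.writers.size #=> 3, one writer per output thread
```

Because each output thread keeps writing to its own file, the number of intermediate files (and therefore the load parallelism) is bounded by the number of output threads rather than the number of input tasks, which is what the comments in the hunk describe.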
@@ -377,11 +409,11 @@ module Embulk
         bigquery = self.class.bigquery
         Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
 
-        io =
+        io = file_writer.close # need to close once for gzip
         rehearsal_path = "#{io.path}.rehearsal"
         Embulk.logger.debug { "embulk_output_bigquery: cp #{io.path} #{rehearsal_path}" }
         FileUtils.cp(io.path, rehearsal_path)
-
+        file_writer.reopen
 
         self.class.rehearsal_thread = Thread.new do
           begin

@@ -403,13 +435,9 @@ module Embulk
       def abort
       end
 
-      # called after processing all pages in each
+      # called after processing all pages in each task, returns a task_report
       def commit
-
-        @file_writer.commit
-        else
-          {}
-        end
+        {}
       end
     end
   end
data/lib/embulk/output/bigquery/bigquery_client.rb
CHANGED
@@ -107,49 +107,46 @@ module Embulk
       #
       # We before had a `max_load_parallels` option, but this was not extensible for map reduce executor
      # So, we dropped it. See https://github.com/embulk/embulk-output-bigquery/pull/35
-      max_load_parallels = paths.size # @task['max_load_parallels'] || paths.size
       responses = []
-
-
-
-
-
-
-
-
-
-          [idx, response]
-        end
-      end
-      ThreadsWait.all_waits(*threads) do |th|
-        idx, response = th.value # raise errors occurred in threads
-        responses[idx] = response
+      threads = []
+      Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" }
+      paths.each_with_index do |path, idx|
+        threads << Thread.new do
+          # I am not sure whether google-api-ruby-client is thread-safe,
+          # so let me create new instances for each thread for safe
+          bigquery = self.class.new(@task, @schema, fields)
+          response = bigquery.load(path, table)
+          [idx, response]
         end
       end
+      ThreadsWait.all_waits(*threads) do |th|
+        idx, response = th.value # raise errors occurred in threads
+        responses[idx] = response
+      end
       responses
     end
 
     def load(path, table)
       begin
         if File.exist?(path)
-
+          # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+          # we should generate job_id in client code, otherwise, retrying would cause duplication
+          if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+            job_id = Helper.create_load_job_id(@task, path, fields)
+          else
+            job_id = "embulk_load_job_#{SecureRandom.uuid}"
+          end
+          Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
         else
           Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
           return
         end
 
-        if @task['prevent_duplicate_insert']
-          job_reference = {
-            job_reference: {
-              project_id: @project,
-              job_id: Helper.create_job_id(@task, path, table, fields),
-            }
-          }
-        else
-          job_reference = {}
-        end
-
         body = {
+          job_reference: {
+            project_id: @project,
+            job_id: job_id,
+          },
           configuration: {
             load: {
               destination_table: {

@@ -168,8 +165,9 @@ module Embulk
               ignore_unknown_values: @task['ignore_unknown_values'],
               allow_quoted_newlines: @task['allow_quoted_newlines'],
             }
-          }
+            }
           }
+
         opts = {
           upload_source: path,
           content_type: "application/octet-stream",

@@ -182,7 +180,7 @@ module Embulk
         Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
         response = client.insert_job(@project, body, opts)
         unless @task['is_skip_job_result_check']
-          wait_load('Load', response)
+          response = wait_load('Load', response)
         end
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         response = {status_code: e.status_code, message: e.message, error_class: e.class}

@@ -196,11 +194,18 @@ module Embulk
     def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
       begin
         destination_dataset ||= @dataset
+        job_id = "embulk_copy_job_#{SecureRandom.uuid}"
+
         Embulk.logger.info {
-          "embulk-output-bigquery: Copy job starting... " \
+          "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
           "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
         }
+
         body = {
+          job_reference: {
+            project_id: @project,
+            job_id: job_id,
+          },
           configuration: {
             copy: {
               create_deposition: 'CREATE_IF_NEEDED',

@@ -218,6 +223,7 @@ module Embulk
             }
           }
         }
+
         opts = {}
         Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
         response = client.insert_job(@project, body, opts)

@@ -246,18 +252,18 @@ module Embulk
         if status == "DONE"
           Embulk.logger.info {
             "embulk-output-bigquery: #{kind} job completed... " \
-            "
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
           }
           break
         elsif elapsed.to_i > max_polling_time
-          message = "embulk-output-bigquery:
-          "
+          message = "embulk-output-bigquery: #{kind} job checking... " \
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[TIMEOUT]"
           Embulk.logger.info { message }
           raise JobTimeoutError.new(message)
         else
           Embulk.logger.info {
-            "embulk-output-bigquery:
-            "
+            "embulk-output-bigquery: #{kind} job checking... " \
+            "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
           }
           sleep wait_interval
           _response = client.get_job(@project, job_id)

@@ -275,6 +281,8 @@ module Embulk
           raise Error, "failed during waiting a #{kind} job, errors:#{_errors.map(&:to_h)}"
         end
 
+        Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
+
         _response
       end
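Two things in the `load_in_parallel` rewrite at the top of this file's diff are worth calling out: every path gets its own thread and its own client instance (because thread-safety of google-api-ruby-client is uncertain), and each thread returns an `[idx, response]` pair so results can be slotted back into a positional array as threads finish. A generic sketch of that collect-by-index pattern, with the BigQuery call replaced by a stand-in:

```ruby
require 'thwait' # provides ThreadsWait, the same helper the plugin uses

paths = %w[part.0.jsonl part.1.jsonl part.2.jsonl] # hypothetical file list
responses = []

threads = paths.each_with_index.map do |path, idx|
  Thread.new do
    # Stand-in for `bigquery.load(path, table)`; a real caller would build
    # a fresh client per thread, as the plugin does.
    [idx, "loaded #{path}"]
  end
end

ThreadsWait.all_waits(*threads) do |th|
  idx, response = th.value # Thread#value joins and re-raises errors from the thread
  responses[idx] = response
end

p responses #=> ["loaded part.0.jsonl", "loaded part.1.jsonl", "loaded part.2.jsonl"]
```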
data/lib/embulk/output/bigquery/file_writer.rb
CHANGED
@@ -7,15 +7,17 @@
   module Output
     class Bigquery < OutputPlugin
       class FileWriter
+        attr_reader :num_rows
+
         def initialize(task, schema, index, converters = nil)
           @task = task
           @schema = schema
           @index = index
           @converters = converters || ValueConverterFactory.create_converters(task, schema)
 
-          @
+          @num_rows = 0
           @progress_log_timer = Time.now
-          @
+          @previous_num_rows = 0
 
           if @task['payload_column_index']
             @payload_column_index = @task['payload_column_index']

@@ -30,35 +32,8 @@ module Embulk
           end
         end
 
-
-
-
-        def self.mutex
-          @mutex
-        end
-
-        def self.reset_ios
-          @ios = Hash.new
-        end
-
-        def self.ios
-          @ios
-        end
-
-        def self.paths
-          @ios.keys
-        end
-
-        THREAD_LOCAL_IO_KEY = :embulk_output_bigquery_file_writer_io
-
-        # Create one io object for one output thread, that is, share among tasks
-        # Close theses shared io objects in transaction
-        #
-        # Thread IO must be created at #add because threads in #initialize or #commit
-        # are different (called from non-output threads). Note also that #add of the
-        # same instance would be called in different output threads
-        def thread_io
-          return Thread.current[THREAD_LOCAL_IO_KEY] if Thread.current[THREAD_LOCAL_IO_KEY]
+        def io
+          return @io if @io
 
           path = sprintf(
             "#{@task['path_prefix']}#{@task['sequence_format']}#{@task['file_ext']}",

@@ -70,7 +45,7 @@ module Embulk
           end
           Embulk.logger.info { "embulk-output-bigquery: create #{path}" }
 
-          open(path, 'w')
+          @io = open(path, 'w')
         end
 
         def open(path, mode = 'w')

@@ -81,21 +56,16 @@ module Embulk
           else
             io = file_io
           end
-
-            self.class.ios[path] = io
-          end
-          Thread.current[THREAD_LOCAL_IO_KEY] = io
+          io
         end
 
         def close
-          io = thread_io
           io.close rescue nil
           io
         end
 
         def reopen
-          io =
-          open(io.path, 'a')
+          @io = open(io.path, 'a')
         end
 
         def to_payload(record)

@@ -123,29 +93,24 @@ module Embulk
         end
 
         def add(page)
-
+          _io = io
           # I once tried to split IO writing into another IO thread using SizedQueue
           # However, it resulted in worse performance, so I removed the codes.
           page.each do |record|
             Embulk.logger.trace { "embulk-output-bigquery: record #{record}" }
             formatted_record = @formatter_proc.call(record)
             Embulk.logger.trace { "embulk-output-bigquery: formatted_record #{formatted_record.chomp}" }
-
-            @
+            _io.write formatted_record
+            @num_rows += 1
           end
           now = Time.now
           if @progress_log_timer < now - 10 # once in 10 seconds
-            speed = ((@
+            speed = ((@num_rows - @previous_num_rows) / (now - @progress_log_timer).to_f).round(1)
             @progress_log_timer = now
-            @
-            Embulk.logger.info { "embulk-output-bigquery:
+            @previous_num_rows = @num_rows
+            Embulk.logger.info { "embulk-output-bigquery: num_rows #{num_format(@num_rows)} (#{num_format(speed)} rows/sec)" }
           end
-
-
-        def commit
-          task_report = {
-            'num_input_rows' => @num_input_rows,
-          }
+          @num_rows
         end
       end
     end
data/lib/embulk/output/bigquery/helper.rb
CHANGED
@@ -1,4 +1,5 @@
 require 'digest/md5'
+require 'securerandom'
 
 module Embulk
   module Output

@@ -52,11 +53,11 @@ module Embulk
         end
       end
 
-      def self.
+      def self.create_load_job_id(task, path, fields)
         elements = [
           Digest::MD5.file(path).hexdigest,
           task['dataset'],
-          table,
+          task['table'],
           fields,
           task['source_format'],
           task['max_bad_records'],

@@ -68,9 +69,7 @@ module Embulk
 
         str = elements.map(&:to_s).join('')
         md5 = Digest::MD5.hexdigest(str)
-
-        Embulk.logger.debug { "embulk-output-bigquery: create_job_id(#{path}, #{table}) #=> #{job_id}" }
-        job_id
+        "embulk_load_job_#{md5}"
       end
     end
   end
data/test/test_file_writer.rb
CHANGED
@@ -16,11 +16,6 @@ module Embulk
       end
     end
 
-    def setup
-      Thread.current[FileWriter::THREAD_LOCAL_IO_KEY] = nil
-      FileWriter.reset_ios
-    end
-
     def default_task
       {
         'compression' => 'GZIP',

@@ -65,7 +60,7 @@ module Embulk
       ensure
         io.close rescue nil
       end
-      path =
+      path = file_writer.io.path
       assert_equal 'tmp/foo.1', path
     end
   end

@@ -108,12 +103,12 @@ module Embulk
 
       begin
         file_writer.add(page)
-        io =
+        io = file_writer.io
         assert_equal Zlib::GzipWriter, io.class
       ensure
         io.close rescue nil
       end
-      path =
+      path = file_writer.io.path
       assert_true File.exist?(path)
       assert_nothing_raised { Zlib::GzipReader.open(path) {|gz| } }
     end

@@ -124,12 +119,12 @@ module Embulk
 
       begin
         file_writer.add(page)
-        io =
+        io = file_writer.io
         assert_equal File, io.class
       ensure
         io.close rescue nil
       end
-      path =
+      path = file_writer.io.path
       assert_true File.exist?(path)
       assert_raise { Zlib::GzipReader.open(path) {|gz| } }
     end
data/test/test_helper.rb
CHANGED
@@ -81,9 +81,10 @@ module Embulk
       end
     end
 
-    def
+    def test_create_load_job_id
       task = {
         'dataset' => 'your_dataset_name',
+        'table' => 'your_table_name',
         'source_format' => 'CSV',
         'max_bad_records' => nil,
         'field_delimiter' => ',',

@@ -95,7 +96,7 @@ module Embulk
         name: 'a', type: 'STRING',
       }
       File.write("tmp/your_file_name", "foobarbaz")
-      job_id = Helper.
+      job_id = Helper.create_load_job_id(task, 'tmp/your_file_name', fields)
       assert job_id.is_a?(String)
     end
   end
data/test/test_transaction.rb
CHANGED
@@ -40,6 +40,7 @@ module Embulk
       mock(obj).get_dataset(config['dataset'])
       mock(obj).create_table(config['temp_table'])
       mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+      mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
       mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
       mock(obj).delete_table(config['temp_table'])
     end

@@ -53,6 +54,7 @@ module Embulk
       mock(obj).get_dataset(config['dataset'])
       mock(obj).get_table(config['table'])
       mock(obj).load_in_parallel(anything, config['table']) { [] }
+      mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
     end
     Bigquery.transaction(config, schema, processor_count, &control)
   end

@@ -63,6 +65,7 @@ module Embulk
       mock(obj).create_dataset(config['dataset'])
       mock(obj).create_table(config['table'])
       mock(obj).load_in_parallel(anything, config['table']) { [] }
+      mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
     end
     Bigquery.transaction(config, schema, processor_count, &control)
   end

@@ -75,6 +78,7 @@ module Embulk
       mock(obj).delete_table(config['table'])
       mock(obj).create_table(config['table'])
       mock(obj).load_in_parallel(anything, config['table']) { [] }
+      mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
     end
     Bigquery.transaction(config, schema, processor_count, &control)
   end

@@ -85,6 +89,7 @@ module Embulk
       mock(obj).get_dataset(config['dataset'])
       mock(obj).create_table(config['temp_table'])
       mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+      mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
       mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
       mock(obj).delete_table(config['temp_table'])
     end

@@ -99,6 +104,7 @@ module Embulk
       mock(obj).get_dataset(config['dataset_old'])
       mock(obj).create_table(config['temp_table'])
       mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+      mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
 
       mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
 

@@ -115,6 +121,7 @@ module Embulk
       mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
       mock(obj).create_table(config['temp_table'])
       mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
+      mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
 
       mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
 
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Satoshi Akama

@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-
+date: 2016-04-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client