embulk-output-bigquery 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +12 -0
- data/CHANGELOG.md +5 -0
- data/README.md +9 -0
- data/embulk-output-bigquery.gemspec +1 -1
- data/example/config_nested_record.yml +40 -0
- data/lib/embulk/output/bigquery.rb +21 -4
- data/test/test_transaction.rb +4 -7
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 14f823d8b06be1f8537c52ae244c4c94bbeb8833
|
4
|
+
data.tar.gz: b419e43610303eea86f1a963bde2bb4db472b26e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 009d542bbcbe0e73b40ccb76a1197c8dcd4185fcc0bc8b1819d799d74d9fc59428935250ffd67df560da14228a6d9526d9b41503eb7b88b23b1da505712a9157
|
7
|
+
data.tar.gz: 6215755f4975f62a949ba52e5766d61449b74ff0e63ca731caca0ceef17a69ff88a2adfe2636f1fe592f6feb51c10d7fa1624976224ff509d9a5c0bf54f8bb2e
|
data/.travis.yml
ADDED
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -21,6 +21,14 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
|
|
21
21
|
Current version of this plugin supports Google API with Service Account Authentication, but does not support
|
22
22
|
OAuth flow for installed applications.
|
23
23
|
|
24
|
+
### INCOMPATIBILITY CHANGES
|
25
|
+
|
26
|
+
v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
|
27
|
+
|
28
|
+
* `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead. (it already exists in v0.2.x too)
|
29
|
+
* `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
|
30
|
+
* `mode: append` mode now expresses a transactional append, and `mode: append_direct` is one which is not transactional.
|
31
|
+
|
24
32
|
## Configuration
|
25
33
|
|
26
34
|
#### Original options
|
@@ -45,6 +53,7 @@ OAuth flow for installed applications.
|
|
45
53
|
| is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
|
46
54
|
| with_rehearsal | boolean | optional | false | Load `rehearsal_counts` records as a rehearsal. Rehearsal loads into REHEARSAL temporary table, and delete finally. You may use this option to investigate data errors as early stage as possible |
|
47
55
|
| rehearsal_counts | integer | optional | 1000 | Specify number of records to load in a rehearsal |
|
56
|
+
| abort_on_error | boolean | optional | true if max_bad_records is 0, otherwise false | Raise an error if number of input rows and number of output rows does not match |
|
48
57
|
| column_options | hash | optional | | [See below](#column-options) |
|
49
58
|
| default_timezone | string | optional | UTC | |
|
50
59
|
| default_timestamp_format | string | optional | %Y-%m-%d %H:%M:%S.%6N | |
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.3.1"
|
3
|
+
spec.version = "0.3.2"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# embulk gem install embulk-parser-jsonl
|
2
|
+
in:
|
3
|
+
type: file
|
4
|
+
path_prefix: example/nested_example.jsonl
|
5
|
+
parser:
|
6
|
+
type: jsonl
|
7
|
+
columns:
|
8
|
+
- {name: date, type: string}
|
9
|
+
- {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
|
10
|
+
- {name: "null", type: string}
|
11
|
+
- {name: long, type: long}
|
12
|
+
- {name: string, type: string}
|
13
|
+
- {name: double, type: double}
|
14
|
+
- {name: json, type: json}
|
15
|
+
- {name: boolean, type: boolean}
|
16
|
+
out:
|
17
|
+
type: bigquery
|
18
|
+
mode: replace
|
19
|
+
auth_method: json_key
|
20
|
+
json_keyfile: example/your-project-000.json
|
21
|
+
dataset: your_dataset_name
|
22
|
+
table: your_table_name
|
23
|
+
compression: GZIP
|
24
|
+
source_format: NEWLINE_DELIMITED_JSON
|
25
|
+
auto_create_dataset: true
|
26
|
+
auto_create_table: true
|
27
|
+
column_options:
|
28
|
+
- {name: date, type: TIMESTAMP, timestamp_format: "%Y-%m-%d", timezone: "+09:00"}
|
29
|
+
- {name: timestamp, type: STRING, timestamp_format: "%Y-%m-%d", timezone: "+09:00"}
|
30
|
+
- {name: long, type: STRING}
|
31
|
+
- {name: string, type: STRING}
|
32
|
+
- {name: double, type: STRING}
|
33
|
+
- {name: boolean, type: STRING}
|
34
|
+
- name: json
|
35
|
+
type: RECORD
|
36
|
+
fields:
|
37
|
+
- {name: k1, type: STRING}
|
38
|
+
- {name: k2, type: STRING}
|
39
|
+
# 2015-07-13
|
40
|
+
# 2015-07-12 15:00:00
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'tempfile'
|
3
3
|
require 'fileutils'
|
4
|
+
require 'securerandom'
|
4
5
|
require_relative 'bigquery/bigquery_client'
|
5
6
|
require_relative 'bigquery/file_writer'
|
6
7
|
require_relative 'bigquery/value_converter_factory'
|
@@ -52,6 +53,7 @@ module Embulk
|
|
52
53
|
'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
|
53
54
|
'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
|
54
55
|
'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
|
56
|
+
'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),
|
55
57
|
|
56
58
|
'column_options' => config.param('column_options', :array, :default => []),
|
57
59
|
'default_timezone' => config.param('default_timezone', :string, :default => ValueConverterFactory::DEFAULT_TIMEZONE),
|
@@ -193,7 +195,7 @@ module Embulk
|
|
193
195
|
task['file_ext'] = file_ext
|
194
196
|
end
|
195
197
|
|
196
|
-
unique_name =
|
198
|
+
unique_name = SecureRandom.uuid.gsub('-', '_')
|
197
199
|
|
198
200
|
if %w[replace replace_backup append].include?(task['mode'])
|
199
201
|
task['temp_table'] ||= "LOAD_TEMP_#{unique_name}_#{task['table']}"
|
@@ -207,6 +209,10 @@ module Embulk
|
|
207
209
|
Google::Apis.logger.level = eval("::Logger::#{task['sdk_log_level'].upcase}")
|
208
210
|
end
|
209
211
|
|
212
|
+
if task['abort_on_error'].nil?
|
213
|
+
task['abort_on_error'] = (task['max_bad_records'] == 0)
|
214
|
+
end
|
215
|
+
|
210
216
|
task
|
211
217
|
end
|
212
218
|
|
@@ -226,12 +232,16 @@ module Embulk
|
|
226
232
|
@rehearsal_thread = rehearsal_thread
|
227
233
|
end
|
228
234
|
|
229
|
-
def self.transaction_report(responses)
|
235
|
+
def self.transaction_report(task, responses)
|
230
236
|
num_input_rows = file_writers.empty? ? 0 : file_writers.map(&:num_rows).inject(:+)
|
231
237
|
num_response_rows = responses.inject(0) do |sum, response|
|
232
238
|
sum + (response ? response.statistics.load.output_rows.to_i : 0)
|
233
239
|
end
|
234
|
-
|
240
|
+
if task['temp_table']
|
241
|
+
num_output_rows = bigquery.get_table(task['temp_table']).num_rows.to_i
|
242
|
+
else
|
243
|
+
num_output_rows = num_response_rows
|
244
|
+
end
|
235
245
|
num_rejected_rows = num_input_rows - num_output_rows
|
236
246
|
transaction_report = {
|
237
247
|
'num_input_rows' => num_input_rows,
|
@@ -304,9 +314,16 @@ module Embulk
|
|
304
314
|
else
|
305
315
|
target_table = task['temp_table'] ? task['temp_table'] : task['table']
|
306
316
|
responses = bigquery.load_in_parallel(paths, target_table)
|
307
|
-
transaction_report = self.transaction_report(responses)
|
317
|
+
transaction_report = self.transaction_report(task, responses)
|
308
318
|
Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
|
309
319
|
|
320
|
+
if task['abort_on_error']
|
321
|
+
if transaction_report['num_input_rows'] != transaction_report['num_output_rows']
|
322
|
+
raise Error, "ABORT: `num_input_rows (#{transaction_report['num_input_rows']})` and " \
|
323
|
+
"`num_output_rows (#{transaction_report['num_output_rows']})` does not match"
|
324
|
+
end
|
325
|
+
end
|
326
|
+
|
310
327
|
if task['mode'] == 'replace_backup'
|
311
328
|
bigquery.copy(task['table'], task['table_old'], task['dataset_old'])
|
312
329
|
end
|
data/test/test_transaction.rb
CHANGED
@@ -34,13 +34,16 @@ module Embulk
|
|
34
34
|
Proc.new {|task| task_reports = [] }
|
35
35
|
end
|
36
36
|
|
37
|
+
def setup
|
38
|
+
stub(Bigquery).transaction_report { {'num_input_rows' => 1, 'num_output_rows' => 1, 'num_rejected_rows' => 0} }
|
39
|
+
end
|
40
|
+
|
37
41
|
def test_append
|
38
42
|
config = least_config.merge('mode' => 'append', 'temp_table' => 'temp_table')
|
39
43
|
any_instance_of(BigqueryClient) do |obj|
|
40
44
|
mock(obj).get_dataset(config['dataset'])
|
41
45
|
mock(obj).create_table(config['temp_table'])
|
42
46
|
mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
|
43
|
-
mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
|
44
47
|
mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
|
45
48
|
mock(obj).delete_table(config['temp_table'])
|
46
49
|
end
|
@@ -54,7 +57,6 @@ module Embulk
|
|
54
57
|
mock(obj).get_dataset(config['dataset'])
|
55
58
|
mock(obj).get_table(config['table'])
|
56
59
|
mock(obj).load_in_parallel(anything, config['table']) { [] }
|
57
|
-
mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
|
58
60
|
end
|
59
61
|
Bigquery.transaction(config, schema, processor_count, &control)
|
60
62
|
end
|
@@ -65,7 +67,6 @@ module Embulk
|
|
65
67
|
mock(obj).create_dataset(config['dataset'])
|
66
68
|
mock(obj).create_table(config['table'])
|
67
69
|
mock(obj).load_in_parallel(anything, config['table']) { [] }
|
68
|
-
mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
|
69
70
|
end
|
70
71
|
Bigquery.transaction(config, schema, processor_count, &control)
|
71
72
|
end
|
@@ -78,7 +79,6 @@ module Embulk
|
|
78
79
|
mock(obj).delete_table(config['table'])
|
79
80
|
mock(obj).create_table(config['table'])
|
80
81
|
mock(obj).load_in_parallel(anything, config['table']) { [] }
|
81
|
-
mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
|
82
82
|
end
|
83
83
|
Bigquery.transaction(config, schema, processor_count, &control)
|
84
84
|
end
|
@@ -89,7 +89,6 @@ module Embulk
|
|
89
89
|
mock(obj).get_dataset(config['dataset'])
|
90
90
|
mock(obj).create_table(config['temp_table'])
|
91
91
|
mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
|
92
|
-
mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
|
93
92
|
mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
|
94
93
|
mock(obj).delete_table(config['temp_table'])
|
95
94
|
end
|
@@ -104,7 +103,6 @@ module Embulk
|
|
104
103
|
mock(obj).get_dataset(config['dataset_old'])
|
105
104
|
mock(obj).create_table(config['temp_table'])
|
106
105
|
mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
|
107
|
-
mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
|
108
106
|
|
109
107
|
mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
|
110
108
|
|
@@ -121,7 +119,6 @@ module Embulk
|
|
121
119
|
mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
|
122
120
|
mock(obj).create_table(config['temp_table'])
|
123
121
|
mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
|
124
|
-
mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
|
125
122
|
|
126
123
|
mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
|
127
124
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-
|
12
|
+
date: 2016-05-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: google-api-client
|
@@ -90,6 +90,7 @@ extensions: []
|
|
90
90
|
extra_rdoc_files: []
|
91
91
|
files:
|
92
92
|
- ".gitignore"
|
93
|
+
- ".travis.yml"
|
93
94
|
- CHANGELOG.md
|
94
95
|
- Gemfile
|
95
96
|
- LICENSE.txt
|
@@ -108,6 +109,7 @@ files:
|
|
108
109
|
- example/config_min_ouput_tasks.yml
|
109
110
|
- example/config_mode_append.yml
|
110
111
|
- example/config_mode_append_direct.yml
|
112
|
+
- example/config_nested_record.yml
|
111
113
|
- example/config_payload_column.yml
|
112
114
|
- example/config_payload_column_index.yml
|
113
115
|
- example/config_prevent_duplicate_insert.yml
|