embulk-output-bigquery 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +12 -0
- data/CHANGELOG.md +5 -0
- data/README.md +9 -0
- data/embulk-output-bigquery.gemspec +1 -1
- data/example/config_nested_record.yml +40 -0
- data/lib/embulk/output/bigquery.rb +21 -4
- data/test/test_transaction.rb +4 -7
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 14f823d8b06be1f8537c52ae244c4c94bbeb8833
|
4
|
+
data.tar.gz: b419e43610303eea86f1a963bde2bb4db472b26e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 009d542bbcbe0e73b40ccb76a1197c8dcd4185fcc0bc8b1819d799d74d9fc59428935250ffd67df560da14228a6d9526d9b41503eb7b88b23b1da505712a9157
|
7
|
+
data.tar.gz: 6215755f4975f62a949ba52e5766d61449b74ff0e63ca731caca0ceef17a69ff88a2adfe2636f1fe592f6feb51c10d7fa1624976224ff509d9a5c0bf54f8bb2e
|
data/.travis.yml
ADDED
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -21,6 +21,14 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
|
|
21
21
|
Current version of this plugin supports Google API with Service Account Authentication, but does not support
|
22
22
|
OAuth flow for installed applications.
|
23
23
|
|
24
|
+
### INCOMPATIBILITY CHANGES
|
25
|
+
|
26
|
+
v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
|
27
|
+
|
28
|
+
* `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead. (it already exists in v0.2.x too)
|
29
|
+
* `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
|
30
|
+
* `mode: append` mode now expresses a transactional append, and `mode: append_direct` is one which is not transactional.
|
31
|
+
|
24
32
|
## Configuration
|
25
33
|
|
26
34
|
#### Original options
|
@@ -45,6 +53,7 @@ OAuth flow for installed applications.
|
|
45
53
|
| is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
|
46
54
|
| with_rehearsal | boolean | optional | false | Load `rehearsal_counts` records as a rehearsal. Rehearsal loads into REHEARSAL temporary table, and delete finally. You may use this option to investigate data errors as early stage as possible |
|
47
55
|
| rehearsal_counts | integer | optional | 1000 | Specify number of records to load in a rehearsal |
|
56
|
+
| abort_on_error | boolean | optional | true if max_bad_records is 0, otherwise false | Raise an error if number of input rows and number of output rows does not match |
|
48
57
|
| column_options | hash | optional | | [See below](#column-options) |
|
49
58
|
| default_timezone | string | optional | UTC | |
|
50
59
|
| default_timestamp_format | string | optional | %Y-%m-%d %H:%M:%S.%6N | |
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.3.1"
|
3
|
+
spec.version = "0.3.2"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# embulk gem install embulk-parser-jsonl
|
2
|
+
in:
|
3
|
+
type: file
|
4
|
+
path_prefix: example/nested_example.jsonl
|
5
|
+
parser:
|
6
|
+
type: jsonl
|
7
|
+
columns:
|
8
|
+
- {name: date, type: string}
|
9
|
+
- {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
|
10
|
+
- {name: "null", type: string}
|
11
|
+
- {name: long, type: long}
|
12
|
+
- {name: string, type: string}
|
13
|
+
- {name: double, type: double}
|
14
|
+
- {name: json, type: json}
|
15
|
+
- {name: boolean, type: boolean}
|
16
|
+
out:
|
17
|
+
type: bigquery
|
18
|
+
mode: replace
|
19
|
+
auth_method: json_key
|
20
|
+
json_keyfile: example/your-project-000.json
|
21
|
+
dataset: your_dataset_name
|
22
|
+
table: your_table_name
|
23
|
+
compression: GZIP
|
24
|
+
source_format: NEWLINE_DELIMITED_JSON
|
25
|
+
auto_create_dataset: true
|
26
|
+
auto_create_table: true
|
27
|
+
column_options:
|
28
|
+
- {name: date, type: TIMESTAMP, timestamp_format: "%Y-%m-%d", timezone: "+09:00"}
|
29
|
+
- {name: timestamp, type: STRING, timestamp_format: "%Y-%m-%d", timezone: "+09:00"}
|
30
|
+
- {name: long, type: STRING}
|
31
|
+
- {name: string, type: STRING}
|
32
|
+
- {name: double, type: STRING}
|
33
|
+
- {name: boolean, type: STRING}
|
34
|
+
- name: json
|
35
|
+
type: RECORD
|
36
|
+
fields:
|
37
|
+
- {name: k1, type: STRING}
|
38
|
+
- {name: k2, type: STRING}
|
39
|
+
# 2015-07-13
|
40
|
+
# 2015-07-12 15:00:00
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'tempfile'
|
3
3
|
require 'fileutils'
|
4
|
+
require 'securerandom'
|
4
5
|
require_relative 'bigquery/bigquery_client'
|
5
6
|
require_relative 'bigquery/file_writer'
|
6
7
|
require_relative 'bigquery/value_converter_factory'
|
@@ -52,6 +53,7 @@ module Embulk
|
|
52
53
|
'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
|
53
54
|
'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
|
54
55
|
'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
|
56
|
+
'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),
|
55
57
|
|
56
58
|
'column_options' => config.param('column_options', :array, :default => []),
|
57
59
|
'default_timezone' => config.param('default_timezone', :string, :default => ValueConverterFactory::DEFAULT_TIMEZONE),
|
@@ -193,7 +195,7 @@ module Embulk
|
|
193
195
|
task['file_ext'] = file_ext
|
194
196
|
end
|
195
197
|
|
196
|
-
unique_name =
|
198
|
+
unique_name = SecureRandom.uuid.gsub('-', '_')
|
197
199
|
|
198
200
|
if %w[replace replace_backup append].include?(task['mode'])
|
199
201
|
task['temp_table'] ||= "LOAD_TEMP_#{unique_name}_#{task['table']}"
|
@@ -207,6 +209,10 @@ module Embulk
|
|
207
209
|
Google::Apis.logger.level = eval("::Logger::#{task['sdk_log_level'].upcase}")
|
208
210
|
end
|
209
211
|
|
212
|
+
if task['abort_on_error'].nil?
|
213
|
+
task['abort_on_error'] = (task['max_bad_records'] == 0)
|
214
|
+
end
|
215
|
+
|
210
216
|
task
|
211
217
|
end
|
212
218
|
|
@@ -226,12 +232,16 @@ module Embulk
|
|
226
232
|
@rehearsal_thread = rehearsal_thread
|
227
233
|
end
|
228
234
|
|
229
|
-
def self.transaction_report(responses)
|
235
|
+
def self.transaction_report(task, responses)
|
230
236
|
num_input_rows = file_writers.empty? ? 0 : file_writers.map(&:num_rows).inject(:+)
|
231
237
|
num_response_rows = responses.inject(0) do |sum, response|
|
232
238
|
sum + (response ? response.statistics.load.output_rows.to_i : 0)
|
233
239
|
end
|
234
|
-
|
240
|
+
if task['temp_table']
|
241
|
+
num_output_rows = bigquery.get_table(task['temp_table']).num_rows.to_i
|
242
|
+
else
|
243
|
+
num_output_rows = num_response_rows
|
244
|
+
end
|
235
245
|
num_rejected_rows = num_input_rows - num_output_rows
|
236
246
|
transaction_report = {
|
237
247
|
'num_input_rows' => num_input_rows,
|
@@ -304,9 +314,16 @@ module Embulk
|
|
304
314
|
else
|
305
315
|
target_table = task['temp_table'] ? task['temp_table'] : task['table']
|
306
316
|
responses = bigquery.load_in_parallel(paths, target_table)
|
307
|
-
transaction_report = self.transaction_report(responses)
|
317
|
+
transaction_report = self.transaction_report(task, responses)
|
308
318
|
Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
|
309
319
|
|
320
|
+
if task['abort_on_error']
|
321
|
+
if transaction_report['num_input_rows'] != transaction_report['num_output_rows']
|
322
|
+
raise Error, "ABORT: `num_input_rows (#{transaction_report['num_input_rows']})` and " \
|
323
|
+
"`num_output_rows (#{transaction_report['num_output_rows']})` does not match"
|
324
|
+
end
|
325
|
+
end
|
326
|
+
|
310
327
|
if task['mode'] == 'replace_backup'
|
311
328
|
bigquery.copy(task['table'], task['table_old'], task['dataset_old'])
|
312
329
|
end
|
data/test/test_transaction.rb
CHANGED
@@ -34,13 +34,16 @@ module Embulk
|
|
34
34
|
Proc.new {|task| task_reports = [] }
|
35
35
|
end
|
36
36
|
|
37
|
+
def setup
|
38
|
+
stub(Bigquery).transaction_report { {'num_input_rows' => 1, 'num_output_rows' => 1, 'num_rejected_rows' => 0} }
|
39
|
+
end
|
40
|
+
|
37
41
|
def test_append
|
38
42
|
config = least_config.merge('mode' => 'append', 'temp_table' => 'temp_table')
|
39
43
|
any_instance_of(BigqueryClient) do |obj|
|
40
44
|
mock(obj).get_dataset(config['dataset'])
|
41
45
|
mock(obj).create_table(config['temp_table'])
|
42
46
|
mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
|
43
|
-
mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
|
44
47
|
mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
|
45
48
|
mock(obj).delete_table(config['temp_table'])
|
46
49
|
end
|
@@ -54,7 +57,6 @@ module Embulk
|
|
54
57
|
mock(obj).get_dataset(config['dataset'])
|
55
58
|
mock(obj).get_table(config['table'])
|
56
59
|
mock(obj).load_in_parallel(anything, config['table']) { [] }
|
57
|
-
mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
|
58
60
|
end
|
59
61
|
Bigquery.transaction(config, schema, processor_count, &control)
|
60
62
|
end
|
@@ -65,7 +67,6 @@ module Embulk
|
|
65
67
|
mock(obj).create_dataset(config['dataset'])
|
66
68
|
mock(obj).create_table(config['table'])
|
67
69
|
mock(obj).load_in_parallel(anything, config['table']) { [] }
|
68
|
-
mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
|
69
70
|
end
|
70
71
|
Bigquery.transaction(config, schema, processor_count, &control)
|
71
72
|
end
|
@@ -78,7 +79,6 @@ module Embulk
|
|
78
79
|
mock(obj).delete_table(config['table'])
|
79
80
|
mock(obj).create_table(config['table'])
|
80
81
|
mock(obj).load_in_parallel(anything, config['table']) { [] }
|
81
|
-
mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
|
82
82
|
end
|
83
83
|
Bigquery.transaction(config, schema, processor_count, &control)
|
84
84
|
end
|
@@ -89,7 +89,6 @@ module Embulk
|
|
89
89
|
mock(obj).get_dataset(config['dataset'])
|
90
90
|
mock(obj).create_table(config['temp_table'])
|
91
91
|
mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
|
92
|
-
mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
|
93
92
|
mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
|
94
93
|
mock(obj).delete_table(config['temp_table'])
|
95
94
|
end
|
@@ -104,7 +103,6 @@ module Embulk
|
|
104
103
|
mock(obj).get_dataset(config['dataset_old'])
|
105
104
|
mock(obj).create_table(config['temp_table'])
|
106
105
|
mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
|
107
|
-
mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
|
108
106
|
|
109
107
|
mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
|
110
108
|
|
@@ -121,7 +119,6 @@ module Embulk
|
|
121
119
|
mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
|
122
120
|
mock(obj).create_table(config['temp_table'])
|
123
121
|
mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
|
124
|
-
mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
|
125
122
|
|
126
123
|
mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
|
127
124
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-
|
12
|
+
date: 2016-05-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: google-api-client
|
@@ -90,6 +90,7 @@ extensions: []
|
|
90
90
|
extra_rdoc_files: []
|
91
91
|
files:
|
92
92
|
- ".gitignore"
|
93
|
+
- ".travis.yml"
|
93
94
|
- CHANGELOG.md
|
94
95
|
- Gemfile
|
95
96
|
- LICENSE.txt
|
@@ -108,6 +109,7 @@ files:
|
|
108
109
|
- example/config_min_ouput_tasks.yml
|
109
110
|
- example/config_mode_append.yml
|
110
111
|
- example/config_mode_append_direct.yml
|
112
|
+
- example/config_nested_record.yml
|
111
113
|
- example/config_payload_column.yml
|
112
114
|
- example/config_payload_column_index.yml
|
113
115
|
- example/config_prevent_duplicate_insert.yml
|