embulk-output-bigquery 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f21e4f5989b1aa631de606560ee75591a113c6f5
4
- data.tar.gz: da801735b3ad2871a5d78bdde79d4f8e5e87ca30
3
+ metadata.gz: 14f823d8b06be1f8537c52ae244c4c94bbeb8833
4
+ data.tar.gz: b419e43610303eea86f1a963bde2bb4db472b26e
5
5
  SHA512:
6
- metadata.gz: 582b300dacd9a45e39b424c3d0c0c3a887f5edc860430b2f0341df945ee723c0c8c5458619f28f18b2f028fe214fc3dbf58afd2751735bd2c143addb5ba164b3
7
- data.tar.gz: 593d02fb4ec66bff1e3095e7e65f4d9b2adc3cb471ec3e998007ccc0fef73cfb48ad1bee6b0ee232d45ad41031affacb941e1d89097520052de007310d769465
6
+ metadata.gz: 009d542bbcbe0e73b40ccb76a1197c8dcd4185fcc0bc8b1819d799d74d9fc59428935250ffd67df560da14228a6d9526d9b41503eb7b88b23b1da505712a9157
7
+ data.tar.gz: 6215755f4975f62a949ba52e5766d61449b74ff0e63ca731caca0ceef17a69ff88a2adfe2636f1fe592f6feb51c10d7fa1624976224ff509d9a5c0bf54f8bb2e
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ cache: bundler
3
+ rvm:
4
+ - jruby-9.0.5.0
5
+ - jruby-head
6
+ jdk:
7
+ - openjdk7
8
+ before_install:
9
+ - gem install bundler
10
+ matrix:
11
+ allow_failures:
12
+ - rvm: jruby-head
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.3.2 - 2016-05-03
2
+
3
+ * [new feature] Add `abort_on_error` option
4
+ * [maintenance] Use uuid instead of current time for temp_table name
5
+
1
6
  ## 0.3.1 - 2016-04-15
2
7
 
3
8
  * [new feature] Add `sdk_log_level` option to show log of google-api-client
data/README.md CHANGED
@@ -21,6 +21,14 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
21
21
  Current version of this plugin supports Google API with Service Account Authentication, but does not support
22
22
  OAuth flow for installed applications.
23
23
 
24
+ ### INCOMPATIBILITY CHANGES
25
+
26
+ v0.3.x introduces changes that are incompatible with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
27
+
28
+ * `formatter` option (formatter plugin support) is dropped. Use the `source_format` option instead (it already exists in v0.2.x too).
29
+ * `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
30
+ * `mode: append` now performs a transactional append, and `mode: append_direct` is the non-transactional variant.
31
+
24
32
  ## Configuration
25
33
 
26
34
  #### Original options
@@ -45,6 +53,7 @@ OAuth flow for installed applications.
45
53
  | is_skip_job_result_check | boolean | optional | false | Skip waiting for the Load job to finish. Available for append or delete_in_advance mode |
46
54
  | with_rehearsal | boolean | optional | false | Load `rehearsal_counts` records as a rehearsal. The rehearsal loads into a REHEARSAL temporary table, which is deleted afterwards. You may use this option to investigate data errors at as early a stage as possible |
47
55
  | rehearsal_counts | integer | optional | 1000 | Specify number of records to load in a rehearsal |
56
+ | abort_on_error | boolean | optional | true if max_bad_records is 0, otherwise false | Raise an error if the number of input rows and the number of output rows do not match |
48
57
  | column_options | hash | optional | | [See below](#column-options) |
49
58
  | default_timezone | string | optional | UTC | |
50
59
  | default_timestamp_format | string | optional | %Y-%m-%d %H:%M:%S.%6N | |
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-output-bigquery"
3
- spec.version = "0.3.1"
3
+ spec.version = "0.3.2"
4
4
  spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
5
5
  spec.summary = "Google BigQuery output plugin for Embulk"
6
6
  spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -0,0 +1,40 @@
1
+ # embulk gem install embulk-parser-jsonl
2
+ in:
3
+ type: file
4
+ path_prefix: example/nested_example.jsonl
5
+ parser:
6
+ type: jsonl
7
+ columns:
8
+ - {name: date, type: string}
9
+ - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
10
+ - {name: "null", type: string}
11
+ - {name: long, type: long}
12
+ - {name: string, type: string}
13
+ - {name: double, type: double}
14
+ - {name: json, type: json}
15
+ - {name: boolean, type: boolean}
16
+ out:
17
+ type: bigquery
18
+ mode: replace
19
+ auth_method: json_key
20
+ json_keyfile: example/your-project-000.json
21
+ dataset: your_dataset_name
22
+ table: your_table_name
23
+ compression: GZIP
24
+ source_format: NEWLINE_DELIMITED_JSON
25
+ auto_create_dataset: true
26
+ auto_create_table: true
27
+ column_options:
28
+ - {name: date, type: TIMESTAMP, timestamp_format: "%Y-%m-%d", timezone: "+09:00"}
29
+ - {name: timestamp, type: STRING, timestamp_format: "%Y-%m-%d", timezone: "+09:00"}
30
+ - {name: long, type: STRING}
31
+ - {name: string, type: STRING}
32
+ - {name: double, type: STRING}
33
+ - {name: boolean, type: STRING}
34
+ - name: json
35
+ type: RECORD
36
+ fields:
37
+ - {name: k1, type: STRING}
38
+ - {name: k2, type: STRING}
39
+ # 2015-07-13
40
+ # 2015-07-12 15:00:00
@@ -1,6 +1,7 @@
1
1
  require 'json'
2
2
  require 'tempfile'
3
3
  require 'fileutils'
4
+ require 'securerandom'
4
5
  require_relative 'bigquery/bigquery_client'
5
6
  require_relative 'bigquery/file_writer'
6
7
  require_relative 'bigquery/value_converter_factory'
@@ -52,6 +53,7 @@ module Embulk
52
53
  'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
53
54
  'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
54
55
  'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
56
+ 'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),
55
57
 
56
58
  'column_options' => config.param('column_options', :array, :default => []),
57
59
  'default_timezone' => config.param('default_timezone', :string, :default => ValueConverterFactory::DEFAULT_TIMEZONE),
@@ -193,7 +195,7 @@ module Embulk
193
195
  task['file_ext'] = file_ext
194
196
  end
195
197
 
196
- unique_name = "%08x%08x%08x" % [Process.pid, now.tv_sec, now.tv_nsec]
198
+ unique_name = SecureRandom.uuid.gsub('-', '_')
197
199
 
198
200
  if %w[replace replace_backup append].include?(task['mode'])
199
201
  task['temp_table'] ||= "LOAD_TEMP_#{unique_name}_#{task['table']}"
@@ -207,6 +209,10 @@ module Embulk
207
209
  Google::Apis.logger.level = eval("::Logger::#{task['sdk_log_level'].upcase}")
208
210
  end
209
211
 
212
+ if task['abort_on_error'].nil?
213
+ task['abort_on_error'] = (task['max_bad_records'] == 0)
214
+ end
215
+
210
216
  task
211
217
  end
212
218
 
@@ -226,12 +232,16 @@ module Embulk
226
232
  @rehearsal_thread = rehearsal_thread
227
233
  end
228
234
 
229
- def self.transaction_report(file_writers, responses, target_table)
235
+ def self.transaction_report(task, responses)
230
236
  num_input_rows = file_writers.empty? ? 0 : file_writers.map(&:num_rows).inject(:+)
231
237
  num_response_rows = responses.inject(0) do |sum, response|
232
238
  sum + (response ? response.statistics.load.output_rows.to_i : 0)
233
239
  end
234
- num_output_rows = bigquery.get_table(target_table).num_rows.to_i
240
+ if task['temp_table']
241
+ num_output_rows = bigquery.get_table(task['temp_table']).num_rows.to_i
242
+ else
243
+ num_output_rows = num_response_rows
244
+ end
235
245
  num_rejected_rows = num_input_rows - num_output_rows
236
246
  transaction_report = {
237
247
  'num_input_rows' => num_input_rows,
@@ -304,9 +314,16 @@ module Embulk
304
314
  else
305
315
  target_table = task['temp_table'] ? task['temp_table'] : task['table']
306
316
  responses = bigquery.load_in_parallel(paths, target_table)
307
- transaction_report = self.transaction_report(file_writers, responses, target_table)
317
+ transaction_report = self.transaction_report(task, responses)
308
318
  Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
309
319
 
320
+ if task['abort_on_error']
321
+ if transaction_report['num_input_rows'] != transaction_report['num_output_rows']
322
+ raise Error, "ABORT: `num_input_rows (#{transaction_report['num_input_rows']})` and " \
323
+ "`num_output_rows (#{transaction_report['num_output_rows']})` does not match"
324
+ end
325
+ end
326
+
310
327
  if task['mode'] == 'replace_backup'
311
328
  bigquery.copy(task['table'], task['table_old'], task['dataset_old'])
312
329
  end
@@ -34,13 +34,16 @@ module Embulk
34
34
  Proc.new {|task| task_reports = [] }
35
35
  end
36
36
 
37
+ def setup
38
+ stub(Bigquery).transaction_report { {'num_input_rows' => 1, 'num_output_rows' => 1, 'num_rejected_rows' => 0} }
39
+ end
40
+
37
41
  def test_append
38
42
  config = least_config.merge('mode' => 'append', 'temp_table' => 'temp_table')
39
43
  any_instance_of(BigqueryClient) do |obj|
40
44
  mock(obj).get_dataset(config['dataset'])
41
45
  mock(obj).create_table(config['temp_table'])
42
46
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
43
- mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
44
47
  mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
45
48
  mock(obj).delete_table(config['temp_table'])
46
49
  end
@@ -54,7 +57,6 @@ module Embulk
54
57
  mock(obj).get_dataset(config['dataset'])
55
58
  mock(obj).get_table(config['table'])
56
59
  mock(obj).load_in_parallel(anything, config['table']) { [] }
57
- mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
58
60
  end
59
61
  Bigquery.transaction(config, schema, processor_count, &control)
60
62
  end
@@ -65,7 +67,6 @@ module Embulk
65
67
  mock(obj).create_dataset(config['dataset'])
66
68
  mock(obj).create_table(config['table'])
67
69
  mock(obj).load_in_parallel(anything, config['table']) { [] }
68
- mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
69
70
  end
70
71
  Bigquery.transaction(config, schema, processor_count, &control)
71
72
  end
@@ -78,7 +79,6 @@ module Embulk
78
79
  mock(obj).delete_table(config['table'])
79
80
  mock(obj).create_table(config['table'])
80
81
  mock(obj).load_in_parallel(anything, config['table']) { [] }
81
- mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
82
82
  end
83
83
  Bigquery.transaction(config, schema, processor_count, &control)
84
84
  end
@@ -89,7 +89,6 @@ module Embulk
89
89
  mock(obj).get_dataset(config['dataset'])
90
90
  mock(obj).create_table(config['temp_table'])
91
91
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
92
- mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
93
92
  mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
94
93
  mock(obj).delete_table(config['temp_table'])
95
94
  end
@@ -104,7 +103,6 @@ module Embulk
104
103
  mock(obj).get_dataset(config['dataset_old'])
105
104
  mock(obj).create_table(config['temp_table'])
106
105
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
107
- mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
108
106
 
109
107
  mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
110
108
 
@@ -121,7 +119,6 @@ module Embulk
121
119
  mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
122
120
  mock(obj).create_table(config['temp_table'])
123
121
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
124
- mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
125
122
 
126
123
  mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
127
124
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-04-15 00:00:00.000000000 Z
12
+ date: 2016-05-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: google-api-client
@@ -90,6 +90,7 @@ extensions: []
90
90
  extra_rdoc_files: []
91
91
  files:
92
92
  - ".gitignore"
93
+ - ".travis.yml"
93
94
  - CHANGELOG.md
94
95
  - Gemfile
95
96
  - LICENSE.txt
@@ -108,6 +109,7 @@ files:
108
109
  - example/config_min_ouput_tasks.yml
109
110
  - example/config_mode_append.yml
110
111
  - example/config_mode_append_direct.yml
112
+ - example/config_nested_record.yml
111
113
  - example/config_payload_column.yml
112
114
  - example/config_payload_column_index.yml
113
115
  - example/config_prevent_duplicate_insert.yml