embulk-output-bigquery 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f21e4f5989b1aa631de606560ee75591a113c6f5
4
- data.tar.gz: da801735b3ad2871a5d78bdde79d4f8e5e87ca30
3
+ metadata.gz: 14f823d8b06be1f8537c52ae244c4c94bbeb8833
4
+ data.tar.gz: b419e43610303eea86f1a963bde2bb4db472b26e
5
5
  SHA512:
6
- metadata.gz: 582b300dacd9a45e39b424c3d0c0c3a887f5edc860430b2f0341df945ee723c0c8c5458619f28f18b2f028fe214fc3dbf58afd2751735bd2c143addb5ba164b3
7
- data.tar.gz: 593d02fb4ec66bff1e3095e7e65f4d9b2adc3cb471ec3e998007ccc0fef73cfb48ad1bee6b0ee232d45ad41031affacb941e1d89097520052de007310d769465
6
+ metadata.gz: 009d542bbcbe0e73b40ccb76a1197c8dcd4185fcc0bc8b1819d799d74d9fc59428935250ffd67df560da14228a6d9526d9b41503eb7b88b23b1da505712a9157
7
+ data.tar.gz: 6215755f4975f62a949ba52e5766d61449b74ff0e63ca731caca0ceef17a69ff88a2adfe2636f1fe592f6feb51c10d7fa1624976224ff509d9a5c0bf54f8bb2e
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ cache: bundler
3
+ rvm:
4
+ - jruby-9.0.5.0
5
+ - jruby-head
6
+ jdk:
7
+ - openjdk7
8
+ before_install:
9
+ - gem install bundler
10
+ matrix:
11
+ allow_failures:
12
+ - rvm: jruby-head
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.3.2 - 2016-05-03
2
+
3
+ * [new feature] Add `abort_on_error` option
4
+ * [maintenance] Use UUID instead of current time for temp_table name
5
+
1
6
  ## 0.3.1 - 2016-04-15
2
7
 
3
8
  * [new feature] Add `sdk_log_level` option to show log of google-api-client
data/README.md CHANGED
@@ -21,6 +21,14 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
21
21
  Current version of this plugin supports Google API with Service Account Authentication, but does not support
22
22
  OAuth flow for installed applications.
23
23
 
24
+ ### INCOMPATIBILITY CHANGES
25
+
26
+ v0.3.x introduces incompatible changes compared with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
27
+
28
+ * `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead (it already exists in v0.2.x too).
29
+ * `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
30
+ * `mode: append` now performs a transactional append, while `mode: append_direct` performs a non-transactional (direct) append.
31
+
24
32
  ## Configuration
25
33
 
26
34
  #### Original options
@@ -45,6 +53,7 @@ OAuth flow for installed applications.
45
53
  | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
46
54
  | with_rehearsal | boolean | optional | false | Load `rehearsal_counts` records as a rehearsal. Rehearsal loads into REHEARSAL temporary table, and delete finally. You may use this option to investigate data errors as early stage as possible |
47
55
  | rehearsal_counts | integer | optional | 1000 | Specify number of records to load in a rehearsal |
56
+ | abort_on_error | boolean | optional | true if max_bad_records is 0, otherwise false | Raise an error if the number of input rows and the number of output rows do not match |
48
57
  | column_options | hash | optional | | [See below](#column-options) |
49
58
  | default_timezone | string | optional | UTC | |
50
59
  | default_timestamp_format | string | optional | %Y-%m-%d %H:%M:%S.%6N | |
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-output-bigquery"
3
- spec.version = "0.3.1"
3
+ spec.version = "0.3.2"
4
4
  spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
5
5
  spec.summary = "Google BigQuery output plugin for Embulk"
6
6
  spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -0,0 +1,40 @@
1
+ # embulk gem install embulk-parser-jsonl
2
+ in:
3
+ type: file
4
+ path_prefix: example/nested_example.jsonl
5
+ parser:
6
+ type: jsonl
7
+ columns:
8
+ - {name: date, type: string}
9
+ - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
10
+ - {name: "null", type: string}
11
+ - {name: long, type: long}
12
+ - {name: string, type: string}
13
+ - {name: double, type: double}
14
+ - {name: json, type: json}
15
+ - {name: boolean, type: boolean}
16
+ out:
17
+ type: bigquery
18
+ mode: replace
19
+ auth_method: json_key
20
+ json_keyfile: example/your-project-000.json
21
+ dataset: your_dataset_name
22
+ table: your_table_name
23
+ compression: GZIP
24
+ source_format: NEWLINE_DELIMITED_JSON
25
+ auto_create_dataset: true
26
+ auto_create_table: true
27
+ column_options:
28
+ - {name: date, type: TIMESTAMP, timestamp_format: "%Y-%m-%d", timezone: "+09:00"}
29
+ - {name: timestamp, type: STRING, timestamp_format: "%Y-%m-%d", timezone: "+09:00"}
30
+ - {name: long, type: STRING}
31
+ - {name: string, type: STRING}
32
+ - {name: double, type: STRING}
33
+ - {name: boolean, type: STRING}
34
+ - name: json
35
+ type: RECORD
36
+ fields:
37
+ - {name: k1, type: STRING}
38
+ - {name: k2, type: STRING}
39
+ # 2015-07-13
40
+ # 2015-07-12 15:00:00
@@ -1,6 +1,7 @@
1
1
  require 'json'
2
2
  require 'tempfile'
3
3
  require 'fileutils'
4
+ require 'securerandom'
4
5
  require_relative 'bigquery/bigquery_client'
5
6
  require_relative 'bigquery/file_writer'
6
7
  require_relative 'bigquery/value_converter_factory'
@@ -52,6 +53,7 @@ module Embulk
52
53
  'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
53
54
  'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
54
55
  'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
56
+ 'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),
55
57
 
56
58
  'column_options' => config.param('column_options', :array, :default => []),
57
59
  'default_timezone' => config.param('default_timezone', :string, :default => ValueConverterFactory::DEFAULT_TIMEZONE),
@@ -193,7 +195,7 @@ module Embulk
193
195
  task['file_ext'] = file_ext
194
196
  end
195
197
 
196
- unique_name = "%08x%08x%08x" % [Process.pid, now.tv_sec, now.tv_nsec]
198
+ unique_name = SecureRandom.uuid.gsub('-', '_')
197
199
 
198
200
  if %w[replace replace_backup append].include?(task['mode'])
199
201
  task['temp_table'] ||= "LOAD_TEMP_#{unique_name}_#{task['table']}"
@@ -207,6 +209,10 @@ module Embulk
207
209
  Google::Apis.logger.level = eval("::Logger::#{task['sdk_log_level'].upcase}")
208
210
  end
209
211
 
212
+ if task['abort_on_error'].nil?
213
+ task['abort_on_error'] = (task['max_bad_records'] == 0)
214
+ end
215
+
210
216
  task
211
217
  end
212
218
 
@@ -226,12 +232,16 @@ module Embulk
226
232
  @rehearsal_thread = rehearsal_thread
227
233
  end
228
234
 
229
- def self.transaction_report(file_writers, responses, target_table)
235
+ def self.transaction_report(task, responses)
230
236
  num_input_rows = file_writers.empty? ? 0 : file_writers.map(&:num_rows).inject(:+)
231
237
  num_response_rows = responses.inject(0) do |sum, response|
232
238
  sum + (response ? response.statistics.load.output_rows.to_i : 0)
233
239
  end
234
- num_output_rows = bigquery.get_table(target_table).num_rows.to_i
240
+ if task['temp_table']
241
+ num_output_rows = bigquery.get_table(task['temp_table']).num_rows.to_i
242
+ else
243
+ num_output_rows = num_response_rows
244
+ end
235
245
  num_rejected_rows = num_input_rows - num_output_rows
236
246
  transaction_report = {
237
247
  'num_input_rows' => num_input_rows,
@@ -304,9 +314,16 @@ module Embulk
304
314
  else
305
315
  target_table = task['temp_table'] ? task['temp_table'] : task['table']
306
316
  responses = bigquery.load_in_parallel(paths, target_table)
307
- transaction_report = self.transaction_report(file_writers, responses, target_table)
317
+ transaction_report = self.transaction_report(task, responses)
308
318
  Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
309
319
 
320
+ if task['abort_on_error']
321
+ if transaction_report['num_input_rows'] != transaction_report['num_output_rows']
322
+ raise Error, "ABORT: `num_input_rows (#{transaction_report['num_input_rows']})` and " \
323
+ "`num_output_rows (#{transaction_report['num_output_rows']})` does not match"
324
+ end
325
+ end
326
+
310
327
  if task['mode'] == 'replace_backup'
311
328
  bigquery.copy(task['table'], task['table_old'], task['dataset_old'])
312
329
  end
@@ -34,13 +34,16 @@ module Embulk
34
34
  Proc.new {|task| task_reports = [] }
35
35
  end
36
36
 
37
+ def setup
38
+ stub(Bigquery).transaction_report { {'num_input_rows' => 1, 'num_output_rows' => 1, 'num_rejected_rows' => 0} }
39
+ end
40
+
37
41
  def test_append
38
42
  config = least_config.merge('mode' => 'append', 'temp_table' => 'temp_table')
39
43
  any_instance_of(BigqueryClient) do |obj|
40
44
  mock(obj).get_dataset(config['dataset'])
41
45
  mock(obj).create_table(config['temp_table'])
42
46
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
43
- mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
44
47
  mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
45
48
  mock(obj).delete_table(config['temp_table'])
46
49
  end
@@ -54,7 +57,6 @@ module Embulk
54
57
  mock(obj).get_dataset(config['dataset'])
55
58
  mock(obj).get_table(config['table'])
56
59
  mock(obj).load_in_parallel(anything, config['table']) { [] }
57
- mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
58
60
  end
59
61
  Bigquery.transaction(config, schema, processor_count, &control)
60
62
  end
@@ -65,7 +67,6 @@ module Embulk
65
67
  mock(obj).create_dataset(config['dataset'])
66
68
  mock(obj).create_table(config['table'])
67
69
  mock(obj).load_in_parallel(anything, config['table']) { [] }
68
- mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
69
70
  end
70
71
  Bigquery.transaction(config, schema, processor_count, &control)
71
72
  end
@@ -78,7 +79,6 @@ module Embulk
78
79
  mock(obj).delete_table(config['table'])
79
80
  mock(obj).create_table(config['table'])
80
81
  mock(obj).load_in_parallel(anything, config['table']) { [] }
81
- mock(obj).get_table(config['table']) { OpenStruct.new(num_rows: 1) }
82
82
  end
83
83
  Bigquery.transaction(config, schema, processor_count, &control)
84
84
  end
@@ -89,7 +89,6 @@ module Embulk
89
89
  mock(obj).get_dataset(config['dataset'])
90
90
  mock(obj).create_table(config['temp_table'])
91
91
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
92
- mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
93
92
  mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
94
93
  mock(obj).delete_table(config['temp_table'])
95
94
  end
@@ -104,7 +103,6 @@ module Embulk
104
103
  mock(obj).get_dataset(config['dataset_old'])
105
104
  mock(obj).create_table(config['temp_table'])
106
105
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
107
- mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
108
106
 
109
107
  mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
110
108
 
@@ -121,7 +119,6 @@ module Embulk
121
119
  mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
122
120
  mock(obj).create_table(config['temp_table'])
123
121
  mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
124
- mock(obj).get_table(config['temp_table']) { OpenStruct.new(num_rows: 1) }
125
122
 
126
123
  mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
127
124
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-04-15 00:00:00.000000000 Z
12
+ date: 2016-05-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: google-api-client
@@ -90,6 +90,7 @@ extensions: []
90
90
  extra_rdoc_files: []
91
91
  files:
92
92
  - ".gitignore"
93
+ - ".travis.yml"
93
94
  - CHANGELOG.md
94
95
  - Gemfile
95
96
  - LICENSE.txt
@@ -108,6 +109,7 @@ files:
108
109
  - example/config_min_ouput_tasks.yml
109
110
  - example/config_mode_append.yml
110
111
  - example/config_mode_append_direct.yml
112
+ - example/config_nested_record.yml
111
113
  - example/config_payload_column.yml
112
114
  - example/config_payload_column_index.yml
113
115
  - example/config_prevent_duplicate_insert.yml