embulk-output-bigquery 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 71bc9b253f725436a06e183667cbc87720c3719b
-  data.tar.gz: a32e43da05a4f90ab72c5715ffdf6b08501996d4
+  metadata.gz: 1ae4bf7af71e37194f768fad9e16e415747fee70
+  data.tar.gz: 796f2c3253d5600c439597f7ca495a7d1d8bac95
 SHA512:
-  metadata.gz: bd3d8aefbc98c2f044b782f807f595603ac7b11052a06b6486803fd2f6871127058a50e9c69ffc1fac92b75de9561c57e99ad9ba3cd8899507e93085d45ed615
-  data.tar.gz: 813b6455f463940968232b4332b8553698b9ef99ad4f3f5af6800b10223c33498fde9f8915604090f85dc7c2f78d16a865cd90da2174447c7f84ab3ef80a4cf8
+  metadata.gz: 3011c4128b2ed28a0fd84d0e2a592d706434ea24746c3aaf44f2e86f588b83da80e59b2931426cba0d390906ec283690966a4eb3b092d7d8edce364fc6ecc2b2
+  data.tar.gz: e969f903a71e5bf500fc57fb7a8b2ae2e4ffa8760557c329d3013de19959016ba7ef0578df3bdb7fb7cb0f62a3cb7a58f62cdfa23bb81d46476390884461e1a3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+## 0.4.1 - 2016-10-03
+
+* [enhancement] Support `schema_update_options` option
+
 ## 0.4.0 - 2016-10-01
 
 * [enhancement] Support partitioned table
data/README.md CHANGED
@@ -100,9 +100,10 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
 | encoding | string | optional | "UTF-8" | `UTF-8` or `ISO-8859-1` |
 | ignore_unknown_values | boolean | optional | false | |
 | allow_quoted_newlines | boolean | optional | false | Set true if data contains newline characters; it may cause slow processing |
-| time_partitioning | hash | optional | nil | See [Time Partitioning](#time-partitioning) |
+| time_partitioning | hash | optional | `{"type":"DAY"}` if `table` parameter has a partition decorator, otherwise nil | See [Time Partitioning](#time-partitioning) |
 | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
 | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
+| schema_update_options | array | optional | nil | List containing `ALLOW_FIELD_ADDITION`, `ALLOW_FIELD_RELAXATION`, or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) |
 
 ### Example
 
@@ -365,7 +366,7 @@ Using `gcs_bucket` option, such strategy is enabled. You may also use `auto_crea
 out:
   type: bigquery
   gcs_bucket: bucket_name
-  auto_create_gcs_bucket: false
+  auto_create_gcs_bucket: true
 ```
 
 ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers into GCS.
@@ -391,11 +392,30 @@ out:
   type: bigquery
   table: table_name$20160929
   auto_create_table: true
-  time-partitioning:
+  time_partitioning:
     type: DAY
     expiration_ms: 259200000
 ```
 
+Use `schema_update_options` to allow the schema of the destination table to be updated as a side effect of the load job:
+
+```yaml
+out:
+  type: bigquery
+  table: table_name$20160929
+  auto_create_table: true
+  time_partitioning:
+    type: DAY
+    expiration_ms: 259200000
+  schema_update_options:
+    - ALLOW_FIELD_ADDITION
+    - ALLOW_FIELD_RELAXATION
+```
+
+It seems that only adding new columns and relaxing `REQUIRED` columns to `NULLABLE` are supported now.
+Deleting or renaming columns is not supported.
+See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) for details.
+
 ## Development
 
 ### Run example:
data/embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.4.0"
+  spec.version = "0.4.1"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -0,0 +1,31 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: date, type: string}
13
+ - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
14
+ - {name: "null", type: string}
15
+ - {name: long, type: long}
16
+ - {name: string, type: string}
17
+ - {name: double, type: double}
18
+ - {name: boolean, type: boolean}
19
+ out:
20
+ type: bigquery
21
+ mode: append_direct
22
+ auth_method: json_key
23
+ json_keyfile: example/your-project-000.json
24
+ dataset: your_dataset_name
25
+ table: your_table_name
26
+ source_format: NEWLINE_DELIMITED_JSON
27
+ compression: NONE
28
+ auto_create_dataset: true
29
+ auto_create_table: true
30
+ schema_file: example/schema.json
31
+ schema_update_options: [ALLOW_FIELD_ADDITION, ALLOW_FIELD_RELAXATION]
data/lib/embulk/output/bigquery.rb CHANGED
@@ -86,6 +86,7 @@ module Embulk
         'ignore_unknown_values' => config.param('ignore_unknown_values', :bool, :default => false),
         'allow_quoted_newlines' => config.param('allow_quoted_newlines', :bool, :default => false),
         'time_partitioning' => config.param('time_partitioning', :hash, :default => nil),
+        'schema_update_options' => config.param('schema_update_options', :array, :default => nil),
 
         # for debug
         'skip_load' => config.param('skip_load', :bool, :default => false),
@@ -230,6 +231,14 @@ module Embulk
         task['time_partitioning'] = {'type' => 'DAY'}
       end
 
+      if task['schema_update_options']
+        task['schema_update_options'].each do |schema_update_option|
+          unless %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION].include?(schema_update_option)
+            raise ConfigError.new "`schema_update_options` must contain either of ALLOW_FIELD_ADDITION or ALLOW_FIELD_RELAXATION or both"
+          end
+        end
+      end
+
       task
     end
 
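Worth noting for reviewers: the new block validates values at configure time instead of letting BigQuery reject the load job later. A minimal standalone sketch of the same check, assuming a plain `ArgumentError` and a hypothetical method name rather than the plugin's `ConfigError`:

```ruby
# Hypothetical standalone version of the configure-time check above.
ALLOWED_SCHEMA_UPDATE_OPTIONS = %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION].freeze

def validate_schema_update_options!(options)
  return if options.nil? # the option is optional; nil skips the check
  options.each do |option|
    unless ALLOWED_SCHEMA_UPDATE_OPTIONS.include?(option)
      raise ArgumentError,
            "`schema_update_options` must contain only ALLOW_FIELD_ADDITION " \
            "and/or ALLOW_FIELD_RELAXATION, got #{option.inspect}"
    end
  end
end

validate_schema_update_options!(%w[ALLOW_FIELD_ADDITION]) # passes
begin
  validate_schema_update_options!(%w[FOO])                # raises
rescue ArgumentError => e
  puts e.message
end
```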
@@ -292,19 +301,19 @@ module Embulk
           else
             bigquery.delete_table(task['table'])
           end
-          bigquery.create_table(task['table'], options: task)
+          bigquery.create_table(task['table'])
         when 'replace', 'replace_backup', 'append'
-          bigquery.create_table(task['temp_table'], options: task)
+          bigquery.create_table(task['temp_table'])
           if task['time_partitioning']
             if task['auto_create_table']
-              bigquery.create_table(task['table'], options: task)
+              bigquery.create_table(task['table'])
             else
               bigquery.get_table(task['table']) # raises NotFoundError
             end
           end
         else # append_direct
           if task['auto_create_table']
-            bigquery.create_table(task['table'], options: task)
+            bigquery.create_table(task['table'])
           else
             bigquery.get_table(task['table']) # raises NotFoundError
           end
@@ -313,7 +322,7 @@ module Embulk
       if task['mode'] == 'replace_backup'
         if task['time_partitioning'] and Helper.has_partition_decorator?(task['table_old'])
           if task['auto_create_table']
-            bigquery.create_table(task['table_old'], dataset: task['dataset_old'], options: task)
+            bigquery.create_table(task['table_old'], dataset: task['dataset_old'])
           else
             bigquery.get_table(task['table_old'], dataset: task['dataset_old']) # raises NotFoundError
           end
data/lib/embulk/output/bigquery/bigquery_client.rb CHANGED
@@ -194,6 +194,10 @@ module Embulk
             }
           }
 
+          if @task['schema_update_options']
+            body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
+          end
+
           opts = {
             upload_source: path,
             content_type: "application/octet-stream",
@@ -254,6 +258,10 @@ module Embulk
             }
           }
 
+          if @task['schema_update_options']
+            body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
+          end
+
           opts = {}
           Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
           response = with_network_retry { client.insert_job(@project, body, opts) }
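Both load paths (GCS and direct upload) pass the array through unchanged to the `jobs.insert` load configuration. A sketch of the resulting body shape; every value except `schema_update_options` is an assumed placeholder:

```ruby
# Illustrative shape of the load job body once schema_update_options is set.
# project/dataset/table and dispositions are placeholders, not plugin defaults.
body = {
  configuration: {
    load: {
      destination_table: {
        project_id: 'your-project-000',
        dataset_id: 'your_dataset_name',
        table_id: 'your_table_name',
      },
      write_disposition: 'WRITE_APPEND',
      source_format: 'NEWLINE_DELIMITED_JSON',
      schema_update_options: %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION],
    }
  }
}
```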
@@ -371,10 +379,16 @@ module Embulk
         end
       end
 
-      def create_table(table, dataset: nil, options: {})
+      def create_table(table, dataset: nil, options: nil)
         begin
-          table = Helper.chomp_partition_decorator(table)
           dataset ||= @dataset
+          options ||= {}
+          options['time_partitioning'] ||= @task['time_partitioning']
+          if Helper.has_partition_decorator?(table)
+            options['time_partitioning'] ||= {'type' => 'DAY'}
+            table = Helper.chomp_partition_decorator(table)
+          end
+
           Embulk.logger.info { "embulk-output-bigquery: Create table... #{@project}:#{dataset}.#{table}" }
           body = {
             table_reference: {
@@ -384,12 +398,14 @@ module Embulk
               fields: fields,
             }
           }
+
           if options['time_partitioning']
             body[:time_partitioning] = {
               type: options['time_partitioning']['type'],
               expiration_ms: options['time_partitioning']['expiration_ms'],
             }
           end
+
           opts = {}
           Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{dataset}, #{body}, #{opts})" }
           with_network_retry { client.insert_table(@project, dataset, body, opts) }
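The net effect of the two hunks above: `create_table` now derives `time_partitioning` on its own, first from the task, then from a partition decorator on the table name, so callers no longer pass `options: task`. A hedged usage sketch (the `client` construction is omitted; table names follow the examples above):

```ruby
# A "$YYYYMMDD" decorator now implies time_partitioning {'type' => 'DAY'},
# so this bare call creates a day-partitioned table:
client.create_table('your_table_name$20160929')

# An explicit option still wins over the decorator-derived default,
# since resolution uses ||= :
client.create_table('your_table_name$20160929', options: {
  'time_partitioning' => {'type' => 'DAY', 'expiration_ms' => 259_200_000}
})
```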
data/test/test_bigquery_client.rb CHANGED
@@ -110,7 +110,7 @@ else
       client.delete_table('your_table_name')
       assert_nothing_raised do
         client.create_table('your_table_name$20160929', options:{
-          'time_partitioning' => {'type'=>'DAY'}
+          'time_partitioning' => {'type'=>'DAY', 'expiration_ms'=>1000}
         })
       end
     end
@@ -153,15 +153,15 @@ else
 
     sub_test_case "delete_partition" do
       def test_delete_partition
-        client.create_table('your_table_name$20160929', options:{
-          'time_partitioning' => {'type'=>'DAY'}
-        })
+        client.delete_table('your_table_name')
+        client.create_table('your_table_name$20160929')
         assert_nothing_raised { client.delete_partition('your_table_name$20160929') }
       ensure
         client.delete_table('your_table_name')
       end
 
       def test_delete_partition_of_non_partitioned_table
+        client.delete_table('your_table_name')
         client.create_table('your_table_name')
         assert_raise { client.delete_partition('your_table_name$20160929') }
       ensure
data/test/test_configure.rb CHANGED
@@ -266,6 +266,14 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         assert_equal 'DAY', task['time_partitioning']['type']
       end
+
+      def test_schema_update_options
+        config = least_config.merge('schema_update_options' => ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'])
+        assert_nothing_raised { Bigquery.configure(config, schema, processor_count) }
+
+        config = least_config.merge('schema_update_options' => ['FOO'])
+        assert_raise { Bigquery.configure(config, schema, processor_count) }
+      end
     end
   end
 end
data/test/test_transaction.rb CHANGED
@@ -55,7 +55,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).create_dataset(config['dataset'])
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
@@ -74,7 +74,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).create_dataset(config['dataset'])
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
@@ -87,7 +87,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).delete_table(config['table'])
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
@@ -98,7 +98,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).delete_partition(config['table'])
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
@@ -110,7 +110,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
           mock(obj).delete_table(config['temp_table'])
         end
@@ -122,7 +122,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).get_table(config['table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
           mock(obj).delete_table(config['temp_table'])
@@ -135,8 +135,8 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['temp_table'])
+          mock(obj).create_table(config['table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
           mock(obj).delete_table(config['temp_table'])
         end
@@ -151,7 +151,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).get_dataset(config['dataset_old'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
 
           mock(obj).get_table(task['table'])
           mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -168,7 +168,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).create_dataset(config['dataset'])
           mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
 
           mock(obj).get_table(task['table'])
           mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -185,7 +185,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).get_dataset(config['dataset_old'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).get_table(task['table'])
           mock(obj).get_table(task['table_old'], dataset: config['dataset_old'])
 
@@ -204,9 +204,9 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).get_dataset(config['dataset_old'])
-          mock(obj).create_table(config['temp_table'], options: task)
-          mock(obj).create_table(task['table'], options: task)
-          mock(obj).create_table(task['table_old'], dataset: config['dataset_old'], options: task)
+          mock(obj).create_table(config['temp_table'])
+          mock(obj).create_table(task['table'])
+          mock(obj).create_table(task['table_old'], dataset: config['dataset_old'])
 
           mock(obj).get_table(task['table'])
           mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -224,7 +224,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
           mock(obj).delete_table(config['temp_table'])
         end
@@ -236,7 +236,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).get_table(config['table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
           mock(obj).delete_table(config['temp_table'])
@@ -249,15 +249,14 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['temp_table'])
+          mock(obj).create_table(config['table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
           mock(obj).delete_table(config['temp_table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
     end
-
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.4.1
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-01 00:00:00.000000000 Z
+date: 2016-10-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client
@@ -97,6 +97,7 @@ files:
 - README.md
 - Rakefile
 - embulk-output-bigquery.gemspec
+- example/config_append_direct_schema_update_options.yml
 - example/config_client_options.yml
 - example/config_csv.yml
 - example/config_delete_in_advance.yml