embulk-output-bigquery 0.4.0 → 0.4.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 71bc9b253f725436a06e183667cbc87720c3719b
- data.tar.gz: a32e43da05a4f90ab72c5715ffdf6b08501996d4
+ metadata.gz: 1ae4bf7af71e37194f768fad9e16e415747fee70
+ data.tar.gz: 796f2c3253d5600c439597f7ca495a7d1d8bac95
  SHA512:
- metadata.gz: bd3d8aefbc98c2f044b782f807f595603ac7b11052a06b6486803fd2f6871127058a50e9c69ffc1fac92b75de9561c57e99ad9ba3cd8899507e93085d45ed615
- data.tar.gz: 813b6455f463940968232b4332b8553698b9ef99ad4f3f5af6800b10223c33498fde9f8915604090f85dc7c2f78d16a865cd90da2174447c7f84ab3ef80a4cf8
+ metadata.gz: 3011c4128b2ed28a0fd84d0e2a592d706434ea24746c3aaf44f2e86f588b83da80e59b2931426cba0d390906ec283690966a4eb3b092d7d8edce364fc6ecc2b2
+ data.tar.gz: e969f903a71e5bf500fc57fb7a8b2ae2e4ffa8760557c329d3013de19959016ba7ef0578df3bdb7fb7cb0f62a3cb7a58f62cdfa23bb81d46476390884461e1a3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## 0.4.1 - 2016-10-03
+
+ * [enhancement] Support `schema_update_options` option
+
  ## 0.4.0 - 2016-10-01

  * [enhancement] Support partitioned table
data/README.md CHANGED
@@ -100,9 +100,10 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
  | encoding | string | optional | "UTF-8" | `UTF-8` or `ISO-8859-1` |
  | ignore_unknown_values | boolean | optional | false | |
  | allow_quoted_newlines | boolean | optional | false | Set true if data contains newline characters. It may cause slow processing |
- | time_partitioning | hash | optional | nil | See [Time Partitioning](#time-partitioning) |
+ | time_partitioning | hash | optional | `{"type":"DAY"}` if `table` parameter has a partition decorator, otherwise nil | See [Time Partitioning](#time-partitioning) |
  | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
  | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
+ | schema_update_options | array | optional | nil | List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) |

  ### Example
 
@@ -365,7 +366,7 @@ Using `gcs_bucket` option, such strategy is enabled. You may also use `auto_crea
  out:
    type: bigquery
    gcs_bucket: bucket_name
-   auto_create_gcs_bucket: false
+   auto_create_gcs_bucket: true
  ```

  ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers into GCS.
@@ -391,11 +392,30 @@ out:
    type: bigquery
    table: table_name$20160929
    auto_create_table: true
-   time-partitioning:
+   time_partitioning:
      type: DAY
      expiration_ms: 259200000
  ```

+ Use `schema_update_options` to allow the schema of the destination table to be updated as a side effect of the load job:
+
+ ```yaml
+ out:
+   type: bigquery
+   table: table_name$20160929
+   auto_create_table: true
+   time_partitioning:
+     type: DAY
+     expiration_ms: 259200000
+   schema_update_options:
+     - ALLOW_FIELD_ADDITION
+     - ALLOW_FIELD_RELAXATION
+ ```
+
+ It seems that only adding a new column and relaxing a `REQUIRED` column to `NULLABLE` are supported now.
+ Deleting and renaming columns are not supported.
+ See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) for details.
+
  ## Development

  ### Run example:
embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
  Gem::Specification.new do |spec|
    spec.name = "embulk-output-bigquery"
-   spec.version = "0.4.0"
+   spec.version = "0.4.1"
    spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
    spec.summary = "Google BigQuery output plugin for Embulk"
    spec.description = "Embulk plugin that insert records to Google BigQuery."
example/config_append_direct_schema_update_options.yml ADDED
@@ -0,0 +1,31 @@
+ in:
+   type: file
+   path_prefix: example/example.csv
+   parser:
+     type: csv
+     charset: UTF-8
+     newline: CRLF
+     null_string: 'NULL'
+     skip_header_lines: 1
+     comment_line_marker: '#'
+     columns:
+       - {name: date, type: string}
+       - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
+       - {name: "null", type: string}
+       - {name: long, type: long}
+       - {name: string, type: string}
+       - {name: double, type: double}
+       - {name: boolean, type: boolean}
+ out:
+   type: bigquery
+   mode: append_direct
+   auth_method: json_key
+   json_keyfile: example/your-project-000.json
+   dataset: your_dataset_name
+   table: your_table_name
+   source_format: NEWLINE_DELIMITED_JSON
+   compression: NONE
+   auto_create_dataset: true
+   auto_create_table: true
+   schema_file: example/schema.json
+   schema_update_options: [ALLOW_FIELD_ADDITION, ALLOW_FIELD_RELAXATION]
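This new example presumably runs with `embulk run example/config_append_direct_schema_update_options.yml` once `json_keyfile`, `dataset`, and `table` point at a real project; the `example/example.csv` and `example/schema.json` paths follow the gem's other example configs.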
lib/embulk/output/bigquery.rb CHANGED
@@ -86,6 +86,7 @@ module Embulk
    'ignore_unknown_values' => config.param('ignore_unknown_values', :bool, :default => false),
    'allow_quoted_newlines' => config.param('allow_quoted_newlines', :bool, :default => false),
    'time_partitioning' => config.param('time_partitioning', :hash, :default => nil),
+   'schema_update_options' => config.param('schema_update_options', :array, :default => nil),

    # for debug
    'skip_load' => config.param('skip_load', :bool, :default => false),
@@ -230,6 +231,14 @@ module Embulk
      task['time_partitioning'] = {'type' => 'DAY'}
    end

+   if task['schema_update_options']
+     task['schema_update_options'].each do |schema_update_option|
+       unless %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION].include?(schema_update_option)
+         raise ConfigError.new "`schema_update_options` must contain either of ALLOW_FIELD_ADDITION or ALLOW_FIELD_RELAXATION or both"
+       end
+     end
+   end
+
    task
  end

@@ -292,19 +301,19 @@ module Embulk
    else
      bigquery.delete_table(task['table'])
    end
-   bigquery.create_table(task['table'], options: task)
+   bigquery.create_table(task['table'])
  when 'replace', 'replace_backup', 'append'
-   bigquery.create_table(task['temp_table'], options: task)
+   bigquery.create_table(task['temp_table'])
    if task['time_partitioning']
      if task['auto_create_table']
-       bigquery.create_table(task['table'], options: task)
+       bigquery.create_table(task['table'])
      else
        bigquery.get_table(task['table']) # raises NotFoundError
      end
    end
  else # append_direct
    if task['auto_create_table']
-     bigquery.create_table(task['table'], options: task)
+     bigquery.create_table(task['table'])
    else
      bigquery.get_table(task['table']) # raises NotFoundError
    end
@@ -313,7 +322,7 @@ module Embulk
  if task['mode'] == 'replace_backup'
    if task['time_partitioning'] and Helper.has_partition_decorator?(task['table_old'])
      if task['auto_create_table']
-       bigquery.create_table(task['table_old'], dataset: task['dataset_old'], options: task)
+       bigquery.create_table(task['table_old'], dataset: task['dataset_old'])
      else
        bigquery.get_table(task['table_old'], dataset: task['dataset_old']) # raises NotFoundError
      end
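For reference, the `schema_update_options` guard added to `configure` above accepts only the two values BigQuery documents. A minimal standalone sketch of the same check (hypothetical helper; Embulk's `ConfigError` replaced by `ArgumentError` so it runs outside the plugin):

```ruby
# Standalone sketch of the validation added in the configure hunk above.
ALLOWED_SCHEMA_UPDATE_OPTIONS = %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION]

def validate_schema_update_options!(options)
  Array(options).each do |option|
    unless ALLOWED_SCHEMA_UPDATE_OPTIONS.include?(option)
      raise ArgumentError, "`schema_update_options` must contain either of " \
                           "ALLOW_FIELD_ADDITION or ALLOW_FIELD_RELAXATION or both"
    end
  end
end

validate_schema_update_options!(%w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION]) # passes
# validate_schema_update_options!(%w[FOO])  # => raises ArgumentError
```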
lib/embulk/output/bigquery/bigquery_client.rb CHANGED
@@ -194,6 +194,10 @@ module Embulk
      }
    }

+   if @task['schema_update_options']
+     body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
+   end
+
    opts = {
      upload_source: path,
      content_type: "application/octet-stream",
@@ -254,6 +258,10 @@ module Embulk
      }
    }

+   if @task['schema_update_options']
+     body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
+   end
+
    opts = {}
    Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
    response = with_network_retry { client.insert_job(@project, body, opts) }
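Both load paths in the hunks above gain the same conditional, so a job built with the option set carries one extra key in its configuration. A minimal sketch of the resulting shape, assuming `@task['schema_update_options']` holds the two-element list from the README example (other load settings elided):

```ruby
# Sketch of the insert_job body contributed by the added lines.
body = {
  configuration: {
    load: {
      # ...destination_table, schema, source_format, etc. as before...
      schema_update_options: ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'],
    }
  }
}
# body is then passed to client.insert_job(@project, body, opts) as in the hunk above.
```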
@@ -371,10 +379,16 @@ module Embulk
      end
    end

-   def create_table(table, dataset: nil, options: {})
+   def create_table(table, dataset: nil, options: nil)
      begin
-       table = Helper.chomp_partition_decorator(table)
        dataset ||= @dataset
+       options ||= {}
+       options['time_partitioning'] ||= @task['time_partitioning']
+       if Helper.has_partition_decorator?(table)
+         options['time_partitioning'] ||= {'type' => 'DAY'}
+         table = Helper.chomp_partition_decorator(table)
+       end
+
        Embulk.logger.info { "embulk-output-bigquery: Create table... #{@project}:#{dataset}.#{table}" }
        body = {
          table_reference: {
@@ -384,12 +398,14 @@ module Embulk
          fields: fields,
        }
      }
+
      if options['time_partitioning']
        body[:time_partitioning] = {
          type: options['time_partitioning']['type'],
          expiration_ms: options['time_partitioning']['expiration_ms'],
        }
      end
+
      opts = {}
      Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{dataset}, #{body}, #{opts})" }
      with_network_retry { client.insert_table(@project, dataset, body, opts) }
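Net effect of the two `create_table` hunks: callers stop threading `options: task` through, and the client now derives `time_partitioning` itself, defaulting to daily partitioning whenever the table name carries a partition decorator. A sketch of the resulting behavior (table name borrowed from the tests below; assumes `@task['time_partitioning']` is nil):

```ruby
client.create_table('your_table_name$20160929')
# 1. options defaults to {} and picks up @task['time_partitioning'] (nil here).
# 2. The '$20160929' decorator is detected, so options['time_partitioning']
#    falls back to {'type' => 'DAY'} and the decorator is chomped.
# 3. insert_table receives table_id 'your_table_name' with
#    body[:time_partitioning] = {type: 'DAY', expiration_ms: nil}.
```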
test/test_bigquery_client.rb CHANGED
@@ -110,7 +110,7 @@ else
    client.delete_table('your_table_name')
    assert_nothing_raised do
      client.create_table('your_table_name$20160929', options:{
-       'time_partitioning' => {'type'=>'DAY'}
+       'time_partitioning' => {'type'=>'DAY', 'expiration_ms'=>1000}
      })
    end
  end
@@ -153,15 +153,15 @@ else

  sub_test_case "delete_partition" do
    def test_delete_partition
-     client.create_table('your_table_name$20160929', options:{
-       'time_partitioning' => {'type'=>'DAY'}
-     })
+     client.delete_table('your_table_name')
+     client.create_table('your_table_name$20160929')
      assert_nothing_raised { client.delete_partition('your_table_name$20160929') }
    ensure
      client.delete_table('your_table_name')
    end

    def test_delete_partition_of_non_partitioned_table
+     client.delete_table('your_table_name')
      client.create_table('your_table_name')
      assert_raise { client.delete_partition('your_table_name$20160929') }
    ensure
test/test_configure.rb CHANGED
@@ -266,6 +266,14 @@ module Embulk
      task = Bigquery.configure(config, schema, processor_count)
      assert_equal 'DAY', task['time_partitioning']['type']
    end
+
+   def test_schema_update_options
+     config = least_config.merge('schema_update_options' => ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'])
+     assert_nothing_raised { Bigquery.configure(config, schema, processor_count) }
+
+     config = least_config.merge('schema_update_options' => ['FOO'])
+     assert_raise { Bigquery.configure(config, schema, processor_count) }
+   end
  end
  end
  end
test/test_transaction.rb CHANGED
@@ -55,7 +55,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).create_dataset(config['dataset'])
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -74,7 +74,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).create_dataset(config['dataset'])
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -87,7 +87,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).delete_table(config['table'])
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -98,7 +98,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).delete_partition(config['table'])
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -110,7 +110,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
      mock(obj).delete_table(config['temp_table'])
    end
@@ -122,7 +122,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).get_table(config['table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
      mock(obj).delete_table(config['temp_table'])
@@ -135,8 +135,8 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['temp_table'])
+     mock(obj).create_table(config['table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
      mock(obj).delete_table(config['temp_table'])
    end
@@ -151,7 +151,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).get_dataset(config['dataset_old'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])

      mock(obj).get_table(task['table'])
      mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -168,7 +168,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).create_dataset(config['dataset'])
      mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])

      mock(obj).get_table(task['table'])
      mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -185,7 +185,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).get_dataset(config['dataset_old'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).get_table(task['table'])
      mock(obj).get_table(task['table_old'], dataset: config['dataset_old'])

@@ -204,9 +204,9 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).get_dataset(config['dataset_old'])
-     mock(obj).create_table(config['temp_table'], options: task)
-     mock(obj).create_table(task['table'], options: task)
-     mock(obj).create_table(task['table_old'], dataset: config['dataset_old'], options: task)
+     mock(obj).create_table(config['temp_table'])
+     mock(obj).create_table(task['table'])
+     mock(obj).create_table(task['table_old'], dataset: config['dataset_old'])

      mock(obj).get_table(task['table'])
      mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -224,7 +224,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
      mock(obj).delete_table(config['temp_table'])
    end
@@ -236,7 +236,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).get_table(config['table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
      mock(obj).delete_table(config['temp_table'])
@@ -249,15 +249,14 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['temp_table'])
+     mock(obj).create_table(config['table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
      mock(obj).delete_table(config['temp_table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
  end
-
  end
  end
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-bigquery
  version: !ruby/object:Gem::Version
-   version: 0.4.0
+   version: 0.4.1
  platform: ruby
  authors:
  - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-10-01 00:00:00.000000000 Z
+ date: 2016-10-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: google-api-client
@@ -97,6 +97,7 @@ files:
  - README.md
  - Rakefile
  - embulk-output-bigquery.gemspec
+ - example/config_append_direct_schema_update_options.yml
  - example/config_client_options.yml
  - example/config_csv.yml
  - example/config_delete_in_advance.yml