embulk-output-bigquery 0.7.4 → 0.7.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/README.md +24 -0
- data/embulk-output-bigquery.gemspec +1 -1
- data/lib/embulk/output/bigquery/bigquery_client.rb +12 -0
- data/lib/embulk/output/bigquery.rb +43 -1
- data/test/test_configure.rb +38 -0
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bacb610086a2bbd94300aa3401565e2101bf8b094ef10a75e5c666d768ae5190
|
4
|
+
data.tar.gz: 6121440d4864f5561567ad6a4bc64151377bb8b840a6954e8303435cd83c291d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e9bf6482b42a2d2a159babb0213418330283ec81b9a6bfeb2e85d1b1feed1cbf2d5c955007f055144951a4d51793cb996111229b98035031943731974bc57ae
|
7
|
+
data.tar.gz: 149abe8691c92b5ab32db84cfac98a71a7601b63f1b210eb9bb6011fb5124b80a8cd93fbb475809cd6dafa114792bd7599555dfdc27d6a34a827876efc2aa33d
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -110,6 +110,12 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
|
|
110
110
|
| time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
|
111
111
|
| time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
|
112
112
|
| time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning |
|
113
|
+
| range_partitioning | hash | optional | nil | See [Range Partitioning](#range-partitioning) |
|
114
|
+
| range_partitioning.field | string | required | nil | `INT64` column used for partitioning |
|
115
|
+
| range_partitioning.range | hash | required | nil | Defines the ranges for range partitioning |
|
116
|
+
| range_partitioning.range.start | int | required | nil | The start of range partitioning, inclusive. |
|
117
|
+
| range_partitioning.range.end | int | required | nil | The end of range partitioning, exclusive. |
|
118
|
+
| range_partitioning.range.interval | int | required | nil | The width of each interval. |
|
113
119
|
| clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so must be used with `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
|
114
120
|
| clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
|
115
121
|
| schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` job, that is, is not effective for most of modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes origin table so does not need to update schema. Only `append_direct` can utilize schema update. |
|
@@ -448,6 +454,24 @@ MEMO: [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/big
|
|
448
454
|
to update the schema of the destination table as a side effect of the load job, but it is not available for copy job.
|
449
455
|
Thus, it was not suitable for embulk-output-bigquery idempotence modes, `append`, `replace`, and `replace_backup`, sigh.
|
450
456
|
|
457
|
+
### Range Partitioning
|
458
|
+
|
459
|
+
See also [Creating and Updating Range-Partitioned Tables](https://cloud.google.com/bigquery/docs/creating-partitioned-tables).
|
460
|
+
|
461
|
+
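To load into a range-partitioned table, specify the `range_partitioning` parameter. Do not add a partition decorator (e.g. `table_name$1`) to `table`: the plugin raises a `ConfigError` when a partition decorator is combined with `range_partitioning`.
|
462
|
+
|
463
|
+
```yaml
|
464
|
+
out:
|
465
|
+
type: bigquery
|
466
|
+
table: table_name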
|
467
|
+
range_partitioning:
|
468
|
+
field: customer_id
|
469
|
+
range:
|
470
|
+
start: 1
|
471
|
+
end: 99999
|
472
|
+
interval: 1
|
473
|
+
```
|
474
|
+
|
451
475
|
## Development
|
452
476
|
|
453
477
|
### Run example:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.7.4"
|
3
|
+
spec.version = "0.7.5"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
@@ -435,6 +435,18 @@ module Embulk
|
|
435
435
|
}
|
436
436
|
end
|
437
437
|
|
438
|
+
options['range_partitioning'] ||= @task['range_partitioning']
|
439
|
+
if options['range_partitioning']
|
440
|
+
body[:range_partitioning] = {
|
441
|
+
field: options['range_partitioning']['field'],
|
442
|
+
range: {
|
443
|
+
start: options['range_partitioning']['range']['start'].to_s,
|
444
|
+
end: options['range_partitioning']['range']['end'].to_s,
|
445
|
+
interval: options['range_partitioning']['range']['interval'].to_s,
|
446
|
+
},
|
447
|
+
}
|
448
|
+
end
|
449
|
+
|
438
450
|
options['clustering'] ||= @task['clustering']
|
439
451
|
if options['clustering']
|
440
452
|
body[:clustering] = {
|
@@ -89,6 +89,7 @@ module Embulk
|
|
89
89
|
'ignore_unknown_values' => config.param('ignore_unknown_values', :bool, :default => false),
|
90
90
|
'allow_quoted_newlines' => config.param('allow_quoted_newlines', :bool, :default => false),
|
91
91
|
'time_partitioning' => config.param('time_partitioning', :hash, :default => nil),
|
92
|
+
'range_partitioning' => config.param('range_partitioning', :hash, :default => nil),
|
92
93
|
'clustering' => config.param('clustering', :hash, :default => nil), # google-api-ruby-client >= v0.21.0
|
93
94
|
'schema_update_options' => config.param('schema_update_options', :array, :default => nil),
|
94
95
|
|
@@ -227,14 +228,55 @@ module Embulk
|
|
227
228
|
task['abort_on_error'] = (task['max_bad_records'] == 0)
|
228
229
|
end
|
229
230
|
|
231
|
+
if task['time_partitioning'] && task['range_partitioning']
|
232
|
+
raise ConfigError.new "`time_partitioning` and `range_partitioning` cannot be used at the same time"
|
233
|
+
end
|
234
|
+
|
230
235
|
if task['time_partitioning']
|
231
236
|
unless task['time_partitioning']['type']
|
232
237
|
raise ConfigError.new "`time_partitioning` must have `type` key"
|
233
238
|
end
|
234
|
-
|
239
|
+
end
|
240
|
+
|
241
|
+
if Helper.has_partition_decorator?(task['table'])
|
242
|
+
if task['range_partitioning']
|
243
|
+
raise ConfigError.new "Partition decorators(`#{task['table']}`) don't support `range_partition`"
|
244
|
+
end
|
235
245
|
task['time_partitioning'] = {'type' => 'DAY'}
|
236
246
|
end
|
237
247
|
|
248
|
+
if task['range_partitioning']
|
249
|
+
unless task['range_partitioning']['field']
|
250
|
+
raise ConfigError.new "`range_partitioning` must have `field` key"
|
251
|
+
end
|
252
|
+
unless task['range_partitioning']['range']
|
253
|
+
raise ConfigError.new "`range_partitioning` must have `range` key"
|
254
|
+
end
|
255
|
+
|
256
|
+
range = task['range_partitioning']['range']
|
257
|
+
unless range['start']
|
258
|
+
raise ConfigError.new "`range_partitioning` must have `range.start` key"
|
259
|
+
end
|
260
|
+
unless range['start'].is_a?(Integer)
|
261
|
+
raise ConfigError.new "`range_partitioning.range.start` must be an integer"
|
262
|
+
end
|
263
|
+
unless range['end']
|
264
|
+
raise ConfigError.new "`range_partitioning` must have `range.end` key"
|
265
|
+
end
|
266
|
+
unless range['end'].is_a?(Integer)
|
267
|
+
raise ConfigError.new "`range_partitioning.range.end` must be an integer"
|
268
|
+
end
|
269
|
+
unless range['interval']
|
270
|
+
raise ConfigError.new "`range_partitioning` must have `range.interval` key"
|
271
|
+
end
|
272
|
+
unless range['interval'].is_a?(Integer)
|
273
|
+
raise ConfigError.new "`range_partitioning.range.interval` must be an integer"
|
274
|
+
end
|
275
|
+
if range['start'] + range['interval'] >= range['end']
|
276
|
+
raise ConfigError.new "`range_partitioning.range.start` + `range_partitioning.range.interval` must be less than `range_partitioning.range.end`"
|
277
|
+
end
|
278
|
+
end
|
279
|
+
|
238
280
|
if task['clustering']
|
239
281
|
unless task['clustering']['fields']
|
240
282
|
raise ConfigError.new "`clustering` must have `fields` key"
|
data/test/test_configure.rb
CHANGED
@@ -270,6 +270,44 @@ module Embulk
|
|
270
270
|
assert_equal 'DAY', task['time_partitioning']['type']
|
271
271
|
end
|
272
272
|
|
273
|
+
def test_range_partitioning
|
274
|
+
config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 3, 'interval' => 1 }})
|
275
|
+
assert_nothing_raised { Bigquery.configure(config, schema, processor_count) }
|
276
|
+
|
277
|
+
# field is required
|
278
|
+
config = least_config.merge('range_partitioning' => {'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }})
|
279
|
+
assert_raise { Bigquery.configure(config, schema, processor_count) }
|
280
|
+
|
281
|
+
|
282
|
+
# range is required
|
283
|
+
config = least_config.merge('range_partitioning' => {'field' => 'foo'})
|
284
|
+
assert_raise { Bigquery.configure(config, schema, processor_count) }
|
285
|
+
|
286
|
+
# range.start is required
|
287
|
+
config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'end' => 2, 'interval' => 1 }})
|
288
|
+
assert_raise { Bigquery.configure(config, schema, processor_count) }
|
289
|
+
|
290
|
+
# range.end is required
|
291
|
+
config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'interval' => 1 }})
|
292
|
+
assert_raise { Bigquery.configure(config, schema, processor_count) }
|
293
|
+
|
294
|
+
# range.interval is required
|
295
|
+
config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2 }})
|
296
|
+
assert_raise { Bigquery.configure(config, schema, processor_count) }
|
297
|
+
|
298
|
+
# range.start + range.interval should be less than range.end
|
299
|
+
config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 2 }})
|
300
|
+
assert_raise { Bigquery.configure(config, schema, processor_count) }
|
301
|
+
end
|
302
|
+
|
303
|
+
def test_time_and_range_partitioning_error
|
304
|
+
config = least_config.merge('time_partitioning' => {'type' => 'DAY'}, 'range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }})
|
305
|
+
assert_raise { Bigquery.configure(config, schema, processor_count) }
|
306
|
+
|
307
|
+
config = least_config.merge('table' => 'table_name$20160912', 'range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }})
|
308
|
+
assert_raise { Bigquery.configure(config, schema, processor_count) }
|
309
|
+
end
|
310
|
+
|
273
311
|
def test_clustering
|
274
312
|
config = least_config.merge('clustering' => {'fields' => ['field_a']})
|
275
313
|
assert_nothing_raised { Bigquery.configure(config, schema, processor_count) }
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.4
|
4
|
+
version: 0.7.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
8
8
|
- Naotoshi Seo
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2025-05-14 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: google-apis-storage_v1
|
@@ -147,7 +147,7 @@ homepage: https://github.com/embulk/embulk-output-bigquery
|
|
147
147
|
licenses:
|
148
148
|
- MIT
|
149
149
|
metadata: {}
|
150
|
-
post_install_message:
|
150
|
+
post_install_message:
|
151
151
|
rdoc_options: []
|
152
152
|
require_paths:
|
153
153
|
- lib
|
@@ -163,7 +163,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
163
163
|
version: '0'
|
164
164
|
requirements: []
|
165
165
|
rubygems_version: 3.5.3
|
166
|
-
signing_key:
|
166
|
+
signing_key:
|
167
167
|
specification_version: 4
|
168
168
|
summary: Google BigQuery output plugin for Embulk
|
169
169
|
test_files:
|