embulk-output-bigquery 0.7.4 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ee5e3d543b40d3a9e1cd8a0d9af9aa05ed84e12abcd805d6b6cc9c9ffc79e825
4
- data.tar.gz: 4257cb1626c92e3d46be2dff689b0a4b1efa8983c7531913a195cb074e77bc36
3
+ metadata.gz: bacb610086a2bbd94300aa3401565e2101bf8b094ef10a75e5c666d768ae5190
4
+ data.tar.gz: 6121440d4864f5561567ad6a4bc64151377bb8b840a6954e8303435cd83c291d
5
5
  SHA512:
6
- metadata.gz: 9559bef20b7a5f644871f74bd64dbf90c9776deccbd0a59b2516a8a2fdab9a4952dd78dc8ec365992384286ec2f23ae15fa59d8e71f5dff36a72b38860a74bfe
7
- data.tar.gz: edb4085785ad9ae94a53e31f4f13afec71da983856e0b94bde3e080d3280fb9e28c60ce259ecc3e336a77035afc9433e7ece11b3bdd6697c5e8bf7a52462eeb1
6
+ metadata.gz: 2e9bf6482b42a2d2a159babb0213418330283ec81b9a6bfeb2e85d1b1feed1cbf2d5c955007f055144951a4d51793cb996111229b98035031943731974bc57ae
7
+ data.tar.gz: 149abe8691c92b5ab32db84cfac98a71a7601b63f1b210eb9bb6011fb5124b80a8cd93fbb475809cd6dafa114792bd7599555dfdc27d6a34a827876efc2aa33d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ ## 0.7.5 - 2025-05-13
2
+ * [enhancement] Add range partitioning support (Thanks to kitagry) #174
3
+
1
4
  ## 0.7.4 - 2024-12-19
2
5
  * [maintenance] Primary location unless location is set explicitly (Thanks to joker1007) #172
3
6
 
data/README.md CHANGED
@@ -110,6 +110,12 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
110
110
  | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
111
111
  | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
112
112
  | time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning |
113
+ | range_partitioning | hash | optional | nil | See [Range Partitioning](#range-partitioning) |
114
+ | range_partitioning.field | string | required | nil | `INT64` column used for partitioning |
115
+ | range_partitioning.range | hash | required | nil | Defines the ranges for range partitioning |
116
+ | range_partitioning.range.start | int | required | nil | The start of range partitioning, inclusive. |
117
+ | range_partitioning.range.end | int | required | nil | The end of range partitioning, exclusive. |
118
+ | range_partitioning.range.interval | int | required | nil | The width of each interval. |
113
119
  | clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so must be used with `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
114
120
  | clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
115
121
  | schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` job, that is, is not effective for most of modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes origin table so does not need to update schema. Only `append_direct` can utilize schema update. |
@@ -448,6 +454,24 @@ MEMO: [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/big
448
454
  to update the schema of the destination table as a side effect of the load job, but it is not available for copy job.
449
455
  Thus, it was not suitable for embulk-output-bigquery idempotence modes, `append`, `replace`, and `replace_backup`, sigh.
450
456
 
457
+ ### Range Partitioning
458
+
459
+ See also [Creating and Updating Range-Partitioned Tables](https://cloud.google.com/bigquery/docs/creating-partitioned-tables).
460
+
461
+ To load into a partition, specify `range_partitioning` and `table` parameter with a partition decorator as:
462
+
463
+ ```yaml
464
+ out:
465
+ type: bigquery
466
+ table: table_name$1
467
+ range_partitioning:
468
+ field: customer_id
469
+ range:
470
+ start: 1
471
+ end: 99999
472
+ interval: 1
473
+ ```
474
+
451
475
  ## Development
452
476
 
453
477
  ### Run example:
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-output-bigquery"
3
- spec.version = "0.7.4"
3
+ spec.version = "0.7.5"
4
4
  spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
5
5
  spec.summary = "Google BigQuery output plugin for Embulk"
6
6
  spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -435,6 +435,18 @@ module Embulk
435
435
  }
436
436
  end
437
437
 
438
+ options['range_partitioning'] ||= @task['range_partitioning']
439
+ if options['range_partitioning']
440
+ body[:range_partitioning] = {
441
+ field: options['range_partitioning']['field'],
442
+ range: {
443
+ start: options['range_partitioning']['range']['start'].to_s,
444
+ end: options['range_partitioning']['range']['end'].to_s,
445
+ interval: options['range_partitioning']['range']['interval'].to_s,
446
+ },
447
+ }
448
+ end
449
+
438
450
  options['clustering'] ||= @task['clustering']
439
451
  if options['clustering']
440
452
  body[:clustering] = {
@@ -89,6 +89,7 @@ module Embulk
89
89
  'ignore_unknown_values' => config.param('ignore_unknown_values', :bool, :default => false),
90
90
  'allow_quoted_newlines' => config.param('allow_quoted_newlines', :bool, :default => false),
91
91
  'time_partitioning' => config.param('time_partitioning', :hash, :default => nil),
92
+ 'range_partitioning' => config.param('range_partitioning', :hash, :default => nil),
92
93
  'clustering' => config.param('clustering', :hash, :default => nil), # google-api-ruby-client >= v0.21.0
93
94
  'schema_update_options' => config.param('schema_update_options', :array, :default => nil),
94
95
 
@@ -227,14 +228,55 @@ module Embulk
227
228
  task['abort_on_error'] = (task['max_bad_records'] == 0)
228
229
  end
229
230
 
231
+ if task['time_partitioning'] && task['range_partitioning']
232
+ raise ConfigError.new "`time_partitioning` and `range_partitioning` cannot be used at the same time"
233
+ end
234
+
230
235
  if task['time_partitioning']
231
236
  unless task['time_partitioning']['type']
232
237
  raise ConfigError.new "`time_partitioning` must have `type` key"
233
238
  end
234
- elsif Helper.has_partition_decorator?(task['table'])
239
+ end
240
+
241
+ if Helper.has_partition_decorator?(task['table'])
242
+ if task['range_partitioning']
243
+ raise ConfigError.new "Partition decorators(`#{task['table']}`) don't support `range_partition`"
244
+ end
235
245
  task['time_partitioning'] = {'type' => 'DAY'}
236
246
  end
237
247
 
248
+ if task['range_partitioning']
249
+ unless task['range_partitioning']['field']
250
+ raise ConfigError.new "`range_partitioning` must have `field` key"
251
+ end
252
+ unless task['range_partitioning']['range']
253
+ raise ConfigError.new "`range_partitioning` must have `range` key"
254
+ end
255
+
256
+ range = task['range_partitioning']['range']
257
+ unless range['start']
258
+ raise ConfigError.new "`range_partitioning` must have `range.start` key"
259
+ end
260
+ unless range['start'].is_a?(Integer)
261
+ raise ConfigError.new "`range_partitioning.range.start` must be an integer"
262
+ end
263
+ unless range['end']
264
+ raise ConfigError.new "`range_partitioning` must have `range.end` key"
265
+ end
266
+ unless range['end'].is_a?(Integer)
267
+ raise ConfigError.new "`range_partitioning.range.end` must be an integer"
268
+ end
269
+ unless range['interval']
270
+ raise ConfigError.new "`range_partitioning` must have `range.interval` key"
271
+ end
272
+ unless range['interval'].is_a?(Integer)
273
+ raise ConfigError.new "`range_partitioning.range.interval` must be an integer"
274
+ end
275
+ if range['start'] + range['interval'] >= range['end']
276
+ raise ConfigError.new "`range_partitioning.range.start` + `range_partitioning.range.interval` must be less than `range_partitioning.range.end`"
277
+ end
278
+ end
279
+
238
280
  if task['clustering']
239
281
  unless task['clustering']['fields']
240
282
  raise ConfigError.new "`clustering` must have `fields` key"
@@ -270,6 +270,44 @@ module Embulk
270
270
  assert_equal 'DAY', task['time_partitioning']['type']
271
271
  end
272
272
 
273
+ def test_range_partitioning
274
+ config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 3, 'interval' => 1 }})
275
+ assert_nothing_raised { Bigquery.configure(config, schema, processor_count) }
276
+
277
+ # field is required
278
+ config = least_config.merge('range_partitioning' => {'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }})
279
+ assert_raise { Bigquery.configure(config, schema, processor_count) }
280
+
281
+
282
+ # range is required
283
+ config = least_config.merge('range_partitioning' => {'field' => 'foo'})
284
+ assert_raise { Bigquery.configure(config, schema, processor_count) }
285
+
286
+ # range.start is required
287
+ config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'end' => 2, 'interval' => 1 }})
288
+ assert_raise { Bigquery.configure(config, schema, processor_count) }
289
+
290
+ # range.end is required
291
+ config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'interval' => 1 }})
292
+ assert_raise { Bigquery.configure(config, schema, processor_count) }
293
+
294
+ # range.interval is required
295
+ config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2 }})
296
+ assert_raise { Bigquery.configure(config, schema, processor_count) }
297
+
298
+ # range.start + range.interval should be less than range.end
299
+ config = least_config.merge('range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 2 }})
300
+ assert_raise { Bigquery.configure(config, schema, processor_count) }
301
+ end
302
+
303
+ def test_time_and_range_partitioning_error
304
+ config = least_config.merge('time_partitioning' => {'type' => 'DAY'}, 'range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }})
305
+ assert_raise { Bigquery.configure(config, schema, processor_count) }
306
+
307
+ config = least_config.merge('table' => 'table_name$20160912', 'range_partitioning' => {'field' => 'foo', 'range' => { 'start' => 1, 'end' => 2, 'interval' => 1 }})
308
+ assert_raise { Bigquery.configure(config, schema, processor_count) }
309
+ end
310
+
273
311
  def test_clustering
274
312
  config = least_config.merge('clustering' => {'fields' => ['field_a']})
275
313
  assert_nothing_raised { Bigquery.configure(config, schema, processor_count) }
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.4
4
+ version: 0.7.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
8
8
  - Naotoshi Seo
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2024-12-19 00:00:00.000000000 Z
12
+ date: 2025-05-14 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: google-apis-storage_v1
@@ -147,7 +147,7 @@ homepage: https://github.com/embulk/embulk-output-bigquery
147
147
  licenses:
148
148
  - MIT
149
149
  metadata: {}
150
- post_install_message:
150
+ post_install_message:
151
151
  rdoc_options: []
152
152
  require_paths:
153
153
  - lib
@@ -163,7 +163,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
163
163
  version: '0'
164
164
  requirements: []
165
165
  rubygems_version: 3.5.3
166
- signing_key:
166
+ signing_key:
167
167
  specification_version: 4
168
168
  summary: Google BigQuery output plugin for Embulk
169
169
  test_files: