embulk-output-bigquery 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +13 -24
- data/embulk-output-bigquery.gemspec +3 -2
- data/lib/embulk/output/bigquery/bigquery_client.rb +5 -4
- data/lib/embulk/output/bigquery/value_converter_factory.rb +6 -31
- data/test/test_file_writer.rb +3 -3
- data/test/test_value_converter_factory.rb +11 -10
- metadata +19 -19
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e8b074a351f22417a10571e1a9aa60a1bc82df0d
|
4
|
+
data.tar.gz: c50a49b3b99f5cab88e023af6d95b39990c40d89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 52e5a630d3173d2baec83dd03fbf0e3e4cb7d46aeb870e192ce31c6c8178534cade8975ec586e2a07bc1c213d616505ed52b5ec154cabdc19669317f8ba673b3
|
7
|
+
data.tar.gz: f9b15c9ff54a64626b33ce123a11ac80a7984535eb9fc47c042c9647b3d1728bd6f5f4f3157c9fcbac9163b097f83de5de7e3db2fb443a93aa4bdf32bd9d7fd5
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -102,8 +102,8 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
|
|
102
102
|
| allow_quoted_newlines | boolean | optional | false | Set true, if data contains newline characters. It may cause slow processing |
|
103
103
|
| time_partitioning | hash | optional | `{"type":"DAY"}` if `table` parameter has a partition decorator, otherwise nil | See [Time Partitioning](#time-partitioning) |
|
104
104
|
| time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
|
105
|
-
| time_partitioning.
|
106
|
-
| schema_update_options | array | optional | nil | List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) |
|
105
|
+
| time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
|
106
|
+
| schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` job, that is, is not effective for most of modes such as `append`, `append_direct`, `replace`, `replace_backup` (except `delete_in_advance`) |
|
107
107
|
|
108
108
|
### Example
|
109
109
|
|
@@ -127,24 +127,25 @@ out:
|
|
127
127
|
|
128
128
|
##### append
|
129
129
|
|
130
|
-
1. Load to temporary table
|
130
|
+
1. Load to temporary table (Create and WRITE_APPEND in parallel)
|
131
131
|
2. Copy temporary table to destination table (or partition). (WRITE_APPEND)
|
132
132
|
|
133
133
|
##### append_direct
|
134
134
|
|
135
|
-
Insert data into existing table (or partition) directly.
|
135
|
+
1. Insert data into existing table (or partition) directly. (WRITE_APPEND in parallel)
|
136
|
+
|
136
137
|
This is not transactional, i.e., if it fails, the target table could have some rows inserted.
|
137
138
|
|
138
139
|
##### replace
|
139
140
|
|
140
|
-
1. Load to temporary table
|
141
|
+
1. Load to temporary table (Create and WRITE_APPEND in parallel)
|
141
142
|
2. Copy temporary table to destination table (or partition). (WRITE_TRUNCATE)
|
142
143
|
|
143
144
|
```is_skip_job_result_check``` must be false when replace mode
|
144
145
|
|
145
146
|
##### replace_backup
|
146
147
|
|
147
|
-
1. Load to temporary table
|
148
|
+
1. Load to temporary table (Create and WRITE_APPEND in parallel)
|
148
149
|
2. Copy destination table (or partition) to backup table (or partition). (dataset_old, table_old)
|
149
150
|
3. Copy temporary table to destination table (or partition). (WRITE_TRUNCATE)
|
150
151
|
|
@@ -316,7 +317,7 @@ Therefore, it is recommended to format records with filter plugins written in Ja
|
|
316
317
|
filters:
|
317
318
|
- type: to_json
|
318
319
|
column: {name: payload, type: string}
|
319
|
-
default_format: %Y-%m-%d %H:%M:%S.%6N
|
320
|
+
default_format: "%Y-%m-%d %H:%M:%S.%6N"
|
320
321
|
out:
|
321
322
|
type: bigquery
|
322
323
|
payload_column_index: 0 # or, payload_column: payload
|
@@ -397,24 +398,12 @@ out:
|
|
397
398
|
expiration_ms: 259200000
|
398
399
|
```
|
399
400
|
|
400
|
-
Use
|
401
|
-
|
402
|
-
```yaml
|
403
|
-
out:
|
404
|
-
type: bigquery
|
405
|
-
table: table_name$20160929
|
406
|
-
auto_create_table: true
|
407
|
-
time_partitioning:
|
408
|
-
type: DAY
|
409
|
-
expiration_ms: 259200000
|
410
|
-
schema_update_options:
|
411
|
-
- ALLOW_FIELD_ADDITION
|
412
|
-
- ALLOW_FIELD_RELAXATION
|
413
|
-
```
|
401
|
+
Use the [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table; embulk-output-bigquery itself does not support it, though.
|
402
|
+
Note that only adding a new column, and relaxing non-necessary columns to be `NULLABLE` are supported now. Deleting columns, and renaming columns are not supported.
|
414
403
|
|
415
|
-
|
416
|
-
|
417
|
-
|
404
|
+
MEMO: [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) is available
|
405
|
+
to update the schema of the destination table as a side effect of the load job, but it is not available for copy job.
|
406
|
+
Thus, it was not suitable for embulk-output-bigquery idempotence modes, `append`, `replace`, and `replace_backup`, sigh.
|
418
407
|
|
419
408
|
## Development
|
420
409
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.4.
|
3
|
+
spec.version = "0.4.3"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
@@ -13,7 +13,8 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.require_paths = ["lib"]
|
14
14
|
|
15
15
|
spec.add_dependency 'google-api-client'
|
16
|
-
spec.add_dependency
|
16
|
+
spec.add_dependency 'time_with_zone'
|
17
|
+
|
17
18
|
spec.add_development_dependency 'embulk', ['>= 0.8.2']
|
18
19
|
spec.add_development_dependency 'bundler', ['>= 1.10.6']
|
19
20
|
spec.add_development_dependency 'rake', ['>= 10.0']
|
@@ -104,6 +104,11 @@ module Embulk
|
|
104
104
|
}
|
105
105
|
}
|
106
106
|
}
|
107
|
+
|
108
|
+
if @task['schema_update_options']
|
109
|
+
body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
|
110
|
+
end
|
111
|
+
|
107
112
|
opts = {}
|
108
113
|
|
109
114
|
Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
|
@@ -258,10 +263,6 @@ module Embulk
|
|
258
263
|
}
|
259
264
|
}
|
260
265
|
|
261
|
-
if @task['schema_update_options']
|
262
|
-
body[:configuration][:copy][:schema_update_options] = @task['schema_update_options']
|
263
|
-
end
|
264
|
-
|
265
266
|
opts = {}
|
266
267
|
Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
|
267
268
|
response = with_network_retry { client.insert_job(@project, body, opts) }
|
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'time'
|
2
|
-
require '
|
2
|
+
require 'time_with_zone'
|
3
3
|
require 'json'
|
4
4
|
require_relative 'helper'
|
5
5
|
|
@@ -23,8 +23,8 @@ module Embulk
|
|
23
23
|
# @return [Array] an array whose key is column_index, and value is its converter (Proc)
|
24
24
|
def self.create_converters(task, schema)
|
25
25
|
column_options_map = Helper.column_options_map(task['column_options'])
|
26
|
-
default_timestamp_format = task['default_timestamp_format']
|
27
|
-
default_timezone = task['default_timezone']
|
26
|
+
default_timestamp_format = task['default_timestamp_format'] || DEFAULT_TIMESTAMP_FORMAT
|
27
|
+
default_timezone = task['default_timezone'] || DEFAULT_TIMEZONE
|
28
28
|
schema.map do |column|
|
29
29
|
column_name = column[:name]
|
30
30
|
embulk_type = column[:type]
|
@@ -53,7 +53,7 @@ module Embulk
|
|
53
53
|
@timestamp_format = timestamp_format
|
54
54
|
@default_timestamp_format = default_timestamp_format
|
55
55
|
@timezone = timezone || default_timezone
|
56
|
-
@zone_offset =
|
56
|
+
@zone_offset = TimeWithZone.zone_offset(@timezone)
|
57
57
|
@strict = strict.nil? ? true : strict
|
58
58
|
end
|
59
59
|
|
@@ -194,7 +194,7 @@ module Embulk
|
|
194
194
|
Proc.new {|val|
|
195
195
|
next nil if val.nil?
|
196
196
|
with_typecast_error(val) do |val|
|
197
|
-
|
197
|
+
TimeWithZone.set_zone_offset(Time.strptime(val, @timestamp_format), zone_offset).strftime("%Y-%m-%d %H:%M:%S.%6N %:z")
|
198
198
|
end
|
199
199
|
}
|
200
200
|
else
|
@@ -238,7 +238,7 @@ module Embulk
|
|
238
238
|
when 'TIMESTAMP'
|
239
239
|
Proc.new {|val|
|
240
240
|
next nil if val.nil?
|
241
|
-
val.
|
241
|
+
val.strftime("%Y-%m-%d %H:%M:%S.%6N %:z")
|
242
242
|
}
|
243
243
|
else
|
244
244
|
raise NotSupportedType, "cannot take column type #{type} for timestamp column"
|
@@ -261,31 +261,6 @@ module Embulk
|
|
261
261
|
raise NotSupportedType, "cannot take column type #{type} for json column"
|
262
262
|
end
|
263
263
|
end
|
264
|
-
|
265
|
-
private
|
266
|
-
|
267
|
-
# [+-]HH:MM, [+-]HHMM, [+-]HH
|
268
|
-
NUMERIC_PATTERN = %r{\A[+-]\d\d(:?\d\d)?\z}
|
269
|
-
|
270
|
-
# Region/Zone, Region/Zone/Zone
|
271
|
-
NAME_PATTERN = %r{\A[^/]+/[^/]+(/[^/]+)?\z}
|
272
|
-
|
273
|
-
def strptime_with_zone(date, timestamp_format, zone_offset)
|
274
|
-
time = Time.strptime(date, timestamp_format)
|
275
|
-
utc_offset = time.utc_offset
|
276
|
-
time.localtime(zone_offset) + utc_offset - zone_offset
|
277
|
-
end
|
278
|
-
|
279
|
-
def get_zone_offset(timezone)
|
280
|
-
if NUMERIC_PATTERN === timezone
|
281
|
-
Time.zone_offset(timezone)
|
282
|
-
elsif NAME_PATTERN === timezone || 'UTC' == timezone
|
283
|
-
tz = TZInfo::Timezone.get(timezone)
|
284
|
-
tz.period_for_utc(Time.now).utc_total_offset
|
285
|
-
else
|
286
|
-
raise ArgumentError, "timezone format is invalid: #{timezone}"
|
287
|
-
end
|
288
|
-
end
|
289
264
|
end
|
290
265
|
end
|
291
266
|
end
|
data/test/test_file_writer.rb
CHANGED
@@ -43,7 +43,7 @@ module Embulk
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def record
|
46
|
-
[true, 1, 1.1, 'foo', Time.parse("2016-02-26 00:00:00 +
|
46
|
+
[true, 1, 1.1, 'foo', Time.parse("2016-02-26 00:00:00 +00:00").utc, {"foo"=>"foo"}]
|
47
47
|
end
|
48
48
|
|
49
49
|
def page
|
@@ -81,7 +81,7 @@ module Embulk
|
|
81
81
|
formatter_proc = file_writer.instance_variable_get(:@formatter_proc)
|
82
82
|
assert_equal :to_csv, formatter_proc.name
|
83
83
|
|
84
|
-
expected = %Q[true,1,1.1,foo,
|
84
|
+
expected = %Q[true,1,1.1,foo,2016-02-26 00:00:00.000000 +00:00,"{""foo"":""foo""}"\n]
|
85
85
|
assert_equal expected, formatter_proc.call(record)
|
86
86
|
end
|
87
87
|
|
@@ -91,7 +91,7 @@ module Embulk
|
|
91
91
|
formatter_proc = file_writer.instance_variable_get(:@formatter_proc)
|
92
92
|
assert_equal :to_jsonl, formatter_proc.name
|
93
93
|
|
94
|
-
expected = %Q[{"boolean":true,"long":1,"double":1.1,"string":"foo","timestamp":
|
94
|
+
expected = %Q[{"boolean":true,"long":1,"double":1.1,"string":"foo","timestamp":"2016-02-26 00:00:00.000000 +00:00","json":"{\\"foo\\":\\"foo\\"}"}\n]
|
95
95
|
assert_equal expected, formatter_proc.call(record)
|
96
96
|
end
|
97
97
|
end
|
@@ -23,8 +23,8 @@ module Embulk
|
|
23
23
|
assert_equal 1, converters[1].call(1)
|
24
24
|
assert_equal 1.1, converters[2].call(1.1)
|
25
25
|
assert_equal 'foo', converters[3].call('foo')
|
26
|
-
timestamp = Time.parse("2016-02-26 00:00:00.
|
27
|
-
assert_equal
|
26
|
+
timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00")
|
27
|
+
assert_equal "2016-02-26 00:00:00.500000 +00:00", converters[4].call(timestamp)
|
28
28
|
assert_equal %Q[{"foo":"foo"}], converters[5].call({'foo'=>'foo'})
|
29
29
|
end
|
30
30
|
|
@@ -55,7 +55,7 @@ module Embulk
|
|
55
55
|
assert_equal '1', converters[1].call(1)
|
56
56
|
assert_equal '1.1', converters[2].call(1.1)
|
57
57
|
assert_equal 1, converters[3].call('1')
|
58
|
-
timestamp = Time.parse("2016-02-26 00:00:00.100000
|
58
|
+
timestamp = Time.parse("2016-02-26 00:00:00.100000 +00:00")
|
59
59
|
assert_equal 1456444800, converters[4].call(timestamp)
|
60
60
|
assert_equal({'foo'=>'foo'}, converters[5].call({'foo'=>'foo'}))
|
61
61
|
end
|
@@ -208,7 +208,7 @@ module Embulk
|
|
208
208
|
timestamp_format: '%Y-%m-%d', timezone: 'Asia/Tokyo'
|
209
209
|
).create_converter
|
210
210
|
assert_equal nil, converter.call(nil)
|
211
|
-
assert_equal
|
211
|
+
assert_equal "2016-02-26 00:00:00.000000 +09:00", converter.call("2016-02-26")
|
212
212
|
|
213
213
|
# Users must take care of BQ timestamp format by themselves with no timestamp_format
|
214
214
|
converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter
|
@@ -240,22 +240,22 @@ module Embulk
|
|
240
240
|
def test_float
|
241
241
|
converter = ValueConverterFactory.new(SCHEMA_TYPE, 'FLOAT').create_converter
|
242
242
|
assert_equal nil, converter.call(nil)
|
243
|
-
expected = 1456444800.
|
243
|
+
expected = 1456444800.500000
|
244
244
|
assert_equal expected, converter.call(Time.at(expected))
|
245
245
|
end
|
246
246
|
|
247
247
|
def test_string
|
248
248
|
converter = ValueConverterFactory.new(SCHEMA_TYPE, 'STRING').create_converter
|
249
249
|
assert_equal nil, converter.call(nil)
|
250
|
-
timestamp = Time.parse("2016-02-26 00:00:00.
|
251
|
-
expected = "2016-02-26 00:00:00.
|
250
|
+
timestamp = Time.parse("2016-02-26 00:00:00.500000 +00:00")
|
251
|
+
expected = "2016-02-26 00:00:00.500000"
|
252
252
|
assert_equal expected, converter.call(timestamp)
|
253
253
|
|
254
254
|
converter = ValueConverterFactory.new(
|
255
255
|
SCHEMA_TYPE, 'STRING',
|
256
256
|
timestamp_format: '%Y-%m-%d', timezone: 'Asia/Tokyo'
|
257
257
|
).create_converter
|
258
|
-
timestamp = Time.parse("2016-02-25 15:00:00.
|
258
|
+
timestamp = Time.parse("2016-02-25 15:00:00.500000 +00:00")
|
259
259
|
expected = "2016-02-26"
|
260
260
|
assert_equal expected, converter.call(timestamp)
|
261
261
|
end
|
@@ -263,8 +263,9 @@ module Embulk
|
|
263
263
|
def test_timestamp
|
264
264
|
converter = ValueConverterFactory.new(SCHEMA_TYPE, 'TIMESTAMP').create_converter
|
265
265
|
assert_equal nil, converter.call(nil)
|
266
|
-
|
267
|
-
|
266
|
+
subject = 1456444800.500000
|
267
|
+
expected = "2016-02-26 00:00:00.500000 +00:00"
|
268
|
+
assert_equal expected, converter.call(Time.at(subject).utc)
|
268
269
|
end
|
269
270
|
|
270
271
|
def test_record
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
@@ -9,78 +9,78 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2017-02-11 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name: google-api-client
|
16
|
-
version_requirements: !ruby/object:Gem::Requirement
|
17
|
-
requirements:
|
18
|
-
- - ">="
|
19
|
-
- !ruby/object:Gem::Version
|
20
|
-
version: '0'
|
21
15
|
requirement: !ruby/object:Gem::Requirement
|
22
16
|
requirements:
|
23
17
|
- - ">="
|
24
18
|
- !ruby/object:Gem::Version
|
25
19
|
version: '0'
|
20
|
+
name: google-api-client
|
26
21
|
prerelease: false
|
27
22
|
type: :runtime
|
28
|
-
- !ruby/object:Gem::Dependency
|
29
|
-
name: tzinfo
|
30
23
|
version_requirements: !ruby/object:Gem::Requirement
|
31
24
|
requirements:
|
32
25
|
- - ">="
|
33
26
|
- !ruby/object:Gem::Version
|
34
27
|
version: '0'
|
28
|
+
- !ruby/object:Gem::Dependency
|
35
29
|
requirement: !ruby/object:Gem::Requirement
|
36
30
|
requirements:
|
37
31
|
- - ">="
|
38
32
|
- !ruby/object:Gem::Version
|
39
33
|
version: '0'
|
34
|
+
name: time_with_zone
|
40
35
|
prerelease: false
|
41
36
|
type: :runtime
|
42
|
-
- !ruby/object:Gem::Dependency
|
43
|
-
name: embulk
|
44
37
|
version_requirements: !ruby/object:Gem::Requirement
|
45
38
|
requirements:
|
46
39
|
- - ">="
|
47
40
|
- !ruby/object:Gem::Version
|
48
|
-
version: 0
|
41
|
+
version: '0'
|
42
|
+
- !ruby/object:Gem::Dependency
|
49
43
|
requirement: !ruby/object:Gem::Requirement
|
50
44
|
requirements:
|
51
45
|
- - ">="
|
52
46
|
- !ruby/object:Gem::Version
|
53
47
|
version: 0.8.2
|
48
|
+
name: embulk
|
54
49
|
prerelease: false
|
55
50
|
type: :development
|
56
|
-
- !ruby/object:Gem::Dependency
|
57
|
-
name: bundler
|
58
51
|
version_requirements: !ruby/object:Gem::Requirement
|
59
52
|
requirements:
|
60
53
|
- - ">="
|
61
54
|
- !ruby/object:Gem::Version
|
62
|
-
version:
|
55
|
+
version: 0.8.2
|
56
|
+
- !ruby/object:Gem::Dependency
|
63
57
|
requirement: !ruby/object:Gem::Requirement
|
64
58
|
requirements:
|
65
59
|
- - ">="
|
66
60
|
- !ruby/object:Gem::Version
|
67
61
|
version: 1.10.6
|
62
|
+
name: bundler
|
68
63
|
prerelease: false
|
69
64
|
type: :development
|
70
|
-
- !ruby/object:Gem::Dependency
|
71
|
-
name: rake
|
72
65
|
version_requirements: !ruby/object:Gem::Requirement
|
73
66
|
requirements:
|
74
67
|
- - ">="
|
75
68
|
- !ruby/object:Gem::Version
|
76
|
-
version:
|
69
|
+
version: 1.10.6
|
70
|
+
- !ruby/object:Gem::Dependency
|
77
71
|
requirement: !ruby/object:Gem::Requirement
|
78
72
|
requirements:
|
79
73
|
- - ">="
|
80
74
|
- !ruby/object:Gem::Version
|
81
75
|
version: '10.0'
|
76
|
+
name: rake
|
82
77
|
prerelease: false
|
83
78
|
type: :development
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - ">="
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '10.0'
|
84
84
|
description: Embulk plugin that insert records to Google BigQuery.
|
85
85
|
email:
|
86
86
|
- satoshiakama@gmail.com
|