embulk-output-bigquery 0.4.14 → 0.6.3
- checksums.yaml +5 -5
- data/CHANGELOG.md +28 -0
- data/README.md +74 -77
- data/embulk-output-bigquery.gemspec +10 -3
- data/lib/embulk/output/bigquery.rb +19 -49
- data/lib/embulk/output/bigquery/auth.rb +35 -0
- data/lib/embulk/output/bigquery/bigquery_client.rb +2 -11
- data/lib/embulk/output/bigquery/google_client.rb +3 -34
- data/lib/embulk/output/bigquery/value_converter_factory.rb +12 -0
- data/test/test_bigquery_client.rb +1 -5
- data/test/test_configure.rb +10 -19
- data/test/test_example.rb +0 -1
- data/test/test_helper.rb +4 -1
- data/test/test_transaction.rb +22 -62
- data/test/test_value_converter_factory.rb +42 -0
- metadata +29 -52
- data/example/config_append_direct_schema_update_options.yml +0 -31
- data/example/config_client_options.yml +0 -33
- data/example/config_csv.yml +0 -30
- data/example/config_delete_in_advance.yml +0 -29
- data/example/config_delete_in_advance_field_partitioned_table.yml +0 -33
- data/example/config_delete_in_advance_partitioned_table.yml +0 -33
- data/example/config_expose_errors.yml +0 -30
- data/example/config_gcs.yml +0 -32
- data/example/config_guess_from_embulk_schema.yml +0 -29
- data/example/config_guess_with_column_options.yml +0 -40
- data/example/config_gzip.yml +0 -1
- data/example/config_jsonl.yml +0 -1
- data/example/config_max_threads.yml +0 -34
- data/example/config_min_ouput_tasks.yml +0 -34
- data/example/config_mode_append.yml +0 -30
- data/example/config_mode_append_direct.yml +0 -30
- data/example/config_nested_record.yml +0 -1
- data/example/config_payload_column.yml +0 -20
- data/example/config_payload_column_index.yml +0 -20
- data/example/config_prevent_duplicate_insert.yml +0 -30
- data/example/config_progress_log_interval.yml +0 -31
- data/example/config_replace.yml +0 -30
- data/example/config_replace_backup.yml +0 -32
- data/example/config_replace_backup_field_partitioned_table.yml +0 -34
- data/example/config_replace_backup_partitioned_table.yml +0 -34
- data/example/config_replace_field_partitioned_table.yml +0 -33
- data/example/config_replace_partitioned_table.yml +0 -33
- data/example/config_replace_schema_update_options.yml +0 -33
- data/example/config_skip_file_generation.yml +0 -32
- data/example/config_table_strftime.yml +0 -30
- data/example/config_template_table.yml +0 -21
- data/example/config_uncompressed.yml +0 -1
- data/example/config_with_rehearsal.yml +0 -33
- data/example/example.csv +0 -17
- data/example/example.yml +0 -1
- data/example/example2_1.csv +0 -1
- data/example/example2_2.csv +0 -1
- data/example/example4_1.csv +0 -1
- data/example/example4_2.csv +0 -1
- data/example/example4_3.csv +0 -1
- data/example/example4_4.csv +0 -1
- data/example/json_key.json +0 -12
- data/example/nested_example.jsonl +0 -16
- data/example/schema.json +0 -30
- data/example/schema_expose_errors.json +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-
-  metadata.gz:
-  data.tar.gz:
+SHA1:
+  metadata.gz: 8b3d7d7d675d8428946f81517d1002f667f4fafe
+  data.tar.gz: 25940b93f70492675869d3c4dd50f83f8b7347cf
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 97a2aff66c765f24289717ac79e0a25a6bf31ee3ec5b84b64c96e8573382b31b0a27c30f06692a296b3bfedd70ea9f34f1a451cea7de27d3fa4c61a7502bab98
+  data.tar.gz: b795d47af337e109dfafb9f41a0a720d0eb314c7ba7219193648505ec9dffa3874215b5d311256f625228a4f3e52b73153ee3d694a3d2f88d4c2fd0dd24960b1
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,31 @@
+## 0.6.3 - 2019-10-28
+
+* [enhancement] Add DATE type converter (thanks to @tksfjt1024)
+
+## 0.6.2 - 2019-10-16
+
+* [maintenance] Lock signet and google-api-client versions (thanks to @hiroyuki-sato)
+
+## 0.6.1 - 2019-08-28
+
+* [maintenance] Release a new gem without symlinks so that it works on Windows.
+
+## 0.6.0 - 2019-08-11
+
+Cleanup `auth_method`:
+
+* [enhancement] Support `auth_method: authorized_user` (OAuth)
+* [incompatibility change] Rename `auth_method: json_key` to `auth_method: service_account` (`json_key` is kept for backward compatibility)
+* [incompatibility change] Remove deprecated `auth_method: private_key` (p12 key)
+* [incompatibility change] Change the default `auth_method` to `application_default` from `private_key` because `private_key` was dropped.
+
+## 0.5.0 - 2019-08-10
+
+* [incompatibility change] Drop deprecated `time_partitioning`.`require_partition_filter`
+* [incompatibility change] Drop `prevent_duplicate_insert`, which has no use-case now
+* [incompatibility change] Modes `replace`, `replace_backup`, `append`, and `delete_in_advance` now require `auto_create_table: true` because, previously, these modes created a target table even with `auto_create_table: false`, which confused users. Note that `auto_create_table: true` is always required even for a partition (a table name with a partition decorator), which may not require creating a table. This keeps the logic and implementation simple.
+* [incompatibility change] Change the default value of `auto_create_table` to `true` because the above four modes (that is, all modes except `append_direct`) now always require `auto_create_table: true`.
+
 ## 0.4.14 - 2019-08-10
 
 * [enhancement] Support field partitioning correctly.
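To make the 0.6.0 `auth_method` cleanup above concrete, a config that previously used `auth_method: json_key` only needs the value renamed; this is a minimal sketch (the keyfile path, dataset, and table names are placeholders):

```yaml
out:
  type: bigquery
  auth_method: service_account   # formerly json_key; the old name is still accepted
  json_keyfile: /path/to/json_keyfile.json
  dataset: your_dataset_name
  table: your_table_name
```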
data/README.md
CHANGED
@@ -23,34 +23,23 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 
-### INCOMPATIBILITY CHANGES
-
-v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
-
-* `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead. (it already exists in v0.2.x too)
-* `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
-* `mode: append` mode now expresses a transactional append, and `mode: append_direct` is one which is not transactional.
-
 ## Configuration
 
 #### Original options
 
 | name | type | required? | default | description |
 |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------|
-| mode | string | optional | "append" | See [Mode](#mode)
-| auth_method | string | optional | "
-
-
-| json_keyfile | string | required when auth_method is json_key | | Fullpath of json key |
-| project | string | required if json_keyfile is not given | | project_id |
+| mode | string | optional | "append" | See [Mode](#mode) |
+| auth_method | string | optional | "application\_default" | See [Authentication](#authentication) |
+| json_keyfile | string | optional | | keyfile path or `content` |
+| project | string | required unless service\_account's `json_keyfile` is given. | | project\_id |
 | dataset | string | required | | dataset |
 | location | string | optional | nil | geographic location of dataset. See [Location](#location) |
 | table | string | required | | table name, or table name with a partition decorator such as `table_name$20160929`|
 | auto_create_dataset | boolean | optional | false | automatically create dataset |
-| auto_create_table | boolean | optional |
+| auto_create_table | boolean | optional | true | `false` is available only for `append_direct` mode. Other modes require `true`. See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
 | schema_file | string | optional | | /path/to/schema.json |
 | template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) |
-| prevent_duplicate_insert | boolean | optional | false | See [Prevent Duplication](#prevent-duplication) |
 | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
 | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
 | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
@@ -107,7 +96,6 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
 | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
 | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
 | time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning |
-| time_partitioning.require_partition_filter | boolean | optional | nil | If true, valid partition filter is required when query |
 | clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so must be used with `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
 | clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
 | schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` job, that is, is not effective for most of modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes origin table so does not need to update schema. Only `append_direct` can utilize schema update. |
@@ -118,9 +106,8 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
 out:
   type: bigquery
   mode: append
-  auth_method:
-
-  p12_keyfile: /path/to/p12_keyfile.p12
+  auth_method: service_account
+  json_keyfile: /path/to/json_keyfile.json
   project: your-project-000
   dataset: your_dataset_name
   table: your_table_name
@@ -128,7 +115,7 @@ out:
   source_format: NEWLINE_DELIMITED_JSON
 ```
 
-###
+### Location
 
 The geographic location of the dataset. Required except for US and EU.
 
@@ -136,7 +123,7 @@ GCS bucket should be in same region when you use `gcs_bucket`.
 
 See also [Dataset Locations | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/dataset-locations)
 
-###
+### Mode
 
 5 modes are provided.
 
@@ -175,53 +162,69 @@ NOTE: BigQuery does not support replacing (actually, copying into) a non-partiti
 
 ### Authentication
 
-There are
+There are four authentication methods
+
+1. `service_account` (or `json_key` for backward compatibility)
+1. `authorized_user`
+1. `compute_engine`
+1. `application_default`
+
+#### service\_account (or json\_key)
 
-
-
-3. Pre-defined access token (Google Compute Engine only)
+Use GCP service account credentials.
+You first need to create a service account, download its json key and deploy the key with embulk.
 
-
+```yaml
+out:
+  type: bigquery
+  auth_method: service_account
+  json_keyfile: /path/to/json_keyfile.json
+```
 
-You
-download its private key and deploy the key with embulk.
+You can also embed contents of `json_keyfile` at config.yml.
 
 ```yaml
 out:
   type: bigquery
-  auth_method:
-
-
+  auth_method: service_account
+  json_keyfile:
+    content: |
+      {
+          "private_key_id": "123456789",
+          "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
+          "client_email": "..."
+      }
 ```
 
-####
+#### authorized\_user
 
-
-
+Use Google user credentials.
+You can get your credentials at `~/.config/gcloud/application_default_credentials.json` by running `gcloud auth login`.
 
 ```yaml
 out:
   type: bigquery
-  auth_method:
-  json_keyfile: /path/to/
+  auth_method: authorized_user
+  json_keyfile: /path/to/credentials.json
 ```
 
-You can also embed contents of json_keyfile at config.yml.
+You can also embed contents of `json_keyfile` at config.yml.
 
 ```yaml
 out:
   type: bigquery
-  auth_method:
+  auth_method: authorized_user
   json_keyfile:
     content: |
       {
-
-
-
-
+          "client_id":"xxxxxxxxxxx.apps.googleusercontent.com",
+          "client_secret":"xxxxxxxxxxx",
+          "refresh_token":"xxxxxxxxxxx",
+          "type":"authorized_user"
+      }
 ```
 
-####
+#### compute\_engine
 
 On the other hand, you don't need to explicitly create a service account for embulk when you
 run embulk in Google Compute Engine. In this third authentication method, you need to
@@ -234,6 +237,22 @@ out:
   auth_method: compute_engine
 ```
 
+#### application\_default
+
+Use Application Default Credentials (ADC). ADC is a strategy to locate Google Cloud Service Account credentials.
+
+1. ADC checks to see if the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set. If the variable is set, ADC uses the service account file that the variable points to.
+2. ADC checks to see if `~/.config/gcloud/application_default_credentials.json` is located. This file is created by running `gcloud auth application-default login`.
+3. Use the default service account for credentials if the application is running on Compute Engine, App Engine, Kubernetes Engine, Cloud Functions or Cloud Run.
+
+See https://cloud.google.com/docs/authentication/production for details.
+
+```yaml
+out:
+  type: bigquery
+  auth_method: application_default
+```
+
 ### Table id formatting
 
 `table` and option accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
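As a concrete sketch of the first ADC lookup step described above (the key path is hypothetical), pointing `GOOGLE_APPLICATION_CREDENTIALS` at a service account key before running embulk is sufficient:

```yaml
# export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service_account.json
out:
  type: bigquery
  auth_method: application_default
```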
@@ -242,21 +261,16 @@ Table ids are formatted at runtime
 using the local time of the embulk server.
 
 For example, with the configuration below,
-data is inserted into tables `
+data is inserted into tables `table_20150503`, `table_20150504` and so on.
 
 ```yaml
 out:
   type: bigquery
-  table: table_%
+  table: table_%Y%m%d
 ```
 
 ### Dynamic table creating
 
-This plugin tries to create a table using BigQuery API when
-
-* mode is either of `delete_in_advance`, `replace`, `replace_backup`, `append`.
-* mode is `append_direct` and `auto_create_table` is true.
-
 There are 3 ways to set schema.
 
 #### Set schema.json
@@ -267,7 +281,7 @@ Please set file path of schema.json.
 out:
   type: bigquery
   auto_create_table: true
-  table: table_%
+  table: table_%Y%m%d
   schema_file: /path/to/schema.json
 ```
 
@@ -279,7 +293,7 @@ Plugin will try to read schema from existing table and use it as schema template
 out:
   type: bigquery
   auto_create_table: true
-  table: table_%
+  table: table_%Y%m%d
   template_table: existing_table_name
 ```
 
@@ -293,17 +307,17 @@ Column options are used to aid guessing BigQuery schema, or to define conversion
 
 - **column_options**: advanced: an array of options for columns
   - **name**: column name
-  - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, and `RECORD`. See belows for supported conversion type.
+  - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE`, and `RECORD`. See below for supported conversion types.
     - boolean: `BOOLEAN`, `STRING` (default: `BOOLEAN`)
     - long: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `INTEGER`)
     - double: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `FLOAT`)
-    - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `RECORD` (default: `STRING`)
-    - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `TIMESTAMP`)
+    - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE`, `RECORD` (default: `STRING`)
+    - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE` (default: `TIMESTAMP`)
     - json: `STRING`, `RECORD` (default: `STRING`)
   - **mode**: BigQuery mode such as `NULLABLE`, `REQUIRED`, and `REPEATED` (string, default: `NULLABLE`)
   - **fields**: Describes the nested schema fields if the type property is set to RECORD. Please note that this is **required** for `RECORD` column.
   - **timestamp_format**: timestamp format to convert into/from `timestamp` (string, default is `default_timestamp_format`)
-  - **timezone**: timezone to convert into/from `timestamp` (string, default is `default_timezone`).
+  - **timezone**: timezone to convert into/from `timestamp`, `date` (string, default is `default_timezone`).
 - **default_timestamp_format**: default timestamp format for column_options (string, default is "%Y-%m-%d %H:%M:%S.%6N")
 - **default_timezone**: default timezone for column_options (string, default is "UTC")
 
@@ -355,22 +369,6 @@ out:
   payload_column_index: 0 # or, payload_column: payload
 ```
 
-### Prevent Duplication
-
-`prevent_duplicate_insert` option is used to prevent inserting same data for modes `append` or `append_direct`.
-
-When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options.
-
-`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
-
-[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency) so that same data can't be inserted with same settings repeatedly.
-
-```yaml
-out:
-  type: bigquery
-  prevent_duplicate_insert: true
-```
-
 ### GCS Bucket
 
 This is useful to reduce number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs).
@@ -401,32 +399,31 @@ To load into a partition, specify `table` parameter with a partition decorator a
 out:
   type: bigquery
   table: table_name$20160929
-  auto_create_table: true
 ```
 
-You may configure `time_partitioning` parameter together
+You may configure `time_partitioning` parameter together as:
 
 ```yaml
 out:
   type: bigquery
   table: table_name$20160929
-  auto_create_table: true
   time_partitioning:
     type: DAY
     expiration_ms: 259200000
 ```
 
 You can also create column-based partitioning table as:
+
 ```yaml
 out:
   type: bigquery
   mode: replace
-  auto_create_table: true
   table: table_name
   time_partitioning:
     type: DAY
     field: timestamp
 ```
+
 Note the `time_partitioning.field` should be top-level `DATE` or `TIMESTAMP`.
 
 Use [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table, embulk-output-bigquery itself does not support it, though.
data/embulk-output-bigquery.gemspec
CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.4.14"
+  spec.version = "0.6.3"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -8,11 +8,18 @@ Gem::Specification.new do |spec|
   spec.licenses = ["MIT"]
   spec.homepage = "https://github.com/embulk/embulk-output-bigquery"
 
-
+  # Exclude example directory which uses symlinks from generating gem.
+  # Symlinks do not work properly on the Windows platform without administrator privilege.
+  spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] - Dir["example/*" ]
   spec.test_files = spec.files.grep(%r{^(test|spec)/})
   spec.require_paths = ["lib"]
 
-
+  # TODO
+  # signet 0.12.0 and google-api-client 0.33.0 require >= Ruby 2.4.
+  # Embulk 0.9 uses JRuby 9.1.X.Y, which is compatible with Ruby 2.3.
+  # So, force installing signet < 0.12 and google-api-client < 0.33.0.
+  spec.add_dependency 'signet', '~> 0.7', '< 0.12.0'
+  spec.add_dependency 'google-api-client','< 0.33.0'
   spec.add_dependency 'time_with_zone'
 
   spec.add_development_dependency 'bundler', ['>= 1.10.6']
data/lib/embulk/output/bigquery.rb
CHANGED
@@ -23,7 +23,7 @@ module Embulk
       # @return JSON string
       def self.load(v)
         if v.is_a?(String) # path
-          File.read(v)
+          File.read(File.expand_path(v))
        elsif v.is_a?(Hash)
          v['content']
        end
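One practical effect of the `File.expand_path` change above, as a sketch (the path is hypothetical): a `json_keyfile` given with a `~`-prefixed path is now expanded to the user's home directory before being read, where a plain `File.read` would have failed on the literal `~`.

```yaml
out:
  type: bigquery
  auth_method: service_account
  json_keyfile: ~/secrets/bigquery_service_account.json   # "~" is now expanded before File.read
```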
@@ -33,9 +33,7 @@ module Embulk
       def self.configure(config, schema, task_count)
         task = {
           'mode' => config.param('mode', :string, :default => 'append'),
-          'auth_method' => config.param('auth_method', :string, :default => 'private_key'),
-          'service_account_email' => config.param('service_account_email', :string, :default => nil),
-          'p12_keyfile' => config.param('p12_keyfile', :string, :default => nil),
+          'auth_method' => config.param('auth_method', :string, :default => 'application_default'),
           'json_keyfile' => config.param('json_keyfile', LocalFile, :default => nil),
           'project' => config.param('project', :string, :default => nil),
           'dataset' => config.param('dataset', :string),
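Because the default shown above changed to `application_default`, a config that omits `auth_method` entirely now authenticates via ADC; a minimal sketch (dataset and table names are placeholders):

```yaml
out:
  type: bigquery          # auth_method omitted, so application_default (ADC) is used
  dataset: your_dataset_name
  table: your_table_name
```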
@@ -45,7 +43,7 @@ module Embulk
           'table_old' => config.param('table_old', :string, :default => nil),
           'table_name_old' => config.param('table_name_old', :string, :default => nil), # lower version compatibility
           'auto_create_dataset' => config.param('auto_create_dataset', :bool, :default => false),
-          'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
+          'auto_create_table' => config.param('auto_create_table', :bool, :default => true),
           'schema_file' => config.param('schema_file', :string, :default => nil),
           'template_table' => config.param('template_table', :string, :default => nil),
 
@@ -53,7 +51,6 @@ module Embulk
           'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
           'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
           'is_skip_job_result_check' => config.param('is_skip_job_result_check', :bool, :default => false),
-          'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
           'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
           'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
           'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),
@@ -105,10 +102,14 @@ module Embulk
           raise ConfigError.new "`mode` must be one of append, append_direct, replace, delete_in_advance, replace_backup"
         end
 
+        if %w[append replace delete_in_advance replace_backup].include?(task['mode']) and !task['auto_create_table']
+          raise ConfigError.new "`mode: #{task['mode']}` requires `auto_create_table: true`"
+        end
+
         if task['mode'] == 'replace_backup'
           task['table_old'] ||= task['table_name_old'] # for lower version compatibility
           if task['dataset_old'].nil? and task['table_old'].nil?
-            raise ConfigError.new "`mode replace_backup` requires either of `dataset_old` or `table_old`"
+            raise ConfigError.new "`mode: replace_backup` requires either of `dataset_old` or `table_old`"
           end
           task['dataset_old'] ||= task['dataset']
           task['table_old'] ||= task['table']
|
|
122
123
|
end
|
123
124
|
|
124
125
|
task['auth_method'] = task['auth_method'].downcase
|
125
|
-
unless %w[
|
126
|
-
raise ConfigError.new "`auth_method` must be one of
|
127
|
-
end
|
128
|
-
if task['auth_method'] == 'private_key' and task['p12_keyfile'].nil?
|
129
|
-
raise ConfigError.new "`p12_keyfile` is required for auth_method private_key"
|
126
|
+
unless %w[json_key service_account authorized_user compute_engine application_default].include?(task['auth_method'])
|
127
|
+
raise ConfigError.new "`auth_method` must be one of service_account (or json_key), authorized_user, compute_engine, application_default"
|
130
128
|
end
|
131
|
-
if task['auth_method'] == 'json_key' and task['json_keyfile'].nil?
|
132
|
-
raise ConfigError.new "`json_keyfile` is required for auth_method json_key"
|
129
|
+
if (task['auth_method'] == 'service_account' or task['auth_method'] == 'json_key') and task['json_keyfile'].nil?
|
130
|
+
raise ConfigError.new "`json_keyfile` is required for auth_method: service_account (or json_key)"
|
133
131
|
end
|
134
132
|
|
135
|
-
jsonkey_params = nil
|
136
133
|
if task['json_keyfile']
|
137
134
|
begin
|
138
|
-
|
135
|
+
json_key = JSON.parse(task['json_keyfile'])
|
136
|
+
task['project'] ||= json_key['project_id']
|
139
137
|
rescue => e
|
140
138
|
raise ConfigError.new "json_keyfile is not a JSON file"
|
141
139
|
end
|
142
140
|
end
|
143
|
-
|
144
|
-
if jsonkey_params
|
145
|
-
task['project'] ||= jsonkey_params['project_id']
|
146
|
-
end
|
147
141
|
if task['project'].nil?
|
148
142
|
raise ConfigError.new "Required field \"project\" is not set"
|
149
143
|
end
|
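A consequence of the `json_keyfile` parsing above, as a sketch (the keyfile path is hypothetical): when a service account key is supplied, `project` may be omitted because it falls back to the key's `project_id`.

```yaml
out:
  type: bigquery
  auth_method: service_account
  json_keyfile: /path/to/json_keyfile.json   # project defaults to this key's "project_id"
  dataset: your_dataset_name
  table: your_table_name
```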
@@ -306,42 +300,18 @@ module Embulk
 
         case task['mode']
         when 'delete_in_advance'
-          bigquery.
+          bigquery.delete_table_or_partition(task['table'])
           bigquery.create_table_if_not_exists(task['table'])
         when 'replace'
           bigquery.create_table_if_not_exists(task['temp_table'])
-          if Helper.has_partition_decorator?(task['table'])
-            if task['auto_create_table']
-              bigquery.create_table_if_not_exists(task['table'])
-            else
-              bigquery.get_table(task['table']) # raises NotFoundError
-            end
-          end
+          bigquery.create_table_if_not_exists(task['table']) # needs for when task['table'] is a partition
         when 'append'
           bigquery.create_table_if_not_exists(task['temp_table'])
-          if Helper.has_partition_decorator?(task['table'])
-            if task['auto_create_table']
-              bigquery.create_table_if_not_exists(task['table'])
-            else
-              bigquery.get_table(task['table']) # raises NotFoundError
-            end
-          end
+          bigquery.create_table_if_not_exists(task['table']) # needs for when task['table'] is a partition
         when 'replace_backup'
           bigquery.create_table_if_not_exists(task['temp_table'])
-          if Helper.has_partition_decorator?(task['table'])
-            if task['auto_create_table']
-              bigquery.create_table_if_not_exists(task['table'])
-            else
-              bigquery.get_table(task['table']) # raises NotFoundError
-            end
-          end
-          if Helper.has_partition_decorator?(task['table_old'])
-            if task['auto_create_table']
-              bigquery.create_table_if_not_exists(task['table_old'], dataset: task['dataset_old'])
-            else
-              bigquery.get_table(task['table_old'], dataset: task['dataset_old']) # raises NotFoundError
-            end
-          end
+          bigquery.create_table_if_not_exists(task['table'])
+          bigquery.create_table_if_not_exists(task['table_old'], dataset: task['dataset_old']) # needs for when a partition
         else # append_direct
           if task['auto_create_table']
             bigquery.create_table_if_not_exists(task['table'])