embulk-output-bigquery 0.4.14 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG.md +28 -0
- data/README.md +74 -77
- data/embulk-output-bigquery.gemspec +10 -3
- data/lib/embulk/output/bigquery.rb +19 -49
- data/lib/embulk/output/bigquery/auth.rb +35 -0
- data/lib/embulk/output/bigquery/bigquery_client.rb +2 -11
- data/lib/embulk/output/bigquery/google_client.rb +3 -34
- data/lib/embulk/output/bigquery/value_converter_factory.rb +12 -0
- data/test/test_bigquery_client.rb +1 -5
- data/test/test_configure.rb +10 -19
- data/test/test_example.rb +0 -1
- data/test/test_helper.rb +4 -1
- data/test/test_transaction.rb +22 -62
- data/test/test_value_converter_factory.rb +42 -0
- metadata +29 -52
- data/example/config_append_direct_schema_update_options.yml +0 -31
- data/example/config_client_options.yml +0 -33
- data/example/config_csv.yml +0 -30
- data/example/config_delete_in_advance.yml +0 -29
- data/example/config_delete_in_advance_field_partitioned_table.yml +0 -33
- data/example/config_delete_in_advance_partitioned_table.yml +0 -33
- data/example/config_expose_errors.yml +0 -30
- data/example/config_gcs.yml +0 -32
- data/example/config_guess_from_embulk_schema.yml +0 -29
- data/example/config_guess_with_column_options.yml +0 -40
- data/example/config_gzip.yml +0 -1
- data/example/config_jsonl.yml +0 -1
- data/example/config_max_threads.yml +0 -34
- data/example/config_min_ouput_tasks.yml +0 -34
- data/example/config_mode_append.yml +0 -30
- data/example/config_mode_append_direct.yml +0 -30
- data/example/config_nested_record.yml +0 -1
- data/example/config_payload_column.yml +0 -20
- data/example/config_payload_column_index.yml +0 -20
- data/example/config_prevent_duplicate_insert.yml +0 -30
- data/example/config_progress_log_interval.yml +0 -31
- data/example/config_replace.yml +0 -30
- data/example/config_replace_backup.yml +0 -32
- data/example/config_replace_backup_field_partitioned_table.yml +0 -34
- data/example/config_replace_backup_partitioned_table.yml +0 -34
- data/example/config_replace_field_partitioned_table.yml +0 -33
- data/example/config_replace_partitioned_table.yml +0 -33
- data/example/config_replace_schema_update_options.yml +0 -33
- data/example/config_skip_file_generation.yml +0 -32
- data/example/config_table_strftime.yml +0 -30
- data/example/config_template_table.yml +0 -21
- data/example/config_uncompressed.yml +0 -1
- data/example/config_with_rehearsal.yml +0 -33
- data/example/example.csv +0 -17
- data/example/example.yml +0 -1
- data/example/example2_1.csv +0 -1
- data/example/example2_2.csv +0 -1
- data/example/example4_1.csv +0 -1
- data/example/example4_2.csv +0 -1
- data/example/example4_3.csv +0 -1
- data/example/example4_4.csv +0 -1
- data/example/json_key.json +0 -12
- data/example/nested_example.jsonl +0 -16
- data/example/schema.json +0 -30
- data/example/schema_expose_errors.json +0 -30
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz:
-  data.tar.gz:
+SHA1:
+  metadata.gz: 8b3d7d7d675d8428946f81517d1002f667f4fafe
+  data.tar.gz: 25940b93f70492675869d3c4dd50f83f8b7347cf
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 97a2aff66c765f24289717ac79e0a25a6bf31ee3ec5b84b64c96e8573382b31b0a27c30f06692a296b3bfedd70ea9f34f1a451cea7de27d3fa4c61a7502bab98
+  data.tar.gz: b795d47af337e109dfafb9f41a0a720d0eb314c7ba7219193648505ec9dffa3874215b5d311256f625228a4f3e52b73153ee3d694a3d2f88d4c2fd0dd24960b1
data/CHANGELOG.md
CHANGED

@@ -1,3 +1,31 @@
+## 0.6.3 - 2019-10-28
+
+* [enhancement] Add DATE type converter (thanks to @tksfjt1024)
+
+## 0.6.2 - 2019-10-16
+
+* [maintenance] Lock signet and google-api-client versions (thanks to @hiroyuki-sato)
+
+## 0.6.1 - 2019-08-28
+
+* [maintenance] Release a new gem that does not include symlinks, so that it works on Windows.
+
+## 0.6.0 - 2019-08-11
+
+Cleanup `auth_method`:
+
+* [enhancement] Support `auth_method: authorized_user` (OAuth)
+* [incompatibility change] Rename `auth_method: json_key` to `auth_method: service_account` (`json_key` is kept for backward compatibility)
+* [incompatibility change] Remove deprecated `auth_method: private_key` (p12 key)
+* [incompatibility change] Change the default `auth_method` to `application_default` from `private_key` because `private_key` was dropped.
+
+## 0.5.0 - 2019-08-10
+
+* [incompatibility change] Drop deprecated `time_partitioning.require_partition_filter`
+* [incompatibility change] Drop `prevent_duplicate_insert`, which no longer has a use-case
+* [incompatibility change] The `replace`, `replace_backup`, `append`, and `delete_in_advance` modes now require `auto_create_table: true`. Previously these modes created the target table even with `auto_create_table: false`, which confused users. Note that `auto_create_table: true` is required even for a partition (a table name with a partition decorator), which may not need table creation; this keeps the logic and implementation simple.
+* [incompatibility change] Change the default value of `auto_create_table` to `true`, because all modes except `append_direct` now require it.
+
 ## 0.4.14 - 2019-08-10
 
 * [enhancement] Support field partitioning correctly.
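To make the 0.5.0 and 0.6.0 changes above concrete, a minimal 0.6.x config that simply relies on the new defaults could look like the sketch below; the project, dataset and table names are placeholders, and the comments mark the options whose defaults changed:

```yaml
out:
  type: bigquery
  mode: replace                  # replace/replace_backup/append/delete_in_advance now need auto_create_table: true
  # auth_method defaults to application_default since 0.6.0, so it can be omitted here
  # auto_create_table defaults to true since 0.5.0, so it can be omitted here
  project: your-project-000
  dataset: your_dataset_name
  table: your_table_name
```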
data/README.md
CHANGED

@@ -23,34 +23,23 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 
-### INCOMPATIBILITY CHANGES
-
-v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
-
-* `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead. (it already exists in v0.2.x too)
-* `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
-* `mode: append` mode now expresses a transactional append, and `mode: append_direct` is one which is not transactional.
-
 ## Configuration
 
 #### Original options
 
 | name | type | required? | default | description |
 |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------|
-| mode | string | optional | "append" | See [Mode](#mode)
-| auth_method | string | optional | "
-
-
-| json_keyfile | string | required when auth_method is json_key | | Fullpath of json key |
-| project | string | required if json_keyfile is not given | | project_id |
+| mode | string | optional | "append" | See [Mode](#mode) |
+| auth_method | string | optional | "application\_default" | See [Authentication](#authentication) |
+| json_keyfile | string | optional | | keyfile path or `content` |
+| project | string | required unless service\_account's `json_keyfile` is given. | | project\_id |
 | dataset | string | required | | dataset |
 | location | string | optional | nil | geographic location of dataset. See [Location](#location) |
 | table | string | required | | table name, or table name with a partition decorator such as `table_name$20160929`|
 | auto_create_dataset | boolean | optional | false | automatically create dataset |
-| auto_create_table | boolean | optional |
+| auto_create_table | boolean | optional | true | `false` is available only for `append_direct` mode. Other modes require `true`. See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
 | schema_file | string | optional | | /path/to/schema.json |
 | template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) |
-| prevent_duplicate_insert | boolean | optional | false | See [Prevent Duplication](#prevent-duplication) |
 | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
 | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
 | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
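As the `project` row above notes, the project can be omitted when a `service_account` key file is given, because the plugin reads `project_id` from the key (see the `configure` change further down in this diff). A minimal sketch with placeholder names:

```yaml
out:
  type: bigquery
  auth_method: service_account
  json_keyfile: /path/to/json_keyfile.json   # project is taken from the key's project_id
  dataset: your_dataset_name
  table: your_table_name
```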
@@ -107,7 +96,6 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
 | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
 | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
 | time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning |
-| time_partitioning.require_partition_filter | boolean | optional | nil | If true, valid partition filter is required when query |
 | clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so must be used with `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
 | clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
 | schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` job, that is, is not effective for most of modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes origin table so does not need to update schema. Only `append_direct` can utilize schema update. |
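For illustration, the partitioning and clustering options above combine roughly as in this sketch; the column names `timestamp`, `country` and `city` are placeholders, and `clustering` must be used together with `time_partitioning`:

```yaml
out:
  type: bigquery
  mode: replace
  table: table_name
  time_partitioning:
    type: DAY                    # only DAY is supported
    field: timestamp             # top-level DATE or TIMESTAMP column
    expiration_ms: 259200000
  clustering:
    fields:
      - country
      - city
```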
@@ -118,9 +106,8 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
 out:
   type: bigquery
   mode: append
-  auth_method:
-
-  p12_keyfile: /path/to/p12_keyfile.p12
+  auth_method: service_account
+  json_keyfile: /path/to/json_keyfile.json
   project: your-project-000
   dataset: your_dataset_name
   table: your_table_name
@@ -128,7 +115,7 @@ out:
   source_format: NEWLINE_DELIMITED_JSON
 ```
 
-###
+### Location
 
 The geographic location of the dataset. Required except for US and EU.
 
@@ -136,7 +123,7 @@ GCS bucket should be in same region when you use `gcs_bucket`.
 
 See also [Dataset Locations | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/dataset-locations)
 
-###
+### Mode
 
 5 modes are provided.
 
@@ -175,53 +162,69 @@ NOTE: BigQuery does not support replacing (actually, copying into) a non-partiti
 
 ### Authentication
 
-There are
+There are four authentication methods
+
+1. `service_account` (or `json_key` for backward compatibility)
+1. `authorized_user`
+1. `compute_engine`
+1. `application_default`
+
+#### service\_account (or json\_key)
 
-
-
-3. Pre-defined access token (Google Compute Engine only)
+Use GCP service account credentials.
+You first need to create a service account, download its json key and deploy the key with embulk.
 
-
+```yaml
+out:
+  type: bigquery
+  auth_method: service_account
+  json_keyfile: /path/to/json_keyfile.json
+```
 
-You
-download its private key and deploy the key with embulk.
+You can also embed contents of `json_keyfile` at config.yml.
 
 ```yaml
 out:
   type: bigquery
-  auth_method:
-
-
+  auth_method: service_account
+  json_keyfile:
+    content: |
+      {
+          "private_key_id": "123456789",
+          "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
+          "client_email": "..."
+      }
 ```
 
-####
+#### authorized\_user
 
-
-
+Use Google user credentials.
+You can get your credentials at `~/.config/gcloud/application_default_credentials.json` by running `gcloud auth login`.
 
 ```yaml
 out:
   type: bigquery
-  auth_method:
-  json_keyfile: /path/to/
+  auth_method: authorized_user
+  json_keyfile: /path/to/credentials.json
 ```
 
-You can also embed contents of json_keyfile at config.yml.
+You can also embed contents of `json_keyfile` at config.yml.
 
 ```yaml
 out:
   type: bigquery
-  auth_method:
+  auth_method: authorized_user
   json_keyfile:
     content: |
       {
-
-
-
-
+          "client_id":"xxxxxxxxxxx.apps.googleusercontent.com",
+          "client_secret":"xxxxxxxxxxx",
+          "refresh_token":"xxxxxxxxxxx",
+          "type":"authorized_user"
+      }
 ```
 
-####
+#### compute\_engine
 
 On the other hand, you don't need to explicitly create a service account for embulk when you
 run embulk in Google Compute Engine. In this third authentication method, you need to
@@ -234,6 +237,22 @@ out:
   auth_method: compute_engine
 ```
 
+#### application\_default
+
+Use Application Default Credentials (ADC). ADC is a strategy to locate Google Cloud Service Account credentials.
+
+1. ADC checks to see if the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set. If the variable is set, ADC uses the service account file that the variable points to.
+2. ADC checks to see if `~/.config/gcloud/application_default_credentials.json` is located. This file is created by running `gcloud auth application-default login`.
+3. Use the default service account for credentials if the application running on Compute Engine, App Engine, Kubernetes Engine, Cloud Functions or Cloud Run.
+
+See https://cloud.google.com/docs/authentication/production for details.
+
+```yaml
+out:
+  type: bigquery
+  auth_method: application_default
+```
+
 ### Table id formatting
 
 `table` and option accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
@@ -242,21 +261,16 @@ Table ids are formatted at runtime
 using the local time of the embulk server.
 
 For example, with the configuration below,
-data is inserted into tables `
+data is inserted into tables `table_20150503`, `table_20150504` and so on.
 
 ```yaml
 out:
   type: bigquery
-  table: table_%
+  table: table_%Y%m%d
 ```
 
 ### Dynamic table creating
 
-This plugin tries to create a table using BigQuery API when
-
-* mode is either of `delete_in_advance`, `replace`, `replace_backup`, `append`.
-* mode is `append_direct` and `auto_create_table` is true.
-
 There are 3 ways to set schema.
 
 #### Set schema.json
@@ -267,7 +281,7 @@ Please set file path of schema.json.
 out:
   type: bigquery
   auto_create_table: true
-  table: table_%
+  table: table_%Y%m%d
   schema_file: /path/to/schema.json
 ```
 
@@ -279,7 +293,7 @@ Plugin will try to read schema from existing table and use it as schema template
 out:
   type: bigquery
   auto_create_table: true
-  table: table_%
+  table: table_%Y%m%d
   template_table: existing_table_name
 ```
 
@@ -293,17 +307,17 @@ Column options are used to aid guessing BigQuery schema, or to define conversion
 
 - **column_options**: advanced: an array of options for columns
   - **name**: column name
-  - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, and `RECORD`. See belows for supported conversion type.
+  - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE`, and `RECORD`. See belows for supported conversion type.
     - boolean: `BOOLEAN`, `STRING` (default: `BOOLEAN`)
     - long: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `INTEGER`)
     - double: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `FLOAT`)
-    - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `RECORD` (default: `STRING`)
-    - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `TIMESTAMP`)
+    - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE`, `RECORD` (default: `STRING`)
+    - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE` (default: `TIMESTAMP`)
     - json: `STRING`, `RECORD` (default: `STRING`)
   - **mode**: BigQuery mode such as `NULLABLE`, `REQUIRED`, and `REPEATED` (string, default: `NULLABLE`)
   - **fields**: Describes the nested schema fields if the type property is set to RECORD. Please note that this is **required** for `RECORD` column.
   - **timestamp_format**: timestamp format to convert into/from `timestamp` (string, default is `default_timestamp_format`)
-  - **timezone**: timezone to convert into/from `timestamp` (string, default is `default_timezone`).
+  - **timezone**: timezone to convert into/from `timestamp`, `date` (string, default is `default_timezone`).
 - **default_timestamp_format**: default timestamp format for column_options (string, default is "%Y-%m-%d %H:%M:%S.%6N")
 - **default_timezone**: default timezone for column_options (string, default is "UTC")
 
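A small sketch of the `DATE` conversion that 0.6.3 adds, using the options listed above; the column name and timezone are placeholders:

```yaml
out:
  type: bigquery
  column_options:
    - name: created_at
      type: DATE                 # convert an Embulk timestamp column into a BigQuery DATE
      timezone: Asia/Tokyo       # applied when converting into the DATE value
  default_timezone: UTC
```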
@@ -355,22 +369,6 @@ out:
   payload_column_index: 0 # or, payload_column: payload
 ```
 
-### Prevent Duplication
-
-`prevent_duplicate_insert` option is used to prevent inserting same data for modes `append` or `append_direct`.
-
-When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options.
-
-`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
-
-[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency) so that same data can't be inserted with same settings repeatedly.
-
-```yaml
-out:
-  type: bigquery
-  prevent_duplicate_insert: true
-```
-
 ### GCS Bucket
 
 This is useful to reduce number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs).
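A rough sketch of a config using the GCS loading path described here; the bucket name is a placeholder, and the bucket should be in the same region as the dataset (see the Location section):

```yaml
out:
  type: bigquery
  mode: append
  gcs_bucket: your_bucket_name   # files are staged to this bucket and then loaded into BigQuery
  dataset: your_dataset_name
  table: your_table_name
```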
@@ -401,32 +399,31 @@ To load into a partition, specify `table` parameter with a partition decorator a
 out:
   type: bigquery
   table: table_name$20160929
-  auto_create_table: true
 ```
 
-You may configure `time_partitioning` parameter together
+You may configure `time_partitioning` parameter together as:
 
 ```yaml
 out:
   type: bigquery
   table: table_name$20160929
-  auto_create_table: true
   time_partitioning:
     type: DAY
     expiration_ms: 259200000
 ```
 
 You can also create column-based partitioning table as:
+
 ```yaml
 out:
   type: bigquery
   mode: replace
-  auto_create_table: true
   table: table_name
   time_partitioning:
     type: DAY
     field: timestamp
 ```
+
 Note the `time_partitioning.field` should be top-level `DATE` or `TIMESTAMP`.
 
 Use [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table, embulk-output-bigquery itself does not support it, though.
data/embulk-output-bigquery.gemspec
CHANGED

@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.
+  spec.version = "0.6.3"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."

@@ -8,11 +8,18 @@ Gem::Specification.new do |spec|
   spec.licenses = ["MIT"]
   spec.homepage = "https://github.com/embulk/embulk-output-bigquery"
 
-
+  # Exclude example directory which uses symlinks from generating gem.
+  # Symlinks do not work properly on the Windows platform without administrator privilege.
+  spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] - Dir["example/*" ]
   spec.test_files = spec.files.grep(%r{^(test|spec)/})
   spec.require_paths = ["lib"]
 
-
+  # TODO
+  # signet 0.12.0 and google-api-client 0.33.0 require >= Ruby 2.4.
+  # Embulk 0.9 use JRuby 9.1.X.Y and It compatible Ruby 2.3.
+  # So, Force install signet < 0.12 and google-api-client < 0.33.0
+  spec.add_dependency 'signet', '~> 0.7', '< 0.12.0'
+  spec.add_dependency 'google-api-client','< 0.33.0'
   spec.add_dependency 'time_with_zone'
 
   spec.add_development_dependency 'bundler', ['>= 1.10.6']
data/lib/embulk/output/bigquery.rb
CHANGED

@@ -23,7 +23,7 @@ module Embulk
       # @return JSON string
       def self.load(v)
        if v.is_a?(String) # path
-          File.read(v)
+          File.read(File.expand_path(v))
        elsif v.is_a?(Hash)
          v['content']
        end
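Since `json_keyfile` paths now go through `File.expand_path`, a home-relative path resolves as expected; a small sketch with a placeholder path:

```yaml
out:
  type: bigquery
  auth_method: service_account
  json_keyfile: ~/.secrets/embulk-bigquery.json   # expanded to an absolute path before it is read
```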
@@ -33,9 +33,7 @@ module Embulk
      def self.configure(config, schema, task_count)
        task = {
          'mode' => config.param('mode', :string, :default => 'append'),
-          'auth_method' => config.param('auth_method', :string, :default => '
-          'service_account_email' => config.param('service_account_email', :string, :default => nil),
-          'p12_keyfile' => config.param('p12_keyfile', :string, :default => nil),
+          'auth_method' => config.param('auth_method', :string, :default => 'application_default'),
          'json_keyfile' => config.param('json_keyfile', LocalFile, :default => nil),
          'project' => config.param('project', :string, :default => nil),
          'dataset' => config.param('dataset', :string),

@@ -45,7 +43,7 @@ module Embulk
          'table_old' => config.param('table_old', :string, :default => nil),
          'table_name_old' => config.param('table_name_old', :string, :default => nil), # lower version compatibility
          'auto_create_dataset' => config.param('auto_create_dataset', :bool, :default => false),
-          'auto_create_table' => config.param('auto_create_table', :bool, :default =>
+          'auto_create_table' => config.param('auto_create_table', :bool, :default => true),
          'schema_file' => config.param('schema_file', :string, :default => nil),
          'template_table' => config.param('template_table', :string, :default => nil),
 
@@ -53,7 +51,6 @@ module Embulk
          'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
          'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
          'is_skip_job_result_check' => config.param('is_skip_job_result_check', :bool, :default => false),
-          'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
          'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
          'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
          'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),

@@ -105,10 +102,14 @@ module Embulk
          raise ConfigError.new "`mode` must be one of append, append_direct, replace, delete_in_advance, replace_backup"
        end
 
+        if %w[append replace delete_in_advance replace_backup].include?(task['mode']) and !task['auto_create_table']
+          raise ConfigError.new "`mode: #{task['mode']}` requires `auto_create_table: true`"
+        end
+
        if task['mode'] == 'replace_backup'
          task['table_old'] ||= task['table_name_old'] # for lower version compatibility
          if task['dataset_old'].nil? and task['table_old'].nil?
-            raise ConfigError.new "`mode replace_backup` requires either of `dataset_old` or `table_old`"
+            raise ConfigError.new "`mode: replace_backup` requires either of `dataset_old` or `table_old`"
          end
          task['dataset_old'] ||= task['dataset']
          task['table_old'] ||= task['table']
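With the new check above, a config like the following sketch (placeholder names) is rejected at configure time with "`mode: replace` requires `auto_create_table: true`", since `false` remains valid only for `append_direct`:

```yaml
out:
  type: bigquery
  mode: replace
  auto_create_table: false   # raises ConfigError in 0.5.0+; drop this line or set it to true
  dataset: your_dataset_name
  table: your_table_name
```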
@@ -122,28 +123,21 @@ module Embulk
        end
 
        task['auth_method'] = task['auth_method'].downcase
-        unless %w[
-          raise ConfigError.new "`auth_method` must be one of
-        end
-        if task['auth_method'] == 'private_key' and task['p12_keyfile'].nil?
-          raise ConfigError.new "`p12_keyfile` is required for auth_method private_key"
+        unless %w[json_key service_account authorized_user compute_engine application_default].include?(task['auth_method'])
+          raise ConfigError.new "`auth_method` must be one of service_account (or json_key), authorized_user, compute_engine, application_default"
        end
-        if task['auth_method'] == 'json_key' and task['json_keyfile'].nil?
-          raise ConfigError.new "`json_keyfile` is required for auth_method json_key"
+        if (task['auth_method'] == 'service_account' or task['auth_method'] == 'json_key') and task['json_keyfile'].nil?
+          raise ConfigError.new "`json_keyfile` is required for auth_method: service_account (or json_key)"
        end
 
-        jsonkey_params = nil
        if task['json_keyfile']
          begin
-
+            json_key = JSON.parse(task['json_keyfile'])
+            task['project'] ||= json_key['project_id']
          rescue => e
            raise ConfigError.new "json_keyfile is not a JSON file"
          end
        end
-
-        if jsonkey_params
-          task['project'] ||= jsonkey_params['project_id']
-        end
        if task['project'].nil?
          raise ConfigError.new "Required field \"project\" is not set"
        end
@@ -306,42 +300,18 @@ module Embulk
 
        case task['mode']
        when 'delete_in_advance'
-          bigquery.
+          bigquery.delete_table_or_partition(task['table'])
          bigquery.create_table_if_not_exists(task['table'])
        when 'replace'
          bigquery.create_table_if_not_exists(task['temp_table'])
-
-            if task['auto_create_table']
-              bigquery.create_table_if_not_exists(task['table'])
-            else
-              bigquery.get_table(task['table']) # raises NotFoundError
-            end
-          end
+          bigquery.create_table_if_not_exists(task['table']) # needs for when task['table'] is a partition
        when 'append'
          bigquery.create_table_if_not_exists(task['temp_table'])
-
-            if task['auto_create_table']
-              bigquery.create_table_if_not_exists(task['table'])
-            else
-              bigquery.get_table(task['table']) # raises NotFoundError
-            end
-          end
+          bigquery.create_table_if_not_exists(task['table']) # needs for when task['table'] is a partition
        when 'replace_backup'
          bigquery.create_table_if_not_exists(task['temp_table'])
-
-
-              bigquery.create_table_if_not_exists(task['table'])
-            else
-              bigquery.get_table(task['table']) # raises NotFoundError
-            end
-          end
-          if Helper.has_partition_decorator?(task['table_old'])
-            if task['auto_create_table']
-              bigquery.create_table_if_not_exists(task['table_old'], dataset: task['dataset_old'])
-            else
-              bigquery.get_table(task['table_old'], dataset: task['dataset_old']) # raises NotFoundError
-            end
-          end
+          bigquery.create_table_if_not_exists(task['table'])
+          bigquery.create_table_if_not_exists(task['table_old'], dataset: task['dataset_old']) # needs for when a partition
        else # append_direct
          if task['auto_create_table']
            bigquery.create_table_if_not_exists(task['table'])
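Finally, going by the new `delete_table_or_partition` call above, a `delete_in_advance` run against a partition decorator is meant to clear just that partition before loading; a sketch with placeholder names:

```yaml
out:
  type: bigquery
  mode: delete_in_advance
  table: table_name$20160929   # only this partition is targeted, then recreated and loaded
  dataset: your_dataset_name
```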
|