embulk-output-bigquery 0.4.14 → 0.6.3

This diff represents the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
Files changed (61)
  1. checksums.yaml +5 -5
  2. data/CHANGELOG.md +28 -0
  3. data/README.md +74 -77
  4. data/embulk-output-bigquery.gemspec +10 -3
  5. data/lib/embulk/output/bigquery.rb +19 -49
  6. data/lib/embulk/output/bigquery/auth.rb +35 -0
  7. data/lib/embulk/output/bigquery/bigquery_client.rb +2 -11
  8. data/lib/embulk/output/bigquery/google_client.rb +3 -34
  9. data/lib/embulk/output/bigquery/value_converter_factory.rb +12 -0
  10. data/test/test_bigquery_client.rb +1 -5
  11. data/test/test_configure.rb +10 -19
  12. data/test/test_example.rb +0 -1
  13. data/test/test_helper.rb +4 -1
  14. data/test/test_transaction.rb +22 -62
  15. data/test/test_value_converter_factory.rb +42 -0
  16. metadata +29 -52
  17. data/example/config_append_direct_schema_update_options.yml +0 -31
  18. data/example/config_client_options.yml +0 -33
  19. data/example/config_csv.yml +0 -30
  20. data/example/config_delete_in_advance.yml +0 -29
  21. data/example/config_delete_in_advance_field_partitioned_table.yml +0 -33
  22. data/example/config_delete_in_advance_partitioned_table.yml +0 -33
  23. data/example/config_expose_errors.yml +0 -30
  24. data/example/config_gcs.yml +0 -32
  25. data/example/config_guess_from_embulk_schema.yml +0 -29
  26. data/example/config_guess_with_column_options.yml +0 -40
  27. data/example/config_gzip.yml +0 -1
  28. data/example/config_jsonl.yml +0 -1
  29. data/example/config_max_threads.yml +0 -34
  30. data/example/config_min_ouput_tasks.yml +0 -34
  31. data/example/config_mode_append.yml +0 -30
  32. data/example/config_mode_append_direct.yml +0 -30
  33. data/example/config_nested_record.yml +0 -1
  34. data/example/config_payload_column.yml +0 -20
  35. data/example/config_payload_column_index.yml +0 -20
  36. data/example/config_prevent_duplicate_insert.yml +0 -30
  37. data/example/config_progress_log_interval.yml +0 -31
  38. data/example/config_replace.yml +0 -30
  39. data/example/config_replace_backup.yml +0 -32
  40. data/example/config_replace_backup_field_partitioned_table.yml +0 -34
  41. data/example/config_replace_backup_partitioned_table.yml +0 -34
  42. data/example/config_replace_field_partitioned_table.yml +0 -33
  43. data/example/config_replace_partitioned_table.yml +0 -33
  44. data/example/config_replace_schema_update_options.yml +0 -33
  45. data/example/config_skip_file_generation.yml +0 -32
  46. data/example/config_table_strftime.yml +0 -30
  47. data/example/config_template_table.yml +0 -21
  48. data/example/config_uncompressed.yml +0 -1
  49. data/example/config_with_rehearsal.yml +0 -33
  50. data/example/example.csv +0 -17
  51. data/example/example.yml +0 -1
  52. data/example/example2_1.csv +0 -1
  53. data/example/example2_2.csv +0 -1
  54. data/example/example4_1.csv +0 -1
  55. data/example/example4_2.csv +0 -1
  56. data/example/example4_3.csv +0 -1
  57. data/example/example4_4.csv +0 -1
  58. data/example/json_key.json +0 -12
  59. data/example/nested_example.jsonl +0 -16
  60. data/example/schema.json +0 -30
  61. data/example/schema_expose_errors.json +0 -30
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA256:
-  metadata.gz: 4fb376f288bfa86d632d727b3d0770ca4b94e364261c3f87a2569c801ee2fa00
-  data.tar.gz: 2571a07afb9aac0774e0744f9d5118712bb83f44f82470dd4fd25bf515c7b9fa
+SHA1:
+  metadata.gz: 8b3d7d7d675d8428946f81517d1002f667f4fafe
+  data.tar.gz: 25940b93f70492675869d3c4dd50f83f8b7347cf
 SHA512:
-  metadata.gz: 15f71decc69d34d8fbc3ee09452a6307107b71f759820b8a0521c6473b2231c4706febf216b59baae0e18fc3a06a056c18552d1093f0ac264ef84183a6d27992
-  data.tar.gz: 7ee57f82766927cb804bf0d88550f7f3e4d0459315160a0eec98ccd4c00e2a2423a093cffd17e836d2dba8461cbc2ae4e227ff85d60c7c9628d32b1fd142b7eb
+  metadata.gz: 97a2aff66c765f24289717ac79e0a25a6bf31ee3ec5b84b64c96e8573382b31b0a27c30f06692a296b3bfedd70ea9f34f1a451cea7de27d3fa4c61a7502bab98
+  data.tar.gz: b795d47af337e109dfafb9f41a0a720d0eb314c7ba7219193648505ec9dffa3874215b5d311256f625228a4f3e52b73153ee3d694a3d2f88d4c2fd0dd24960b1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,31 @@
+## 0.6.3 - 2019-10-28
+
+* [enhancement] Add DATE type converter (thanks to @tksfjt1024)
+
+## 0.6.2 - 2019-10-16
+
+* [maintenance] Lock signet and google-api-client versions (thanks to @hiroyuki-sato)
+
+## 0.6.1 - 2019-08-28
+
+* [maintenance] Release a new gem that does not include symlinks, to make it work on Windows.
+
+## 0.6.0 - 2019-08-11
+
+Cleanup `auth_method`:
+
+* [enhancement] Support `auth_method: authorized_user` (OAuth)
+* [incompatibility change] Rename `auth_method: json_key` to `auth_method: service_account` (`json_key` is kept for backward compatibility)
+* [incompatibility change] Remove deprecated `auth_method: private_key` (p12 key)
+* [incompatibility change] Change the default `auth_method` to `application_default` from `private_key` because `private_key` was dropped.
+
+## 0.5.0 - 2019-08-10
+
+* [incompatibility change] Drop deprecated `time_partitioning`.`require_partition_filter`
+* [incompatibility change] Drop `prevent_duplicate_insert`, which has no use-case now
+* [incompatibility change] Modes `replace`, `replace_backup`, `append`, and `delete_in_advance` now require `auto_create_table: true` because, previously, these modes created a target table even with `auto_create_table: false`, which confused users. Note that `auto_create_table: true` is required even for a partition (a table name with a partition decorator), which may not need table creation; this keeps the logic and implementation simple.
+* [incompatibility change] Change the default value of `auto_create_table` to `true` because the above four modes, that is, all modes except `append_direct`, now always require `auto_create_table: true`.
+
 ## 0.4.14 - 2019-08-10
 
 * [enhancement] Support field partitioning correctly.
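
Taken together, the 0.5.0 and 0.6.0 entries change what a typical config looks like. A minimal sketch of a pre-0.5 `private_key` config migrated to the 0.6.x style (paths and names are illustrative placeholders):

```yaml
# Before (0.4.x): auth_method: private_key with a p12 key -- removed in 0.6.0.
# After (0.6.x): service_account with a JSON key; auto_create_table defaults to true,
# which the replace/append/replace_backup/delete_in_advance modes now require.
out:
  type: bigquery
  mode: replace
  auth_method: service_account
  json_keyfile: /path/to/json_keyfile.json
  project: your-project-000
  dataset: your_dataset_name
  table: your_table_name
```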
data/README.md CHANGED
@@ -23,34 +23,23 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 
-### INCOMPATIBILITY CHANGES
-
-v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
-
-* `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead. (it already exists in v0.2.x too)
-* `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
-* `mode: append` mode now expresses a transactional append, and `mode: append_direct` is one which is not transactional.
-
 ## Configuration
 
 #### Original options
 
 | name | type | required? | default | description |
 |:--------------------------------------|:------------|:-----------|:-------------------------|:-----------------------|
-| mode | string | optional | "append" | See [Mode](#mode) |
-| auth_method | string | optional | "private_key" | `private_key`, `json_key` or `compute_engine` |
-| service_account_email | string | required when auth_method is private_key | | Your Google service account email |
-| p12_keyfile | string | required when auth_method is private_key | | Fullpath of private key in P12(PKCS12) format |
-| json_keyfile | string | required when auth_method is json_key | | Fullpath of json key |
-| project | string | required if json_keyfile is not given | | project_id |
+| mode | string | optional | "append" | See [Mode](#mode) |
+| auth_method | string | optional | "application\_default" | See [Authentication](#authentication) |
+| json_keyfile | string | optional | | keyfile path or `content` |
+| project | string | required unless service\_account's `json_keyfile` is given. | | project\_id |
 | dataset | string | required | | dataset |
 | location | string | optional | nil | geographic location of dataset. See [Location](#location) |
 | table | string | required | | table name, or table name with a partition decorator such as `table_name$20160929` |
 | auto_create_dataset | boolean | optional | false | automatically create dataset |
-| auto_create_table | boolean | optional | false | See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
+| auto_create_table | boolean | optional | true | `false` is available only for `append_direct` mode. Other modes require `true`. See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
 | schema_file | string | optional | | /path/to/schema.json |
 | template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) |
-| prevent_duplicate_insert | boolean | optional | false | See [Prevent Duplication](#prevent-duplication) |
 | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
 | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
 | is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
@@ -107,7 +96,6 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
 | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
 | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
 | time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning |
-| time_partitioning.require_partition_filter | boolean | optional | nil | If true, valid partition filter is required when query |
 | clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so must be used with `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
 | clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
 | schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` job, that is, is not effective for most of modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes origin table so does not need to update schema. Only `append_direct` can utilize schema update. |
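
The `clustering` rows above only describe the options; a sketch of a config that combines `time_partitioning` with `clustering` (the field names are illustrative, and clustering currently requires a partitioned table):

```yaml
out:
  type: bigquery
  mode: replace
  table: table_name
  time_partitioning:
    type: DAY
    field: event_time      # a top-level DATE or TIMESTAMP column
  clustering:
    fields:                # data is sorted by these columns, in this order
      - country
      - user_id
```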
@@ -118,9 +106,8 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
 out:
   type: bigquery
   mode: append
-  auth_method: private_key   # default
-  service_account_email: ABCXYZ123ABCXYZ123.gserviceaccount.com
-  p12_keyfile: /path/to/p12_keyfile.p12
+  auth_method: service_account
+  json_keyfile: /path/to/json_keyfile.json
   project: your-project-000
   dataset: your_dataset_name
   table: your_table_name
@@ -128,7 +115,7 @@ out:
   source_format: NEWLINE_DELIMITED_JSON
 ```
 
-### location
+### Location
 
 The geographic location of the dataset. Required except for US and EU.
 
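A short sketch of the `location` option described above, assuming a dataset outside the US and EU (the region name is only an example):

```yaml
out:
  type: bigquery
  dataset: your_dataset_name
  table: your_table_name
  location: asia-northeast1   # when gcs_bucket is used, the bucket must be in the same region
```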
@@ -136,7 +123,7 @@ GCS bucket should be in same region when you use `gcs_bucket`.
 
 See also [Dataset Locations | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/dataset-locations)
 
-### mode
+### Mode
 
 5 modes are provided.
 
@@ -175,53 +162,69 @@ NOTE: BigQuery does not support replacing (actually, copying into) a non-partiti
 
 ### Authentication
 
-There are three methods supported to fetch access token for the service account.
+There are four authentication methods
+
+1. `service_account` (or `json_key` for backward compatibility)
+1. `authorized_user`
+1. `compute_engine`
+1. `application_default`
+
+#### service\_account (or json\_key)
 
-1. Public-Private key pair of GCP(Google Cloud Platform)'s service account
-2. JSON key of GCP(Google Cloud Platform)'s service account
-3. Pre-defined access token (Google Compute Engine only)
+Use GCP service account credentials.
+You first need to create a service account, download its json key and deploy the key with embulk.
 
-#### Public-Private key pair of GCP's service account
+```yaml
+out:
+  type: bigquery
+  auth_method: service_account
+  json_keyfile: /path/to/json_keyfile.json
+```
 
-You first need to create a service account (client ID),
-download its private key and deploy the key with embulk.
+You can also embed contents of `json_keyfile` at config.yml.
 
 ```yaml
 out:
   type: bigquery
-  auth_method: private_key   # default
-  service_account_email: ABCXYZ123ABCXYZ123.gserviceaccount.com
-  p12_keyfile: /path/to/p12_keyfile.p12
+  auth_method: service_account
+  json_keyfile:
+    content: |
+      {
+        "private_key_id": "123456789",
+        "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
+        "client_email": "..."
+      }
 ```
 
-#### JSON key of GCP's service account
+#### authorized\_user
 
-You first need to create a service account (client ID),
-download its json key and deploy the key with embulk.
+Use Google user credentials.
+You can get your credentials at `~/.config/gcloud/application_default_credentials.json` by running `gcloud auth login`.
 
 ```yaml
 out:
   type: bigquery
-  auth_method: json_key
-  json_keyfile: /path/to/json_keyfile.json
+  auth_method: authorized_user
+  json_keyfile: /path/to/credentials.json
 ```
 
-You can also embed contents of json_keyfile at config.yml.
+You can also embed contents of `json_keyfile` at config.yml.
 
 ```yaml
 out:
   type: bigquery
-  auth_method: json_key
+  auth_method: authorized_user
   json_keyfile:
     content: |
       {
-        "private_key_id": "123456789",
-        "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
-        "client_email": "..."
-      }
+        "client_id":"xxxxxxxxxxx.apps.googleusercontent.com",
+        "client_secret":"xxxxxxxxxxx",
+        "refresh_token":"xxxxxxxxxxx",
+        "type":"authorized_user"
+      }
 ```
 
-#### Pre-defined access token(GCE only)
+#### compute\_engine
 
 On the other hand, you don't need to explicitly create a service account for embulk when you
 run embulk in Google Compute Engine. In this third authentication method, you need to
@@ -234,6 +237,22 @@ out:
   auth_method: compute_engine
 ```
 
+#### application\_default
+
+Use Application Default Credentials (ADC). ADC is a strategy to locate Google Cloud Service Account credentials.
+
+1. ADC checks to see if the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set. If the variable is set, ADC uses the service account file that the variable points to.
+2. ADC checks to see if `~/.config/gcloud/application_default_credentials.json` is located. This file is created by running `gcloud auth application-default login`.
+3. Use the default service account for credentials if the application is running on Compute Engine, App Engine, Kubernetes Engine, Cloud Functions or Cloud Run.
+
+See https://cloud.google.com/docs/authentication/production for details.
+
+```yaml
+out:
+  type: bigquery
+  auth_method: application_default
+```
+
 ### Table id formatting
 
 `table` and option accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
@@ -242,21 +261,16 @@ Table ids are formatted at runtime
 using the local time of the embulk server.
 
 For example, with the configuration below,
-data is inserted into tables `table_2015_04`, `table_2015_05` and so on.
+data is inserted into tables `table_20150503`, `table_20150504` and so on.
 
 ```yaml
 out:
   type: bigquery
-  table: table_%Y_%m
+  table: table_%Y%m%d
 ```
 
 ### Dynamic table creating
 
-This plugin tries to create a table using BigQuery API when
-
-* mode is either of `delete_in_advance`, `replace`, `replace_backup`, `append`.
-* mode is `append_direct` and `auto_create_table` is true.
-
 There are 3 ways to set schema.
 
 #### Set schema.json
@@ -267,7 +281,7 @@ Please set file path of schema.json.
 out:
   type: bigquery
   auto_create_table: true
-  table: table_%Y_%m
+  table: table_%Y%m%d
   schema_file: /path/to/schema.json
 ```
 
@@ -279,7 +293,7 @@ Plugin will try to read schema from existing table and use it as schema template.
 out:
   type: bigquery
   auto_create_table: true
-  table: table_%Y_%m
+  table: table_%Y%m%d
   template_table: existing_table_name
 ```
 
@@ -293,17 +307,17 @@ Column options are used to aid guessing BigQuery schema, or to define conversion
 
 - **column_options**: advanced: an array of options for columns
   - **name**: column name
-  - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, and `RECORD`. See belows for supported conversion type.
+  - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE`, and `RECORD`. See belows for supported conversion type.
     - boolean: `BOOLEAN`, `STRING` (default: `BOOLEAN`)
     - long: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `INTEGER`)
     - double: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `FLOAT`)
-    - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `RECORD` (default: `STRING`)
-    - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `TIMESTAMP`)
+    - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE`, `RECORD` (default: `STRING`)
+    - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE` (default: `TIMESTAMP`)
     - json: `STRING`, `RECORD` (default: `STRING`)
   - **mode**: BigQuery mode such as `NULLABLE`, `REQUIRED`, and `REPEATED` (string, default: `NULLABLE`)
   - **fields**: Describes the nested schema fields if the type property is set to RECORD. Please note that this is **required** for `RECORD` column.
   - **timestamp_format**: timestamp format to convert into/from `timestamp` (string, default is `default_timestamp_format`)
-  - **timezone**: timezone to convert into/from `timestamp` (string, default is `default_timezone`).
+  - **timezone**: timezone to convert into/from `timestamp`, `date` (string, default is `default_timezone`).
 - **default_timestamp_format**: default timestamp format for column_options (string, default is "%Y-%m-%d %H:%M:%S.%6N")
 - **default_timezone**: default timezone for column_options (string, default is "UTC")
 
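The `DATE` conversions added above pair with the 0.6.3 DATE type converter. A sketch of `column_options` that uses them (column names and the timezone are illustrative):

```yaml
out:
  type: bigquery
  column_options:
    - {name: created_at, type: TIMESTAMP, timestamp_format: "%Y-%m-%d %H:%M:%S.%6N", timezone: "Asia/Tokyo"}
    - {name: birthday,   type: DATE}                 # string or timestamp input converted to DATE
    - {name: is_active,  type: BOOLEAN, mode: REQUIRED}
  default_timezone: "UTC"
```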
@@ -355,22 +369,6 @@ out:
   payload_column_index: 0 # or, payload_column: payload
 ```
 
-### Prevent Duplication
-
-`prevent_duplicate_insert` option is used to prevent inserting same data for modes `append` or `append_direct`.
-
-When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options.
-
-`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
-
-[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency) so that same data can't be inserted with same settings repeatedly.
-
-```yaml
-out:
-  type: bigquery
-  prevent_duplicate_insert: true
-```
-
 ### GCS Bucket
 
 This is useful to reduce number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs).
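
The GCS Bucket section continues beyond this hunk; for reference, loading through GCS is enabled with the `gcs_bucket` option mentioned in the Location section above (the bucket name is a placeholder):

```yaml
out:
  type: bigquery
  gcs_bucket: your_bucket_name   # must be in the same region as the dataset
```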
@@ -401,32 +399,31 @@ To load into a partition, specify `table` parameter with a partition decorator a
 out:
   type: bigquery
   table: table_name$20160929
-  auto_create_table: true
 ```
 
-You may configure `time_partitioning` parameter together to create table via `auto_create_table: true` option as:
+You may configure `time_partitioning` parameter together as:
 
 ```yaml
 out:
   type: bigquery
   table: table_name$20160929
-  auto_create_table: true
   time_partitioning:
     type: DAY
     expiration_ms: 259200000
 ```
 
 You can also create column-based partitioning table as:
+
 ```yaml
 out:
   type: bigquery
   mode: replace
-  auto_create_table: true
   table: table_name
   time_partitioning:
     type: DAY
     field: timestamp
 ```
+
 Note the `time_partitioning.field` should be top-level `DATE` or `TIMESTAMP`.
 
 Use [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table, embulk-output-bigquery itself does not support it, though.
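
Since `table` also accepts strftime placeholders (see Table id formatting above), a partition decorator can be combined with them; a sketch assuming daily loads (the table name is illustrative):

```yaml
out:
  type: bigquery
  mode: append
  table: table_name$%Y%m%d    # resolves to e.g. table_name$20160929 at runtime
  time_partitioning:
    type: DAY
```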
data/embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name          = "embulk-output-bigquery"
-  spec.version       = "0.4.14"
+  spec.version       = "0.6.3"
   spec.authors       = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary       = "Google BigQuery output plugin for Embulk"
   spec.description   = "Embulk plugin that insert records to Google BigQuery."
@@ -8,11 +8,18 @@ Gem::Specification.new do |spec|
   spec.licenses      = ["MIT"]
   spec.homepage      = "https://github.com/embulk/embulk-output-bigquery"
 
-  spec.files         = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
+  # Exclude the example directory, which uses symlinks, from the generated gem.
+  # Symlinks do not work properly on the Windows platform without administrator privilege.
+  spec.files         = `git ls-files`.split("\n") + Dir["classpath/*.jar"] - Dir["example/*"]
   spec.test_files    = spec.files.grep(%r{^(test|spec)/})
   spec.require_paths = ["lib"]
 
-  spec.add_dependency 'google-api-client'
+  # TODO
+  # signet 0.12.0 and google-api-client 0.33.0 require Ruby >= 2.4.
+  # Embulk 0.9 uses JRuby 9.1.X.Y, which is compatible with Ruby 2.3.
+  # So, force installing signet < 0.12 and google-api-client < 0.33.0.
+  spec.add_dependency 'signet', '~> 0.7', '< 0.12.0'
+  spec.add_dependency 'google-api-client', '< 0.33.0'
   spec.add_dependency 'time_with_zone'
 
   spec.add_development_dependency 'bundler', ['>= 1.10.6']
data/lib/embulk/output/bigquery.rb CHANGED
@@ -23,7 +23,7 @@ module Embulk
       # @return JSON string
       def self.load(v)
         if v.is_a?(String) # path
-          File.read(v)
+          File.read(File.expand_path(v))
         elsif v.is_a?(Hash)
           v['content']
         end
@@ -33,9 +33,7 @@ module Embulk
      def self.configure(config, schema, task_count)
        task = {
          'mode' => config.param('mode', :string, :default => 'append'),
-         'auth_method' => config.param('auth_method', :string, :default => 'private_key'),
-         'service_account_email' => config.param('service_account_email', :string, :default => nil),
-         'p12_keyfile' => config.param('p12_keyfile', :string, :default => nil),
+         'auth_method' => config.param('auth_method', :string, :default => 'application_default'),
          'json_keyfile' => config.param('json_keyfile', LocalFile, :default => nil),
          'project' => config.param('project', :string, :default => nil),
          'dataset' => config.param('dataset', :string),
@@ -45,7 +43,7 @@ module Embulk
          'table_old' => config.param('table_old', :string, :default => nil),
          'table_name_old' => config.param('table_name_old', :string, :default => nil), # lower version compatibility
          'auto_create_dataset' => config.param('auto_create_dataset', :bool, :default => false),
-         'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
+         'auto_create_table' => config.param('auto_create_table', :bool, :default => true),
          'schema_file' => config.param('schema_file', :string, :default => nil),
          'template_table' => config.param('template_table', :string, :default => nil),
 
@@ -53,7 +51,6 @@ module Embulk
          'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
          'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
          'is_skip_job_result_check' => config.param('is_skip_job_result_check', :bool, :default => false),
-         'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
          'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
          'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
          'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),
@@ -105,10 +102,14 @@ module Embulk
          raise ConfigError.new "`mode` must be one of append, append_direct, replace, delete_in_advance, replace_backup"
        end
 
+       if %w[append replace delete_in_advance replace_backup].include?(task['mode']) and !task['auto_create_table']
+         raise ConfigError.new "`mode: #{task['mode']}` requires `auto_create_table: true`"
+       end
+
        if task['mode'] == 'replace_backup'
          task['table_old'] ||= task['table_name_old'] # for lower version compatibility
          if task['dataset_old'].nil? and task['table_old'].nil?
-           raise ConfigError.new "`mode replace_backup` requires either of `dataset_old` or `table_old`"
+           raise ConfigError.new "`mode: replace_backup` requires either of `dataset_old` or `table_old`"
          end
          task['dataset_old'] ||= task['dataset']
          task['table_old'] ||= task['table']
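
The new guard above turns what used to be silent table creation into an explicit configuration error. A sketch of a config that now fails fast:

```yaml
# Raises ConfigError: "`mode: replace` requires `auto_create_table: true`"
out:
  type: bigquery
  mode: replace
  auto_create_table: false   # no longer allowed for append/replace/replace_backup/delete_in_advance
  dataset: your_dataset_name
  table: your_table_name
```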
@@ -122,28 +123,21 @@ module Embulk
        end
 
        task['auth_method'] = task['auth_method'].downcase
-       unless %w[private_key json_key compute_engine application_default].include?(task['auth_method'])
-         raise ConfigError.new "`auth_method` must be one of private_key, json_key, compute_engine, application_default"
-       end
-       if task['auth_method'] == 'private_key' and task['p12_keyfile'].nil?
-         raise ConfigError.new "`p12_keyfile` is required for auth_method private_key"
+       unless %w[json_key service_account authorized_user compute_engine application_default].include?(task['auth_method'])
+         raise ConfigError.new "`auth_method` must be one of service_account (or json_key), authorized_user, compute_engine, application_default"
        end
-       if task['auth_method'] == 'json_key' and task['json_keyfile'].nil?
-         raise ConfigError.new "`json_keyfile` is required for auth_method json_key"
+       if (task['auth_method'] == 'service_account' or task['auth_method'] == 'json_key') and task['json_keyfile'].nil?
+         raise ConfigError.new "`json_keyfile` is required for auth_method: service_account (or json_key)"
        end
 
-       jsonkey_params = nil
        if task['json_keyfile']
          begin
-           jsonkey_params = JSON.parse(task['json_keyfile'])
+           json_key = JSON.parse(task['json_keyfile'])
+           task['project'] ||= json_key['project_id']
          rescue => e
            raise ConfigError.new "json_keyfile is not a JSON file"
          end
        end
-
-       if jsonkey_params
-         task['project'] ||= jsonkey_params['project_id']
-       end
        if task['project'].nil?
          raise ConfigError.new "Required field \"project\" is not set"
        end
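
With the parsing above, `project` can be omitted whenever the service-account JSON carries a `project_id`; a sketch with dummy key contents:

```yaml
out:
  type: bigquery
  auth_method: service_account
  json_keyfile:
    content: |
      {
        "project_id": "your-project-000",
        "private_key_id": "123456789",
        "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
        "client_email": "..."
      }
  # project: is not needed; it is taken from project_id in the key above
  dataset: your_dataset_name
  table: your_table_name
```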
@@ -306,42 +300,18 @@ module Embulk
 
        case task['mode']
        when 'delete_in_advance'
-         bigquery.delete_partition(task['table'])
+         bigquery.delete_table_or_partition(task['table'])
          bigquery.create_table_if_not_exists(task['table'])
        when 'replace'
          bigquery.create_table_if_not_exists(task['temp_table'])
-         if Helper.has_partition_decorator?(task['table'])
-           if task['auto_create_table']
-             bigquery.create_table_if_not_exists(task['table'])
-           else
-             bigquery.get_table(task['table']) # raises NotFoundError
-           end
-         end
+         bigquery.create_table_if_not_exists(task['table']) # needs for when task['table'] is a partition
        when 'append'
          bigquery.create_table_if_not_exists(task['temp_table'])
-         if Helper.has_partition_decorator?(task['table'])
-           if task['auto_create_table']
-             bigquery.create_table_if_not_exists(task['table'])
-           else
-             bigquery.get_table(task['table']) # raises NotFoundError
-           end
-         end
+         bigquery.create_table_if_not_exists(task['table']) # needs for when task['table'] is a partition
        when 'replace_backup'
          bigquery.create_table_if_not_exists(task['temp_table'])
-         if Helper.has_partition_decorator?(task['table'])
-           if task['auto_create_table']
-             bigquery.create_table_if_not_exists(task['table'])
-           else
-             bigquery.get_table(task['table']) # raises NotFoundError
-           end
-         end
-         if Helper.has_partition_decorator?(task['table_old'])
-           if task['auto_create_table']
-             bigquery.create_table_if_not_exists(task['table_old'], dataset: task['dataset_old'])
-           else
-             bigquery.get_table(task['table_old'], dataset: task['dataset_old']) # raises NotFoundError
-           end
-         end
+         bigquery.create_table_if_not_exists(task['table'])
+         bigquery.create_table_if_not_exists(task['table_old'], dataset: task['dataset_old']) # needs for when a partition
        else # append_direct
          if task['auto_create_table']
            bigquery.create_table_if_not_exists(task['table'])
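
The rename to `delete_table_or_partition` means `delete_in_advance` now handles a whole table and a single partition uniformly; a sketch of the partition case (the decorator value is illustrative):

```yaml
out:
  type: bigquery
  mode: delete_in_advance
  table: table_name$20160929   # only this partition is deleted and recreated before loading
```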