embulk-output-bigquery 0.4.14 → 0.6.3

Files changed (61)
  1. checksums.yaml +5 -5
  2. data/CHANGELOG.md +28 -0
  3. data/README.md +74 -77
  4. data/embulk-output-bigquery.gemspec +10 -3
  5. data/lib/embulk/output/bigquery.rb +19 -49
  6. data/lib/embulk/output/bigquery/auth.rb +35 -0
  7. data/lib/embulk/output/bigquery/bigquery_client.rb +2 -11
  8. data/lib/embulk/output/bigquery/google_client.rb +3 -34
  9. data/lib/embulk/output/bigquery/value_converter_factory.rb +12 -0
  10. data/test/test_bigquery_client.rb +1 -5
  11. data/test/test_configure.rb +10 -19
  12. data/test/test_example.rb +0 -1
  13. data/test/test_helper.rb +4 -1
  14. data/test/test_transaction.rb +22 -62
  15. data/test/test_value_converter_factory.rb +42 -0
  16. metadata +29 -52
  17. data/example/config_append_direct_schema_update_options.yml +0 -31
  18. data/example/config_client_options.yml +0 -33
  19. data/example/config_csv.yml +0 -30
  20. data/example/config_delete_in_advance.yml +0 -29
  21. data/example/config_delete_in_advance_field_partitioned_table.yml +0 -33
  22. data/example/config_delete_in_advance_partitioned_table.yml +0 -33
  23. data/example/config_expose_errors.yml +0 -30
  24. data/example/config_gcs.yml +0 -32
  25. data/example/config_guess_from_embulk_schema.yml +0 -29
  26. data/example/config_guess_with_column_options.yml +0 -40
  27. data/example/config_gzip.yml +0 -1
  28. data/example/config_jsonl.yml +0 -1
  29. data/example/config_max_threads.yml +0 -34
  30. data/example/config_min_ouput_tasks.yml +0 -34
  31. data/example/config_mode_append.yml +0 -30
  32. data/example/config_mode_append_direct.yml +0 -30
  33. data/example/config_nested_record.yml +0 -1
  34. data/example/config_payload_column.yml +0 -20
  35. data/example/config_payload_column_index.yml +0 -20
  36. data/example/config_prevent_duplicate_insert.yml +0 -30
  37. data/example/config_progress_log_interval.yml +0 -31
  38. data/example/config_replace.yml +0 -30
  39. data/example/config_replace_backup.yml +0 -32
  40. data/example/config_replace_backup_field_partitioned_table.yml +0 -34
  41. data/example/config_replace_backup_partitioned_table.yml +0 -34
  42. data/example/config_replace_field_partitioned_table.yml +0 -33
  43. data/example/config_replace_partitioned_table.yml +0 -33
  44. data/example/config_replace_schema_update_options.yml +0 -33
  45. data/example/config_skip_file_generation.yml +0 -32
  46. data/example/config_table_strftime.yml +0 -30
  47. data/example/config_template_table.yml +0 -21
  48. data/example/config_uncompressed.yml +0 -1
  49. data/example/config_with_rehearsal.yml +0 -33
  50. data/example/example.csv +0 -17
  51. data/example/example.yml +0 -1
  52. data/example/example2_1.csv +0 -1
  53. data/example/example2_2.csv +0 -1
  54. data/example/example4_1.csv +0 -1
  55. data/example/example4_2.csv +0 -1
  56. data/example/example4_3.csv +0 -1
  57. data/example/example4_4.csv +0 -1
  58. data/example/json_key.json +0 -12
  59. data/example/nested_example.jsonl +0 -16
  60. data/example/schema.json +0 -30
  61. data/example/schema_expose_errors.json +0 -30
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA256:
-   metadata.gz: 4fb376f288bfa86d632d727b3d0770ca4b94e364261c3f87a2569c801ee2fa00
-   data.tar.gz: 2571a07afb9aac0774e0744f9d5118712bb83f44f82470dd4fd25bf515c7b9fa
+ SHA1:
+   metadata.gz: 8b3d7d7d675d8428946f81517d1002f667f4fafe
+   data.tar.gz: 25940b93f70492675869d3c4dd50f83f8b7347cf
  SHA512:
-   metadata.gz: 15f71decc69d34d8fbc3ee09452a6307107b71f759820b8a0521c6473b2231c4706febf216b59baae0e18fc3a06a056c18552d1093f0ac264ef84183a6d27992
-   data.tar.gz: 7ee57f82766927cb804bf0d88550f7f3e4d0459315160a0eec98ccd4c00e2a2423a093cffd17e836d2dba8461cbc2ae4e227ff85d60c7c9628d32b1fd142b7eb
+   metadata.gz: 97a2aff66c765f24289717ac79e0a25a6bf31ee3ec5b84b64c96e8573382b31b0a27c30f06692a296b3bfedd70ea9f34f1a451cea7de27d3fa4c61a7502bab98
+   data.tar.gz: b795d47af337e109dfafb9f41a0a720d0eb314c7ba7219193648505ec9dffa3874215b5d311256f625228a4f3e52b73153ee3d694a3d2f88d4c2fd0dd24960b1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,31 @@
+ ## 0.6.3 - 2019-10-28
+
+ * [enhancement] Add DATE type converter (thanks to @tksfjt1024)
+
+ ## 0.6.2 - 2019-10-16
+
+ * [maintenance] Lock signet and google-api-client versions (thanks to @hiroyuki-sato)
+
+ ## 0.6.1 - 2019-08-28
+
+ * [maintenance] Release a new gem that does not include symlinks, to make it work on Windows.
+
+ ## 0.6.0 - 2019-08-11
+
+ Cleanup `auth_method`:
+
+ * [enhancement] Support `auth_method: authorized_user` (OAuth)
+ * [incompatibility change] Rename `auth_method: json_key` to `auth_method: service_account` (`json_key` is kept for backward compatibility)
+ * [incompatibility change] Remove deprecated `auth_method: private_key` (p12 key)
+ * [incompatibility change] Change the default `auth_method` to `application_default` from `private_key` because `private_key` was dropped.
+
+ ## 0.5.0 - 2019-08-10
+
+ * [incompatibility change] Drop deprecated `time_partitioning`.`require_partition_filter`
+ * [incompatibility change] Drop `prevent_duplicate_insert`, which has no use case now
+ * [incompatibility change] Modes `replace`, `replace_backup`, `append`, and `delete_in_advance` now require `auto_create_table: true` because these modes previously created a target table even with `auto_create_table: false`, which confused users. Note that `auto_create_table: true` is always required even for a partition (a table name with a partition decorator), which may not require creating a table. This keeps the logic and implementation simple.
+ * [incompatibility change] Change the default value of `auto_create_table` to `true` because the above four modes, i.e. all modes except `append_direct`, now always require `auto_create_table: true`.
+
  ## 0.4.14 - 2019-08-10

  * [enhancement] Support field partitioning correctly.
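The 0.6.3 entry adds a `DATE` converter for string and timestamp columns. As a minimal, hypothetical sketch (the column name `created_on` and the timezone are placeholders, and the surrounding project/dataset/table options are omitted), a timestamp column can be written as a BigQuery `DATE` through `column_options`, with `timezone` controlling which local date it falls on:

```yaml
out:
  type: bigquery
  column_options:
    # convert an input timestamp column to a BigQuery DATE column (placeholder names)
    - {name: created_on, type: DATE, timezone: "Asia/Tokyo"}
```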
data/README.md CHANGED
@@ -23,34 +23,23 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
  Current version of this plugin supports Google API with Service Account Authentication, but does not support
  OAuth flow for installed applications.

- ### INCOMPATIBILITY CHANGES
-
- v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
-
- * `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead (it already exists in v0.2.x too).
- * `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
- * `mode: append` now expresses a transactional append, and `mode: append_direct` is the one which is not transactional.
-
  ## Configuration

  #### Original options

  | name | type | required? | default | description |
  |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------|
- | mode | string | optional | "append" | See [Mode](#mode) |
- | auth_method | string | optional | "private_key" | `private_key`, `json_key` or `compute_engine` |
- | service_account_email | string | required when auth_method is private_key | | Your Google service account email |
- | p12_keyfile | string | required when auth_method is private_key | | Full path of private key in P12 (PKCS12) format |
- | json_keyfile | string | required when auth_method is json_key | | Full path of json key |
- | project | string | required if json_keyfile is not given | | project_id |
+ | mode | string | optional | "append" | See [Mode](#mode) |
+ | auth_method | string | optional | "application\_default" | See [Authentication](#authentication) |
+ | json_keyfile | string | optional | | keyfile path or `content` |
+ | project | string | required unless service\_account's `json_keyfile` is given | | project\_id |
  | dataset | string | required | | dataset |
  | location | string | optional | nil | geographic location of dataset. See [Location](#location) |
  | table | string | required | | table name, or table name with a partition decorator such as `table_name$20160929` |
  | auto_create_dataset | boolean | optional | false | automatically create dataset |
- | auto_create_table | boolean | optional | false | See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
+ | auto_create_table | boolean | optional | true | `false` is available only for `append_direct` mode. Other modes require `true`. See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
  | schema_file | string | optional | | /path/to/schema.json |
  | template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) |
- | prevent_duplicate_insert | boolean | optional | false | See [Prevent Duplication](#prevent-duplication) |
  | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
  | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
  | is_skip_job_result_check | boolean | optional | false | Skip waiting until the Load job finishes. Available for append, or delete_in_advance mode |
@@ -107,7 +96,6 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
  | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
  | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
  | time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning |
- | time_partitioning.require_partition_filter | boolean | optional | nil | If true, a valid partition filter is required when querying |
  | clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so it must be used with the `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
  | clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
  | schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for the `copy` job, that is, it is not effective for most modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes the origin table, so it does not need to update the schema. Only `append_direct` can utilize schema update. |
@@ -118,9 +106,8 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
  out:
    type: bigquery
    mode: append
-   auth_method: private_key # default
-   service_account_email: ABCXYZ123ABCXYZ123.gserviceaccount.com
-   p12_keyfile: /path/to/p12_keyfile.p12
+   auth_method: service_account
+   json_keyfile: /path/to/json_keyfile.json
    project: your-project-000
    dataset: your_dataset_name
    table: your_table_name
@@ -128,7 +115,7 @@ out:
    source_format: NEWLINE_DELIMITED_JSON
  ```

- ### location
+ ### Location

  The geographic location of the dataset. Required except for US and EU.

@@ -136,7 +123,7 @@ GCS bucket should be in same region when you use `gcs_bucket`.

  See also [Dataset Locations | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/dataset-locations)

- ### mode
+ ### Mode

  5 modes are provided.

@@ -175,53 +162,69 @@ NOTE: BigQuery does not support replacing (actually, copying into) a non-partiti

  ### Authentication

- There are three methods supported to fetch access token for the service account.
+ There are four authentication methods
+
+ 1. `service_account` (or `json_key` for backward compatibility)
+ 1. `authorized_user`
+ 1. `compute_engine`
+ 1. `application_default`
+
+ #### service\_account (or json\_key)

- 1. Public-Private key pair of GCP(Google Cloud Platform)'s service account
- 2. JSON key of GCP(Google Cloud Platform)'s service account
- 3. Pre-defined access token (Google Compute Engine only)
+ Use GCP service account credentials.
+ You first need to create a service account, download its json key and deploy the key with embulk.

- #### Public-Private key pair of GCP's service account
+ ```yaml
+ out:
+   type: bigquery
+   auth_method: service_account
+   json_keyfile: /path/to/json_keyfile.json
+ ```

- You first need to create a service account (client ID),
- download its private key and deploy the key with embulk.
+ You can also embed contents of `json_keyfile` at config.yml.

  ```yaml
  out:
    type: bigquery
-   auth_method: private_key # default
-   service_account_email: ABCXYZ123ABCXYZ123.gserviceaccount.com
-   p12_keyfile: /path/to/p12_keyfile.p12
+   auth_method: service_account
+   json_keyfile:
+     content: |
+       {
+         "private_key_id": "123456789",
+         "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
+         "client_email": "..."
+       }
  ```

- #### JSON key of GCP's service account
+ #### authorized\_user

- You first need to create a service account (client ID),
- download its json key and deploy the key with embulk.
+ Use Google user credentials.
+ You can get your credentials at `~/.config/gcloud/application_default_credentials.json` by running `gcloud auth login`.

  ```yaml
  out:
    type: bigquery
-   auth_method: json_key
-   json_keyfile: /path/to/json_keyfile.json
+   auth_method: authorized_user
+   json_keyfile: /path/to/credentials.json
  ```

- You can also embed contents of json_keyfile at config.yml.
+ You can also embed contents of `json_keyfile` at config.yml.

  ```yaml
  out:
    type: bigquery
-   auth_method: json_key
+   auth_method: authorized_user
    json_keyfile:
      content: |
        {
-         "private_key_id": "123456789",
-         "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
-         "client_email": "..."
-       }
+         "client_id":"xxxxxxxxxxx.apps.googleusercontent.com",
+         "client_secret":"xxxxxxxxxxx",
+         "refresh_token":"xxxxxxxxxxx",
+         "type":"authorized_user"
+       }
  ```

- #### Pre-defined access token(GCE only)
+ #### compute\_engine

  On the other hand, you don't need to explicitly create a service account for embulk when you
  run embulk in Google Compute Engine. In this third authentication method, you need to
@@ -234,6 +237,22 @@ out:
    auth_method: compute_engine
  ```

+ #### application\_default
+
+ Use Application Default Credentials (ADC). ADC is a strategy to locate Google Cloud Service Account credentials.
+
+ 1. ADC checks whether the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set. If the variable is set, ADC uses the service account file that the variable points to.
+ 2. ADC checks whether `~/.config/gcloud/application_default_credentials.json` exists. This file is created by running `gcloud auth application-default login`.
+ 3. Otherwise, ADC uses the default service account for credentials if the application is running on Compute Engine, App Engine, Kubernetes Engine, Cloud Functions or Cloud Run.
+
+ See https://cloud.google.com/docs/authentication/production for details.
+
+ ```yaml
+ out:
+   type: bigquery
+   auth_method: application_default
+ ```
+
  ### Table id formatting

  The `table` option accepts a [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
@@ -242,21 +261,16 @@ Table ids are formatted at runtime
  using the local time of the embulk server.

  For example, with the configuration below,
- data is inserted into tables `table_2015_04`, `table_2015_05` and so on.
+ data is inserted into tables `table_20150503`, `table_20150504` and so on.

  ```yaml
  out:
    type: bigquery
-   table: table_%Y_%m
+   table: table_%Y%m%d
  ```

  ### Dynamic table creating

- This plugin tries to create a table using BigQuery API when
-
- * mode is either of `delete_in_advance`, `replace`, `replace_backup`, `append`.
- * mode is `append_direct` and `auto_create_table` is true.
-
  There are 3 ways to set schema.

  #### Set schema.json
@@ -267,7 +281,7 @@ Please set file path of schema.json.
  out:
    type: bigquery
    auto_create_table: true
-   table: table_%Y_%m
+   table: table_%Y%m%d
    schema_file: /path/to/schema.json
  ```

@@ -279,7 +293,7 @@ Plugin will try to read schema from existing table and use it as schema template
  out:
    type: bigquery
    auto_create_table: true
-   table: table_%Y_%m
+   table: table_%Y%m%d
    template_table: existing_table_name
  ```

@@ -293,17 +307,17 @@ Column options are used to aid guessing BigQuery schema, or to define conversion

  - **column_options**: advanced: an array of options for columns
    - **name**: column name
-   - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, and `RECORD`. See below for supported conversion types.
+   - **type**: BigQuery type such as `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE`, and `RECORD`. See below for supported conversion types.
      - boolean: `BOOLEAN`, `STRING` (default: `BOOLEAN`)
      - long: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `INTEGER`)
      - double: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `FLOAT`)
-     - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `RECORD` (default: `STRING`)
-     - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP` (default: `TIMESTAMP`)
+     - string: `BOOLEAN`, `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE`, `RECORD` (default: `STRING`)
+     - timestamp: `INTEGER`, `FLOAT`, `STRING`, `TIMESTAMP`, `DATE` (default: `TIMESTAMP`)
      - json: `STRING`, `RECORD` (default: `STRING`)
    - **mode**: BigQuery mode such as `NULLABLE`, `REQUIRED`, and `REPEATED` (string, default: `NULLABLE`)
    - **fields**: Describes the nested schema fields if the type property is set to RECORD. Please note that this is **required** for `RECORD` column.
    - **timestamp_format**: timestamp format to convert into/from `timestamp` (string, default is `default_timestamp_format`)
-   - **timezone**: timezone to convert into/from `timestamp` (string, default is `default_timezone`).
+   - **timezone**: timezone to convert into/from `timestamp`, `date` (string, default is `default_timezone`).
  - **default_timestamp_format**: default timestamp format for column_options (string, default is "%Y-%m-%d %H:%M:%S.%6N")
  - **default_timezone**: default timezone for column_options (string, default is "UTC")

@@ -355,22 +369,6 @@ out:
    payload_column_index: 0 # or, payload_column: payload
  ```

- ### Prevent Duplication
-
- The `prevent_duplicate_insert` option is used to prevent inserting the same data twice for modes `append` or `append_direct`.
-
- When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generates a job ID from the md5 hash of the file and other options.
-
- `job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
-
- [A job ID must be unique (including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency), so the same data can't be inserted repeatedly with the same settings.
-
- ```yaml
- out:
-   type: bigquery
-   prevent_duplicate_insert: true
- ```
-
  ### GCS Bucket

  This is useful to reduce the number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs).
@@ -401,32 +399,31 @@ To load into a partition, specify `table` parameter with a partition decorator a
  out:
    type: bigquery
    table: table_name$20160929
-   auto_create_table: true
  ```

- You may configure `time_partitioning` parameter together to create table via `auto_create_table: true` option as:
+ You may configure `time_partitioning` parameter together as:

  ```yaml
  out:
    type: bigquery
    table: table_name$20160929
-   auto_create_table: true
    time_partitioning:
      type: DAY
      expiration_ms: 259200000
  ```

  You can also create column-based partitioning table as:
+
  ```yaml
  out:
    type: bigquery
    mode: replace
-   auto_create_table: true
    table: table_name
    time_partitioning:
      type: DAY
      field: timestamp
  ```
+
  Note the `time_partitioning.field` should be top-level `DATE` or `TIMESTAMP`.

  Use [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table, embulk-output-bigquery itself does not support it, though.
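The updated options table above notes that `clustering` must be combined with `time_partitioning`, and that `time_partitioning.field` must be a top-level `DATE` or `TIMESTAMP` column. A minimal sketch combining the two (the `event_time` and `user_id` column names are placeholders):

```yaml
out:
  type: bigquery
  mode: replace
  table: table_name
  time_partitioning:
    type: DAY          # DAY is the only supported type
    field: event_time  # top-level DATE or TIMESTAMP column (placeholder name)
  clustering:
    fields:
      - user_id        # clustering/sort key (placeholder name)
```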
data/embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
  Gem::Specification.new do |spec|
    spec.name = "embulk-output-bigquery"
-   spec.version = "0.4.14"
+   spec.version = "0.6.3"
    spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
    spec.summary = "Google BigQuery output plugin for Embulk"
    spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -8,11 +8,18 @@ Gem::Specification.new do |spec|
    spec.licenses = ["MIT"]
    spec.homepage = "https://github.com/embulk/embulk-output-bigquery"

-   spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
+   # Exclude the example directory, which uses symlinks, from the generated gem.
+   # Symlinks do not work properly on the Windows platform without administrator privilege.
+   spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] - Dir["example/*"]
    spec.test_files = spec.files.grep(%r{^(test|spec)/})
    spec.require_paths = ["lib"]

-   spec.add_dependency 'google-api-client'
+   # TODO
+   # signet 0.12.0 and google-api-client 0.33.0 require Ruby >= 2.4.
+   # Embulk 0.9 uses JRuby 9.1.x.y, which is compatible with Ruby 2.3.
+   # So, force signet < 0.12 and google-api-client < 0.33.0 to be installed.
+   spec.add_dependency 'signet', '~> 0.7', '< 0.12.0'
+   spec.add_dependency 'google-api-client', '< 0.33.0'
    spec.add_dependency 'time_with_zone'

    spec.add_development_dependency 'bundler', ['>= 1.10.6']
data/lib/embulk/output/bigquery.rb CHANGED
@@ -23,7 +23,7 @@ module Embulk
    # @return JSON string
    def self.load(v)
      if v.is_a?(String) # path
-       File.read(v)
+       File.read(File.expand_path(v))
      elsif v.is_a?(Hash)
        v['content']
      end
@@ -33,9 +33,7 @@ module Embulk
    def self.configure(config, schema, task_count)
      task = {
        'mode' => config.param('mode', :string, :default => 'append'),
-       'auth_method' => config.param('auth_method', :string, :default => 'private_key'),
-       'service_account_email' => config.param('service_account_email', :string, :default => nil),
-       'p12_keyfile' => config.param('p12_keyfile', :string, :default => nil),
+       'auth_method' => config.param('auth_method', :string, :default => 'application_default'),
        'json_keyfile' => config.param('json_keyfile', LocalFile, :default => nil),
        'project' => config.param('project', :string, :default => nil),
        'dataset' => config.param('dataset', :string),
@@ -45,7 +43,7 @@ module Embulk
        'table_old' => config.param('table_old', :string, :default => nil),
        'table_name_old' => config.param('table_name_old', :string, :default => nil), # lower version compatibility
        'auto_create_dataset' => config.param('auto_create_dataset', :bool, :default => false),
-       'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
+       'auto_create_table' => config.param('auto_create_table', :bool, :default => true),
        'schema_file' => config.param('schema_file', :string, :default => nil),
        'template_table' => config.param('template_table', :string, :default => nil),

@@ -53,7 +51,6 @@ module Embulk
        'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
        'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
        'is_skip_job_result_check' => config.param('is_skip_job_result_check', :bool, :default => false),
-       'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
        'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
        'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
        'abort_on_error' => config.param('abort_on_error', :bool, :default => nil),
@@ -105,10 +102,14 @@ module Embulk
        raise ConfigError.new "`mode` must be one of append, append_direct, replace, delete_in_advance, replace_backup"
      end

+     if %w[append replace delete_in_advance replace_backup].include?(task['mode']) and !task['auto_create_table']
+       raise ConfigError.new "`mode: #{task['mode']}` requires `auto_create_table: true`"
+     end
+
      if task['mode'] == 'replace_backup'
        task['table_old'] ||= task['table_name_old'] # for lower version compatibility
        if task['dataset_old'].nil? and task['table_old'].nil?
-         raise ConfigError.new "`mode replace_backup` requires either of `dataset_old` or `table_old`"
+         raise ConfigError.new "`mode: replace_backup` requires either of `dataset_old` or `table_old`"
        end
        task['dataset_old'] ||= task['dataset']
        task['table_old'] ||= task['table']
@@ -122,28 +123,21 @@ module Embulk
      end

      task['auth_method'] = task['auth_method'].downcase
-     unless %w[private_key json_key compute_engine application_default].include?(task['auth_method'])
-       raise ConfigError.new "`auth_method` must be one of private_key, json_key, compute_engine, application_default"
-     end
-     if task['auth_method'] == 'private_key' and task['p12_keyfile'].nil?
-       raise ConfigError.new "`p12_keyfile` is required for auth_method private_key"
+     unless %w[json_key service_account authorized_user compute_engine application_default].include?(task['auth_method'])
+       raise ConfigError.new "`auth_method` must be one of service_account (or json_key), authorized_user, compute_engine, application_default"
      end
-     if task['auth_method'] == 'json_key' and task['json_keyfile'].nil?
-       raise ConfigError.new "`json_keyfile` is required for auth_method json_key"
+     if (task['auth_method'] == 'service_account' or task['auth_method'] == 'json_key') and task['json_keyfile'].nil?
+       raise ConfigError.new "`json_keyfile` is required for auth_method: service_account (or json_key)"
      end

-     jsonkey_params = nil
      if task['json_keyfile']
        begin
-         jsonkey_params = JSON.parse(task['json_keyfile'])
+         json_key = JSON.parse(task['json_keyfile'])
+         task['project'] ||= json_key['project_id']
        rescue => e
          raise ConfigError.new "json_keyfile is not a JSON file"
        end
      end
-
-     if jsonkey_params
-       task['project'] ||= jsonkey_params['project_id']
-     end
      if task['project'].nil?
        raise ConfigError.new "Required field \"project\" is not set"
      end
@@ -306,42 +300,18 @@ module Embulk

      case task['mode']
      when 'delete_in_advance'
-       bigquery.delete_partition(task['table'])
+       bigquery.delete_table_or_partition(task['table'])
        bigquery.create_table_if_not_exists(task['table'])
      when 'replace'
        bigquery.create_table_if_not_exists(task['temp_table'])
-       if Helper.has_partition_decorator?(task['table'])
-         if task['auto_create_table']
-           bigquery.create_table_if_not_exists(task['table'])
-         else
-           bigquery.get_table(task['table']) # raises NotFoundError
-         end
-       end
+       bigquery.create_table_if_not_exists(task['table']) # needs for when task['table'] is a partition
      when 'append'
        bigquery.create_table_if_not_exists(task['temp_table'])
-       if Helper.has_partition_decorator?(task['table'])
-         if task['auto_create_table']
-           bigquery.create_table_if_not_exists(task['table'])
-         else
-           bigquery.get_table(task['table']) # raises NotFoundError
-         end
-       end
+       bigquery.create_table_if_not_exists(task['table']) # needs for when task['table'] is a partition
      when 'replace_backup'
        bigquery.create_table_if_not_exists(task['temp_table'])
-       if Helper.has_partition_decorator?(task['table'])
-         if task['auto_create_table']
-           bigquery.create_table_if_not_exists(task['table'])
-         else
-           bigquery.get_table(task['table']) # raises NotFoundError
-         end
-       end
-       if Helper.has_partition_decorator?(task['table_old'])
-         if task['auto_create_table']
-           bigquery.create_table_if_not_exists(task['table_old'], dataset: task['dataset_old'])
-         else
-           bigquery.get_table(task['table_old'], dataset: task['dataset_old']) # raises NotFoundError
-         end
-       end
+       bigquery.create_table_if_not_exists(task['table'])
+       bigquery.create_table_if_not_exists(task['table_old'], dataset: task['dataset_old']) # needs for when a partition
      else # append_direct
        if task['auto_create_table']
          bigquery.create_table_if_not_exists(task['table'])
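Given the validation added above, a `replace_backup` configuration now needs `auto_create_table: true` (the default) and at least one of `dataset_old` or `table_old`; a minimal sketch with placeholder names:

```yaml
out:
  type: bigquery
  mode: replace_backup
  dataset: your_dataset_name          # placeholder
  table: your_table_name              # placeholder
  dataset_old: your_backup_dataset    # defaults to `dataset` when omitted
  table_old: your_table_name_backup   # either dataset_old or table_old must be given
  auto_create_table: true             # default since 0.5.0; required for this mode
```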