embulk-output-bigquery 0.4.13 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +10 -6
- data/CHANGELOG.md +28 -0
- data/Gemfile +2 -0
- data/README.md +107 -75
- data/embulk-output-bigquery.gemspec +10 -3
- data/lib/embulk/output/bigquery.rb +31 -50
- data/lib/embulk/output/bigquery/auth.rb +35 -0
- data/lib/embulk/output/bigquery/bigquery_client.rb +24 -31
- data/lib/embulk/output/bigquery/google_client.rb +3 -34
- data/lib/embulk/output/bigquery/helper.rb +8 -4
- data/test/helper.rb +2 -1
- data/test/test_bigquery_client.rb +17 -21
- data/test/test_configure.rb +10 -19
- data/test/test_example.rb +5 -4
- data/test/test_transaction.rb +36 -76
- metadata +27 -49
- data/example/config_append_direct_schema_update_options.yml +0 -31
- data/example/config_client_options.yml +0 -33
- data/example/config_csv.yml +0 -30
- data/example/config_delete_in_advance.yml +0 -29
- data/example/config_delete_in_advance_partitioned_table.yml +0 -33
- data/example/config_expose_errors.yml +0 -30
- data/example/config_gcs.yml +0 -32
- data/example/config_guess_from_embulk_schema.yml +0 -29
- data/example/config_guess_with_column_options.yml +0 -40
- data/example/config_gzip.yml +0 -1
- data/example/config_jsonl.yml +0 -1
- data/example/config_max_threads.yml +0 -34
- data/example/config_min_ouput_tasks.yml +0 -34
- data/example/config_mode_append.yml +0 -30
- data/example/config_mode_append_direct.yml +0 -30
- data/example/config_nested_record.yml +0 -1
- data/example/config_payload_column.yml +0 -20
- data/example/config_payload_column_index.yml +0 -20
- data/example/config_prevent_duplicate_insert.yml +0 -30
- data/example/config_progress_log_interval.yml +0 -31
- data/example/config_replace.yml +0 -30
- data/example/config_replace_backup.yml +0 -32
- data/example/config_replace_backup_paritioned_table.yml +0 -34
- data/example/config_replace_paritioned_table.yml +0 -33
- data/example/config_replace_schema_update_options.yml +0 -33
- data/example/config_skip_file_generation.yml +0 -32
- data/example/config_table_strftime.yml +0 -30
- data/example/config_template_table.yml +0 -21
- data/example/config_uncompressed.yml +0 -1
- data/example/config_with_rehearsal.yml +0 -33
- data/example/example.csv +0 -17
- data/example/example.jsonl +0 -16
- data/example/example.yml +0 -1
- data/example/example2_1.csv +0 -1
- data/example/example2_2.csv +0 -1
- data/example/example4_1.csv +0 -1
- data/example/example4_2.csv +0 -1
- data/example/example4_3.csv +0 -1
- data/example/example4_4.csv +0 -1
- data/example/json_key.json +0 -12
- data/example/nested_example.jsonl +0 -16
- data/example/schema.json +0 -30
- data/example/schema_expose_errors.json +0 -30
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2168730943154d9fb8d8ebfce9e4a1c2130b16b5
|
|
4
|
+
data.tar.gz: 8c4549b91f75d3e7a874f310e0df791bd9c28030
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ae7e67855daddce745e3d9a8c5ee659d99c7e84510365bf216f63fb7ed92b9ce166cb2b005b76680020f67354a6f97cb3596d9b15490630cf1b927968e5b8f0e
|
|
7
|
+
data.tar.gz: 83531a34355dbf3af0ec602db9318ad52e0b6b9ba84b5ea1bef15f4a50b67fa69b664e0bf62deffe8f37ac62d9b995a6c5a135b69e5cf365df06d97f32264cfa
|
data/.travis.yml
CHANGED
|
@@ -1,17 +1,21 @@
|
|
|
1
1
|
language: ruby
|
|
2
2
|
matrix:
|
|
3
3
|
include:
|
|
4
|
-
- env: EMBULK_VERSION=0.8.39
|
|
5
|
-
rvm: jruby-9.1.5.0 # bundled jruby version
|
|
6
|
-
jdk: openjdk7 # embulk 0.8.x uses jdk7
|
|
7
4
|
- env: EMBULK_VERSION=0.9.15
|
|
8
|
-
rvm: jruby-9.1.
|
|
5
|
+
rvm: jruby-9.1.15.0 # bundled jruby version
|
|
9
6
|
jdk: openjdk8 # embulk 0.9.x uses jdk8
|
|
10
7
|
- env: EMBULK_VERSION=latest
|
|
11
|
-
rvm: jruby-9.1.
|
|
8
|
+
rvm: jruby-9.1.15.0 # ?
|
|
12
9
|
jdk: openjdk8 # ?
|
|
13
10
|
allow_failures:
|
|
14
11
|
- env: EMBULK_VERSION=latest
|
|
15
12
|
before_install:
|
|
16
13
|
- curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-${EMBULK_VERSION}.jar"
|
|
17
|
-
|
|
14
|
+
- chmod a+x embulk.jar
|
|
15
|
+
- BUNDLER_VERSION=$(echo "require 'bundler'; Bundler::VERSION" | ./embulk.jar irb | tail -n 2 | tr -d '"')
|
|
16
|
+
- gem uninstall bundler -x
|
|
17
|
+
- gem install bundler -v ${BUNDLER_VERSION}
|
|
18
|
+
install:
|
|
19
|
+
- ./embulk.jar bundle install --jobs=3 --retry=3 --path vendor/bundle
|
|
20
|
+
script:
|
|
21
|
+
- bundle exec env RUBYOPT="-r ./embulk.jar -r embulk -r embulk/java/bootstrap" rake test
|
data/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,31 @@
|
|
|
1
|
+
## 0.6.2 - 2019-10-16
|
|
2
|
+
|
|
3
|
+
* [maintenance] Lock signet and google-api-client version (thanks to @hiroyuki-sato)
|
|
4
|
+
|
|
5
|
+
## 0.6.1 - 2019-08-28
|
|
6
|
+
|
|
7
|
+
* [maintenance] Release a new gem not to include symlinks to make it work on Windows.
|
|
8
|
+
|
|
9
|
+
## 0.6.0 - 2019-08-11
|
|
10
|
+
|
|
11
|
+
Cleanup `auth_method`:
|
|
12
|
+
|
|
13
|
+
* [enhancement] Support `auth_method: authorized_user` (OAuth)
|
|
14
|
+
* [incompatibility change] Rename `auth_method: json_key` to `auth_method: service_account` (`json_key` is kept for backward compatibility)
|
|
15
|
+
* [incompatibility change] Remove deprecated `auth_method: private_key` (p12 key)
|
|
16
|
+
* [incompatibility change] Change the default `auth_method` to `application_default` from `private_key` because `private_key` was dropped.
|
|
17
|
+
|
|
18
|
+
## 0.5.0 - 2019-08-10
|
|
19
|
+
|
|
20
|
+
* [incompatibility change] Drop deprecated `time_partitioning`.`require_partition_filter`
|
|
21
|
+
* [incompatibility change] Drop `prevent_duplicate_insert` which has no use-case now
|
|
22
|
+
* [incompatibility change] Modes `replace`, `replace_backup`, `append`, and `delete_in_advance` require `auto_create_table: true` now because, previously, these modes had created a target table even with `auto_create_table: false` and made users being confused. Note that `auto_create_table: true` is always required even for a partition (a table name with a partition decorator) which may not require creating a table. This is for simplicity of logics and implementations.
|
|
23
|
+
* [incompatibility change] Change default value of `auto_create_table` to `true` because the above 4 modes, that is, except `append_direct` always require `auto_create_table: true` now.
|
|
24
|
+
|
|
25
|
+
## 0.4.14 - 2019-08-10
|
|
26
|
+
|
|
27
|
+
* [enhancement] Support field partitioning correctly.
|
|
28
|
+
|
|
1
29
|
## 0.4.13 - 2019-03-20
|
|
2
30
|
|
|
3
31
|
* [enhancement] Support clustered table as an experimental feature
|
data/Gemfile
CHANGED
data/README.md
CHANGED
|
@@ -23,34 +23,23 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
|
|
|
23
23
|
Current version of this plugin supports Google API with Service Account Authentication, but does not support
|
|
24
24
|
OAuth flow for installed applications.
|
|
25
25
|
|
|
26
|
-
### INCOMPATIBILITY CHANGES
|
|
27
|
-
|
|
28
|
-
v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
|
|
29
|
-
|
|
30
|
-
* `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead. (it already exists in v0.2.x too)
|
|
31
|
-
* `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
|
|
32
|
-
* `mode: append` mode now expresses a transactional append, and `mode: append_direct` is one which is not transactional.
|
|
33
|
-
|
|
34
26
|
## Configuration
|
|
35
27
|
|
|
36
28
|
#### Original options
|
|
37
29
|
|
|
38
30
|
| name | type | required? | default | description |
|
|
39
31
|
|:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------|
|
|
40
|
-
| mode | string | optional | "append" | See [Mode](#mode)
|
|
41
|
-
| auth_method | string | optional | "
|
|
42
|
-
|
|
|
43
|
-
|
|
|
44
|
-
| json_keyfile | string | required when auth_method is json_key | | Fullpath of json key |
|
|
45
|
-
| project | string | required if json_keyfile is not given | | project_id |
|
|
32
|
+
| mode | string | optional | "append" | See [Mode](#mode) |
|
|
33
|
+
| auth_method | string | optional | "application\_default" | See [Authentication](#authentication) |
|
|
34
|
+
| json_keyfile | string | optional | | keyfile path or `content` |
|
|
35
|
+
| project | string | required unless service\_account's `json_keyfile` is given. | | project\_id |
|
|
46
36
|
| dataset | string | required | | dataset |
|
|
47
37
|
| location | string | optional | nil | geographic location of dataset. See [Location](#location) |
|
|
48
38
|
| table | string | required | | table name, or table name with a partition decorator such as `table_name$20160929`|
|
|
49
39
|
| auto_create_dataset | boolean | optional | false | automatically create dataset |
|
|
50
|
-
| auto_create_table | boolean | optional |
|
|
40
|
+
| auto_create_table | boolean | optional | true | `false` is available only for `append_direct` mode. Other modes require `true`. See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
|
|
51
41
|
| schema_file | string | optional | | /path/to/schema.json |
|
|
52
42
|
| template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) |
|
|
53
|
-
| prevent_duplicate_insert | boolean | optional | false | See [Prevent Duplication](#prevent-duplication) |
|
|
54
43
|
| job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
|
|
55
44
|
| job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
|
|
56
45
|
| is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
|
|
@@ -107,8 +96,7 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
|
|
|
107
96
|
| time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
|
|
108
97
|
| time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
|
|
109
98
|
| time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning |
|
|
110
|
-
|
|
|
111
|
-
| clustering | hash | optional | nil | (Experimental) Currently, clustering is supported for partitioned tables, so must be used with `time_partitioning` option. NOTE: **clustered tables** is a beta release. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
|
|
99
|
+
| clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so must be used with `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
|
|
112
100
|
| clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
|
|
113
101
|
| schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` job, that is, is not effective for most of modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes origin table so does not need to update schema. Only `append_direct` can utilize schema update. |
|
|
114
102
|
|
|
@@ -118,9 +106,8 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
|
|
|
118
106
|
out:
|
|
119
107
|
type: bigquery
|
|
120
108
|
mode: append
|
|
121
|
-
auth_method:
|
|
122
|
-
|
|
123
|
-
p12_keyfile: /path/to/p12_keyfile.p12
|
|
109
|
+
auth_method: service_account
|
|
110
|
+
json_keyfile: /path/to/json_keyfile.json
|
|
124
111
|
project: your-project-000
|
|
125
112
|
dataset: your_dataset_name
|
|
126
113
|
table: your_table_name
|
|
@@ -128,7 +115,7 @@ out:
|
|
|
128
115
|
source_format: NEWLINE_DELIMITED_JSON
|
|
129
116
|
```
|
|
130
117
|
|
|
131
|
-
###
|
|
118
|
+
### Location
|
|
132
119
|
|
|
133
120
|
The geographic location of the dataset. Required except for US and EU.
|
|
134
121
|
|
|
@@ -136,7 +123,7 @@ GCS bucket should be in same region when you use `gcs_bucket`.
|
|
|
136
123
|
|
|
137
124
|
See also [Dataset Locations | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/dataset-locations)
|
|
138
125
|
|
|
139
|
-
###
|
|
126
|
+
### Mode
|
|
140
127
|
|
|
141
128
|
5 modes are provided.
|
|
142
129
|
|
|
@@ -158,6 +145,8 @@ This is not transactional, i.e., if fails, the target table could have some rows
|
|
|
158
145
|
|
|
159
146
|
```is_skip_job_result_check``` must be false when replace mode
|
|
160
147
|
|
|
148
|
+
NOTE: BigQuery does not support replacing (actually, copying into) a non-partitioned table with a partitioned table atomically. You must once delete the non-partitioned table, otherwise, you get `Incompatible table partitioning specification when copying to the column partitioned table` error.
|
|
149
|
+
|
|
161
150
|
##### replace_backup
|
|
162
151
|
|
|
163
152
|
1. Load to temporary table (Create and WRITE_APPEND in parallel)
|
|
@@ -173,53 +162,69 @@ This is not transactional, i.e., if fails, the target table could have some rows
|
|
|
173
162
|
|
|
174
163
|
### Authentication
|
|
175
164
|
|
|
176
|
-
There are
|
|
165
|
+
There are four authentication methods
|
|
177
166
|
|
|
178
|
-
1.
|
|
179
|
-
|
|
180
|
-
|
|
167
|
+
1. `service_account` (or `json_key` for backward compatibility)
|
|
168
|
+
1. `authorized_user`
|
|
169
|
+
1. `compute_engine`
|
|
170
|
+
1. `application_default`
|
|
181
171
|
|
|
182
|
-
####
|
|
172
|
+
#### service\_account (or json\_key)
|
|
183
173
|
|
|
184
|
-
|
|
185
|
-
download its
|
|
174
|
+
Use GCP service account credentials.
|
|
175
|
+
You first need to create a service account, download its json key and deploy the key with embulk.
|
|
186
176
|
|
|
187
177
|
```yaml
|
|
188
178
|
out:
|
|
189
179
|
type: bigquery
|
|
190
|
-
auth_method:
|
|
191
|
-
|
|
192
|
-
p12_keyfile: /path/to/p12_keyfile.p12
|
|
180
|
+
auth_method: service_account
|
|
181
|
+
json_keyfile: /path/to/json_keyfile.json
|
|
193
182
|
```
|
|
194
183
|
|
|
195
|
-
|
|
184
|
+
You can also embed contents of `json_keyfile` at config.yml.
|
|
196
185
|
|
|
197
|
-
|
|
198
|
-
|
|
186
|
+
```yaml
|
|
187
|
+
out:
|
|
188
|
+
type: bigquery
|
|
189
|
+
auth_method: service_account
|
|
190
|
+
json_keyfile:
|
|
191
|
+
content: |
|
|
192
|
+
{
|
|
193
|
+
"private_key_id": "123456789",
|
|
194
|
+
"private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
|
|
195
|
+
"client_email": "..."
|
|
196
|
+
}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
#### authorized\_user
|
|
200
|
+
|
|
201
|
+
Use Google user credentials.
|
|
202
|
+
You can get your credentials at `~/.config/gcloud/application_default_credentials.json` by running `gcloud auth login`.
|
|
199
203
|
|
|
200
204
|
```yaml
|
|
201
205
|
out:
|
|
202
206
|
type: bigquery
|
|
203
|
-
auth_method:
|
|
204
|
-
json_keyfile: /path/to/
|
|
207
|
+
auth_method: authorized_user
|
|
208
|
+
json_keyfile: /path/to/credentials.json
|
|
205
209
|
```
|
|
206
210
|
|
|
207
|
-
You can also embed contents of json_keyfile at config.yml.
|
|
211
|
+
You can also embed contents of `json_keyfile` at config.yml.
|
|
208
212
|
|
|
209
213
|
```yaml
|
|
210
214
|
out:
|
|
211
215
|
type: bigquery
|
|
212
|
-
auth_method:
|
|
216
|
+
auth_method: authorized_user
|
|
213
217
|
json_keyfile:
|
|
214
218
|
content: |
|
|
215
219
|
{
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
+
"client_id":"xxxxxxxxxxx.apps.googleusercontent.com",
|
|
221
|
+
"client_secret":"xxxxxxxxxxx",
|
|
222
|
+
"refresh_token":"xxxxxxxxxxx",
|
|
223
|
+
"type":"authorized_user"
|
|
224
|
+
}
|
|
220
225
|
```
|
|
221
226
|
|
|
222
|
-
####
|
|
227
|
+
#### compute\_engine
|
|
223
228
|
|
|
224
229
|
On the other hand, you don't need to explicitly create a service account for embulk when you
|
|
225
230
|
run embulk in Google Compute Engine. In this third authentication method, you need to
|
|
@@ -232,6 +237,22 @@ out:
|
|
|
232
237
|
auth_method: compute_engine
|
|
233
238
|
```
|
|
234
239
|
|
|
240
|
+
#### application\_default
|
|
241
|
+
|
|
242
|
+
Use Application Default Credentials (ADC). ADC is a strategy to locate Google Cloud Service Account credentials.
|
|
243
|
+
|
|
244
|
+
1. ADC checks to see if the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set. If the variable is set, ADC uses the service account file that the variable points to.
|
|
245
|
+
2. ADC checks to see if `~/.config/gcloud/application_default_credentials.json` is located. This file is created by running `gcloud auth application-default login`.
|
|
246
|
+
3. Use the default service account for credentials if the application is running on Compute Engine, App Engine, Kubernetes Engine, Cloud Functions or Cloud Run.
|
|
247
|
+
|
|
248
|
+
See https://cloud.google.com/docs/authentication/production for details.
|
|
249
|
+
|
|
250
|
+
```yaml
|
|
251
|
+
out:
|
|
252
|
+
type: bigquery
|
|
253
|
+
auth_method: application_default
|
|
254
|
+
```
|
|
255
|
+
|
|
235
256
|
### Table id formatting
|
|
236
257
|
|
|
237
258
|
`table` and option accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
|
|
@@ -240,20 +261,16 @@ Table ids are formatted at runtime
|
|
|
240
261
|
using the local time of the embulk server.
|
|
241
262
|
|
|
242
263
|
For example, with the configuration below,
|
|
243
|
-
data is inserted into tables `
|
|
264
|
+
data is inserted into tables `table_20150503`, `table_20150504` and so on.
|
|
244
265
|
|
|
245
266
|
```yaml
|
|
246
267
|
out:
|
|
247
268
|
type: bigquery
|
|
248
|
-
table: table_%
|
|
269
|
+
table: table_%Y%m%d
|
|
249
270
|
```
|
|
250
271
|
|
|
251
272
|
### Dynamic table creating
|
|
252
273
|
|
|
253
|
-
When `auto_create_table` is set to true, try to create the table using BigQuery API.
|
|
254
|
-
|
|
255
|
-
If table already exists, insert into it.
|
|
256
|
-
|
|
257
274
|
There are 3 ways to set schema.
|
|
258
275
|
|
|
259
276
|
#### Set schema.json
|
|
@@ -264,7 +281,7 @@ Please set file path of schema.json.
|
|
|
264
281
|
out:
|
|
265
282
|
type: bigquery
|
|
266
283
|
auto_create_table: true
|
|
267
|
-
table: table_%
|
|
284
|
+
table: table_%Y%m%d
|
|
268
285
|
schema_file: /path/to/schema.json
|
|
269
286
|
```
|
|
270
287
|
|
|
@@ -276,7 +293,7 @@ Plugin will try to read schema from existing table and use it as schema template
|
|
|
276
293
|
out:
|
|
277
294
|
type: bigquery
|
|
278
295
|
auto_create_table: true
|
|
279
|
-
table: table_%
|
|
296
|
+
table: table_%Y%m%d
|
|
280
297
|
template_table: existing_table_name
|
|
281
298
|
```
|
|
282
299
|
|
|
@@ -352,25 +369,9 @@ out:
|
|
|
352
369
|
payload_column_index: 0 # or, payload_column: payload
|
|
353
370
|
```
|
|
354
371
|
|
|
355
|
-
### Prevent Duplication
|
|
356
|
-
|
|
357
|
-
`prevent_duplicate_insert` option is used to prevent inserting same data for modes `append` or `append_direct`.
|
|
358
|
-
|
|
359
|
-
When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options.
|
|
360
|
-
|
|
361
|
-
`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
|
|
362
|
-
|
|
363
|
-
[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency) so that same data can't be inserted with same settings repeatedly.
|
|
364
|
-
|
|
365
|
-
```yaml
|
|
366
|
-
out:
|
|
367
|
-
type: bigquery
|
|
368
|
-
prevent_duplicate_insert: true
|
|
369
|
-
```
|
|
370
|
-
|
|
371
372
|
### GCS Bucket
|
|
372
373
|
|
|
373
|
-
This is useful to reduce number of consumed jobs, which is limited by [
|
|
374
|
+
This is useful to reduce number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs).
|
|
374
375
|
|
|
375
376
|
This plugin originally loads local files into BigQuery in parallel, that is, consumes a number of jobs, say 24 jobs on 24 CPU core machine for example (this depends on embulk parameters such as `min_output_tasks` and `max_threads`).
|
|
376
377
|
|
|
@@ -398,32 +399,31 @@ To load into a partition, specify `table` parameter with a partition decorator a
|
|
|
398
399
|
out:
|
|
399
400
|
type: bigquery
|
|
400
401
|
table: table_name$20160929
|
|
401
|
-
auto_create_table: true
|
|
402
402
|
```
|
|
403
403
|
|
|
404
|
-
You may configure `time_partitioning` parameter together
|
|
404
|
+
You may configure `time_partitioning` parameter together as:
|
|
405
405
|
|
|
406
406
|
```yaml
|
|
407
407
|
out:
|
|
408
408
|
type: bigquery
|
|
409
409
|
table: table_name$20160929
|
|
410
|
-
auto_create_table: true
|
|
411
410
|
time_partitioning:
|
|
412
411
|
type: DAY
|
|
413
412
|
expiration_ms: 259200000
|
|
414
413
|
```
|
|
415
414
|
|
|
416
415
|
You can also create column-based partitioning table as:
|
|
416
|
+
|
|
417
417
|
```yaml
|
|
418
418
|
out:
|
|
419
419
|
type: bigquery
|
|
420
420
|
mode: replace
|
|
421
|
-
auto_create_table: true
|
|
422
421
|
table: table_name
|
|
423
422
|
time_partitioning:
|
|
424
423
|
type: DAY
|
|
425
424
|
field: timestamp
|
|
426
425
|
```
|
|
426
|
+
|
|
427
427
|
Note the `time_partitioning.field` should be top-level `DATE` or `TIMESTAMP`.
|
|
428
428
|
|
|
429
429
|
Use [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table, embulk-output-bigquery itself does not support it, though.
|
|
@@ -448,8 +448,40 @@ $ embulk run -X page_size=1 -b . -l trace example/example.yml
|
|
|
448
448
|
|
|
449
449
|
Place your embulk with `.jar` extension:
|
|
450
450
|
|
|
451
|
+
|
|
452
|
+
```
|
|
453
|
+
$ curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-latest.jar"
|
|
454
|
+
$ chmod a+x embulk.jar
|
|
455
|
+
```
|
|
456
|
+
|
|
457
|
+
Investigate JRUBY\_VERSION and Bundler::VERSION included in the embulk.jar:
|
|
458
|
+
|
|
459
|
+
```
|
|
460
|
+
$ echo JRUBY_VERSION | ./embulk.jar irb
|
|
461
|
+
2019-08-10 00:59:11.866 +0900: Embulk v0.9.17
|
|
462
|
+
Switch to inspect mode.
|
|
463
|
+
JRUBY_VERSION
|
|
464
|
+
"X.X.X.X"
|
|
465
|
+
|
|
466
|
+
$ echo "require 'bundler'; Bundler::VERSION" | ./embulk.jar irb
|
|
467
|
+
2019-08-10 01:59:10.460 +0900: Embulk v0.9.17
|
|
468
|
+
Switch to inspect mode.
|
|
469
|
+
require 'bundler'; Bundler::VERSION
|
|
470
|
+
"Y.Y.Y"
|
|
471
|
+
```
|
|
472
|
+
|
|
473
|
+
Install the same version of jruby (change X.X.X.X to the version shown above) and bundler:
|
|
474
|
+
|
|
475
|
+
```
|
|
476
|
+
$ rbenv install jruby-X.X.X.X
|
|
477
|
+
$ rbenv local jruby-X.X.X.X
|
|
478
|
+
$ gem install bundler -v Y.Y.Y
|
|
479
|
+
```
|
|
480
|
+
|
|
481
|
+
Install dependencies (NOTE: Use bundler included in the embulk.jar, otherwise, `gem 'embulk'` is not found):
|
|
482
|
+
|
|
451
483
|
```
|
|
452
|
-
$
|
|
484
|
+
$ ./embulk.jar bundle install --path vendor/bundle
|
|
453
485
|
```
|
|
454
486
|
|
|
455
487
|
Run tests with `env RUBYOPT="-r ./embulk.jar`:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Gem::Specification.new do |spec|
|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
|
3
|
-
spec.version = "0.
|
|
3
|
+
spec.version = "0.6.2"
|
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
|
@@ -8,11 +8,18 @@ Gem::Specification.new do |spec|
|
|
|
8
8
|
spec.licenses = ["MIT"]
|
|
9
9
|
spec.homepage = "https://github.com/embulk/embulk-output-bigquery"
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
# Exclude example directory which uses symlinks from generating gem.
|
|
12
|
+
# Symlinks do not work properly on the Windows platform without administrator privilege.
|
|
13
|
+
spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] - Dir["example/*" ]
|
|
12
14
|
spec.test_files = spec.files.grep(%r{^(test|spec)/})
|
|
13
15
|
spec.require_paths = ["lib"]
|
|
14
16
|
|
|
15
|
-
|
|
17
|
+
# TODO
|
|
18
|
+
# signet 0.12.0 and google-api-client 0.33.0 require >= Ruby 2.4.
|
|
19
|
+
# Embulk 0.9 uses JRuby 9.1.X.Y, which is compatible with Ruby 2.3.
|
|
20
|
+
# So, force installing signet < 0.12 and google-api-client < 0.33.0.
|
|
21
|
+
spec.add_dependency 'signet', '~> 0.7', '< 0.12.0'
|
|
22
|
+
spec.add_dependency 'google-api-client','< 0.33.0'
|
|
16
23
|
spec.add_dependency 'time_with_zone'
|
|
17
24
|
|
|
18
25
|
spec.add_development_dependency 'bundler', ['>= 1.10.6']
|