embulk-output-bigquery 0.4.12 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +10 -6
- data/CHANGELOG.md +28 -0
- data/Gemfile +2 -0
- data/README.md +109 -75
- data/embulk-output-bigquery.gemspec +4 -2
- data/lib/embulk/output/bigquery.rb +38 -50
- data/lib/embulk/output/bigquery/auth.rb +35 -0
- data/lib/embulk/output/bigquery/bigquery_client.rb +31 -31
- data/lib/embulk/output/bigquery/google_client.rb +3 -34
- data/lib/embulk/output/bigquery/helper.rb +8 -4
- data/test/helper.rb +2 -1
- data/test/test_bigquery_client.rb +17 -21
- data/test/test_configure.rb +19 -19
- data/test/test_example.rb +5 -4
- data/test/test_transaction.rb +36 -76
- metadata +3 -45
- data/example/config_append_direct_schema_update_options.yml +0 -31
- data/example/config_client_options.yml +0 -33
- data/example/config_csv.yml +0 -30
- data/example/config_delete_in_advance.yml +0 -29
- data/example/config_delete_in_advance_partitioned_table.yml +0 -33
- data/example/config_expose_errors.yml +0 -30
- data/example/config_gcs.yml +0 -32
- data/example/config_guess_from_embulk_schema.yml +0 -29
- data/example/config_guess_with_column_options.yml +0 -40
- data/example/config_gzip.yml +0 -1
- data/example/config_jsonl.yml +0 -1
- data/example/config_max_threads.yml +0 -34
- data/example/config_min_ouput_tasks.yml +0 -34
- data/example/config_mode_append.yml +0 -30
- data/example/config_mode_append_direct.yml +0 -30
- data/example/config_nested_record.yml +0 -1
- data/example/config_payload_column.yml +0 -20
- data/example/config_payload_column_index.yml +0 -20
- data/example/config_prevent_duplicate_insert.yml +0 -30
- data/example/config_progress_log_interval.yml +0 -31
- data/example/config_replace.yml +0 -30
- data/example/config_replace_backup.yml +0 -32
- data/example/config_replace_backup_paritioned_table.yml +0 -34
- data/example/config_replace_paritioned_table.yml +0 -33
- data/example/config_replace_schema_update_options.yml +0 -33
- data/example/config_skip_file_generation.yml +0 -32
- data/example/config_table_strftime.yml +0 -30
- data/example/config_template_table.yml +0 -21
- data/example/config_uncompressed.yml +0 -1
- data/example/config_with_rehearsal.yml +0 -33
- data/example/example.csv +0 -17
- data/example/example.jsonl +0 -16
- data/example/example.yml +0 -1
- data/example/example2_1.csv +0 -1
- data/example/example2_2.csv +0 -1
- data/example/example4_1.csv +0 -1
- data/example/example4_2.csv +0 -1
- data/example/example4_3.csv +0 -1
- data/example/example4_4.csv +0 -1
- data/example/json_key.json +0 -12
- data/example/nested_example.jsonl +0 -16
- data/example/schema.json +0 -30
- data/example/schema_expose_errors.json +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ddfd10c5e85614e1dae0333494333653f1af95b8158dfda8977f8b00d64b3478
|
4
|
+
data.tar.gz: 2cec70eaa49c828d7fe9347bc0d9699b9398f21db96880e997a66bdab23deb89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4782a28272da610f8399aca50cc4ddaefea00b8dbf45a37bec24771d7ecdb05bbdcd6de85ff167c5c3745f6689413c215689bb8d420960705cd6cb2026e99932
|
7
|
+
data.tar.gz: 9dbabb787e2f1b5797ccb2a2cd8786ce28d0e0d01310cd522ea4894337a279e809de10abca14b50b836553b6de95df4afd886596d75e7193d4de60a5c6f95781
|
data/.travis.yml
CHANGED
@@ -1,17 +1,21 @@
|
|
1
1
|
language: ruby
|
2
2
|
matrix:
|
3
3
|
include:
|
4
|
-
- env: EMBULK_VERSION=0.8.39
|
5
|
-
rvm: jruby-9.1.5.0 # bundled jruby version
|
6
|
-
jdk: openjdk7 # embulk 0.8.x uses jdk7
|
7
4
|
- env: EMBULK_VERSION=0.9.15
|
8
|
-
rvm: jruby-9.1.
|
5
|
+
rvm: jruby-9.1.15.0 # bundled jruby version
|
9
6
|
jdk: openjdk8 # embulk 0.9.x uses jdk8
|
10
7
|
- env: EMBULK_VERSION=latest
|
11
|
-
rvm: jruby-9.1.
|
8
|
+
rvm: jruby-9.1.15.0 # ?
|
12
9
|
jdk: openjdk8 # ?
|
13
10
|
allow_failures:
|
14
11
|
- env: EMBULK_VERSION=latest
|
15
12
|
before_install:
|
16
13
|
- curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-${EMBULK_VERSION}.jar"
|
17
|
-
|
14
|
+
- chmod a+x embulk.jar
|
15
|
+
- BUNDLER_VERSION=$(echo "require 'bundler'; Bundler::VERSION" | ./embulk.jar irb | tail -n 2 | tr -d '"')
|
16
|
+
- gem uninstall bundler -x
|
17
|
+
- gem install bundler -v ${BUNDLER_VERSION}
|
18
|
+
install:
|
19
|
+
- ./embulk.jar bundle install --jobs=3 --retry=3 --path vendor/bundle
|
20
|
+
script:
|
21
|
+
- bundle exec env RUBYOPT="-r ./embulk.jar -r embulk -r embulk/java/bootstrap" rake test
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
## 0.6.1 - 2019-08-28
|
2
|
+
|
3
|
+
* [maintenance] Release a new gem not to include symlinks to make it work on Windows.
|
4
|
+
|
5
|
+
## 0.6.0 - 2019-08-11
|
6
|
+
|
7
|
+
Cleanup `auth_method`:
|
8
|
+
|
9
|
+
* [enhancement] Support `auth_method: authorized_user` (OAuth)
|
10
|
+
* [incompatibility change] Rename `auth_method: json_key` to `auth_method: service_account` (`json_key` is kept for backward compatibility)
|
11
|
+
* [incompatibility change] Remove deprecated `auth_method: private_key` (p12 key)
|
12
|
+
* [incompatibility change] Change the default `auth_method` to `application_default` from `private_key` because `private_key` was dropped.
|
13
|
+
|
14
|
+
## 0.5.0 - 2019-08-10
|
15
|
+
|
16
|
+
* [incompatibility change] Drop deprecated `time_partitioning`.`require_partition_filter`
|
17
|
+
* [incompatibility change] Drop `prevent_duplicate_insert` which has no use-case now
|
18
|
+
* [incompatibility change] Modes `replace`, `replace_backup`, `append`, and `delete_in_advance` require `auto_create_table: true` now because, previously, these modes had created a target table even with `auto_create_table: false` and made users being confused. Note that `auto_create_table: true` is always required even for a partition (a table name with a partition decorator) which may not require creating a table. This is for simplicity of logics and implementations.
|
19
|
+
* [incompatibility change] Change default value of `auto_create_table` to `true` because the above 4 modes, that is, except `append_direct` always require `auto_create_table: true` now.
|
20
|
+
|
21
|
+
## 0.4.14 - 2019-08-10
|
22
|
+
|
23
|
+
* [enhancement] Support field partitioning correctly.
|
24
|
+
|
25
|
+
## 0.4.13 - 2019-03-20
|
26
|
+
|
27
|
+
* [enhancement] Support clustered table as an experimental feature
|
28
|
+
|
1
29
|
## 0.4.12 - 2019-03-20
|
2
30
|
|
3
31
|
* [maintenance] Fix `time_partitioning.requirePartitionFilter` was not working. Use `time_partitioning.require_partition_filter` (thanks to @gitetsu)
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -23,34 +23,23 @@ https://developers.google.com/bigquery/loading-data-into-bigquery
|
|
23
23
|
Current version of this plugin supports Google API with Service Account Authentication, but does not support
|
24
24
|
OAuth flow for installed applications.
|
25
25
|
|
26
|
-
### INCOMPATIBILITY CHANGES
|
27
|
-
|
28
|
-
v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGELOG.md) for details.
|
29
|
-
|
30
|
-
* `formatter` option (formatter plugin support) is dropped. Use `source_format` option instead. (it already exists in v0.2.x too)
|
31
|
-
* `encoders` option (encoder plugin support) is dropped. Use `compression` option instead (it already exists in v0.2.x too).
|
32
|
-
* `mode: append` mode now expresses a transactional append, and `mode: append_direct` is one which is not transactional.
|
33
|
-
|
34
26
|
## Configuration
|
35
27
|
|
36
28
|
#### Original options
|
37
29
|
|
38
30
|
| name | type | required? | default | description |
|
39
31
|
|:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------|
|
40
|
-
| mode | string | optional | "append" | See [Mode](#mode)
|
41
|
-
| auth_method | string | optional | "
|
42
|
-
|
|
43
|
-
|
|
44
|
-
| json_keyfile | string | required when auth_method is json_key | | Fullpath of json key |
|
45
|
-
| project | string | required if json_keyfile is not given | | project_id |
|
32
|
+
| mode | string | optional | "append" | See [Mode](#mode) |
|
33
|
+
| auth_method | string | optional | "application\_default" | See [Authentication](#authentication) |
|
34
|
+
| json_keyfile | string | optional | | keyfile path or `content` |
|
35
|
+
| project | string | required unless service\_account's `json_keyfile` is given. | | project\_id |
|
46
36
|
| dataset | string | required | | dataset |
|
47
37
|
| location | string | optional | nil | geographic location of dataset. See [Location](#location) |
|
48
38
|
| table | string | required | | table name, or table name with a partition decorator such as `table_name$20160929`|
|
49
39
|
| auto_create_dataset | boolean | optional | false | automatically create dataset |
|
50
|
-
| auto_create_table | boolean | optional |
|
40
|
+
| auto_create_table | boolean | optional | true | `false` is available only for `append_direct` mode. Other modes require `true`. See [Dynamic Table Creating](#dynamic-table-creating) and [Time Partitioning](#time-partitioning) |
|
51
41
|
| schema_file | string | optional | | /path/to/schema.json |
|
52
42
|
| template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) |
|
53
|
-
| prevent_duplicate_insert | boolean | optional | false | See [Prevent Duplication](#prevent-duplication) |
|
54
43
|
| job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
|
55
44
|
| job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
|
56
45
|
| is_skip_job_result_check | boolean | optional | false | Skip waiting Load job finishes. Available for append, or delete_in_advance mode |
|
@@ -107,7 +96,8 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
|
|
107
96
|
| time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
|
108
97
|
| time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
|
109
98
|
| time_partitioning.field | string | optional | nil | `DATE` or `TIMESTAMP` column used for partitioning |
|
110
|
-
|
|
99
|
+
| clustering | hash | optional | nil | Currently, clustering is supported for partitioned tables, so must be used with `time_partitioning` option. See [clustered tables](https://cloud.google.com/bigquery/docs/clustered-tables) |
|
100
|
+
| clustering.fields | array | required | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
|
111
101
|
| schema_update_options | array | optional | nil | (Experimental) List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions). NOTE for the current status: `schema_update_options` does not work for `copy` job, that is, is not effective for most of modes such as `append`, `replace` and `replace_backup`. `delete_in_advance` deletes origin table so does not need to update schema. Only `append_direct` can utilize schema update. |
|
112
102
|
|
113
103
|
### Example
|
@@ -116,9 +106,8 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
|
|
116
106
|
out:
|
117
107
|
type: bigquery
|
118
108
|
mode: append
|
119
|
-
auth_method:
|
120
|
-
|
121
|
-
p12_keyfile: /path/to/p12_keyfile.p12
|
109
|
+
auth_method: service_account
|
110
|
+
json_keyfile: /path/to/json_keyfile.json
|
122
111
|
project: your-project-000
|
123
112
|
dataset: your_dataset_name
|
124
113
|
table: your_table_name
|
@@ -126,7 +115,7 @@ out:
|
|
126
115
|
source_format: NEWLINE_DELIMITED_JSON
|
127
116
|
```
|
128
117
|
|
129
|
-
###
|
118
|
+
### Location
|
130
119
|
|
131
120
|
The geographic location of the dataset. Required except for US and EU.
|
132
121
|
|
@@ -134,7 +123,7 @@ GCS bucket should be in same region when you use `gcs_bucket`.
|
|
134
123
|
|
135
124
|
See also [Dataset Locations | BigQuery | Google Cloud](https://cloud.google.com/bigquery/docs/dataset-locations)
|
136
125
|
|
137
|
-
###
|
126
|
+
### Mode
|
138
127
|
|
139
128
|
5 modes are provided.
|
140
129
|
|
@@ -156,6 +145,8 @@ This is not transactional, i.e., if fails, the target table could have some rows
|
|
156
145
|
|
157
146
|
```is_skip_job_result_check``` must be false when replace mode
|
158
147
|
|
148
|
+
NOTE: BigQuery does not support replacing (actually, copying into) a non-partitioned table with a partitioned table atomically. You must once delete the non-partitioned table, otherwise, you get `Incompatible table partitioning specification when copying to the column partitioned table` error.
|
149
|
+
|
159
150
|
##### replace_backup
|
160
151
|
|
161
152
|
1. Load to temporary table (Create and WRITE_APPEND in parallel)
|
@@ -171,53 +162,69 @@ This is not transactional, i.e., if fails, the target table could have some rows
|
|
171
162
|
|
172
163
|
### Authentication
|
173
164
|
|
174
|
-
There are
|
165
|
+
There are four authentication methods
|
175
166
|
|
176
|
-
1.
|
177
|
-
|
178
|
-
|
167
|
+
1. `service_account` (or `json_key` for backward compatibility)
|
168
|
+
1. `authorized_user`
|
169
|
+
1. `compute_engine`
|
170
|
+
1. `application_default`
|
179
171
|
|
180
|
-
####
|
172
|
+
#### service\_account (or json\_key)
|
181
173
|
|
182
|
-
|
183
|
-
download its
|
174
|
+
Use GCP service account credentials.
|
175
|
+
You first need to create a service account, download its json key and deploy the key with embulk.
|
184
176
|
|
185
177
|
```yaml
|
186
178
|
out:
|
187
179
|
type: bigquery
|
188
|
-
auth_method:
|
189
|
-
|
190
|
-
p12_keyfile: /path/to/p12_keyfile.p12
|
180
|
+
auth_method: service_account
|
181
|
+
json_keyfile: /path/to/json_keyfile.json
|
191
182
|
```
|
192
183
|
|
193
|
-
|
184
|
+
You can also embed contents of `json_keyfile` at config.yml.
|
194
185
|
|
195
|
-
|
196
|
-
|
186
|
+
```yaml
|
187
|
+
out:
|
188
|
+
type: bigquery
|
189
|
+
auth_method: service_account
|
190
|
+
json_keyfile:
|
191
|
+
content: |
|
192
|
+
{
|
193
|
+
"private_key_id": "123456789",
|
194
|
+
"private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
|
195
|
+
"client_email": "..."
|
196
|
+
}
|
197
|
+
```
|
198
|
+
|
199
|
+
#### authorized\_user
|
200
|
+
|
201
|
+
Use Google user credentials.
|
202
|
+
You can get your credentials at `~/.config/gcloud/application_default_credentials.json` by running `gcloud auth login`.
|
197
203
|
|
198
204
|
```yaml
|
199
205
|
out:
|
200
206
|
type: bigquery
|
201
|
-
auth_method:
|
202
|
-
json_keyfile: /path/to/
|
207
|
+
auth_method: authorized_user
|
208
|
+
json_keyfile: /path/to/credentials.json
|
203
209
|
```
|
204
210
|
|
205
|
-
You can also embed contents of json_keyfile at config.yml.
|
211
|
+
You can also embed contents of `json_keyfile` at config.yml.
|
206
212
|
|
207
213
|
```yaml
|
208
214
|
out:
|
209
215
|
type: bigquery
|
210
|
-
auth_method:
|
216
|
+
auth_method: authorized_user
|
211
217
|
json_keyfile:
|
212
218
|
content: |
|
213
219
|
{
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
220
|
+
"client_id":"xxxxxxxxxxx.apps.googleusercontent.com",
|
221
|
+
"client_secret":"xxxxxxxxxxx",
|
222
|
+
"refresh_token":"xxxxxxxxxxx",
|
223
|
+
"type":"authorized_user"
|
224
|
+
}
|
218
225
|
```
|
219
226
|
|
220
|
-
####
|
227
|
+
#### compute\_engine
|
221
228
|
|
222
229
|
On the other hand, you don't need to explicitly create a service account for embulk when you
|
223
230
|
run embulk in Google Compute Engine. In this third authentication method, you need to
|
@@ -230,6 +237,22 @@ out:
|
|
230
237
|
auth_method: compute_engine
|
231
238
|
```
|
232
239
|
|
240
|
+
#### application\_default
|
241
|
+
|
242
|
+
Use Application Default Credentials (ADC). ADC is a strategy to locate Google Cloud Service Account credentials.
|
243
|
+
|
244
|
+
1. ADC checks to see if the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set. If the variable is set, ADC uses the service account file that the variable points to.
|
245
|
+
2. ADC checks to see if `~/.config/gcloud/application_default_credentials.json` is located. This file is created by running `gcloud auth application-default login`.
|
246
|
+
3. Use the default service account for credentials if the application running on Compute Engine, App Engine, Kubernetes Engine, Cloud Functions or Cloud Run.
|
247
|
+
|
248
|
+
See https://cloud.google.com/docs/authentication/production for details.
|
249
|
+
|
250
|
+
```yaml
|
251
|
+
out:
|
252
|
+
type: bigquery
|
253
|
+
auth_method: application_default
|
254
|
+
```
|
255
|
+
|
233
256
|
### Table id formatting
|
234
257
|
|
235
258
|
`table` and option accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
|
@@ -238,20 +261,16 @@ Table ids are formatted at runtime
|
|
238
261
|
using the local time of the embulk server.
|
239
262
|
|
240
263
|
For example, with the configuration below,
|
241
|
-
data is inserted into tables `
|
264
|
+
data is inserted into tables `table_20150503`, `table_20150504` and so on.
|
242
265
|
|
243
266
|
```yaml
|
244
267
|
out:
|
245
268
|
type: bigquery
|
246
|
-
table: table_%
|
269
|
+
table: table_%Y%m%d
|
247
270
|
```
|
248
271
|
|
249
272
|
### Dynamic table creating
|
250
273
|
|
251
|
-
When `auto_create_table` is set to true, try to create the table using BigQuery API.
|
252
|
-
|
253
|
-
If table already exists, insert into it.
|
254
|
-
|
255
274
|
There are 3 ways to set schema.
|
256
275
|
|
257
276
|
#### Set schema.json
|
@@ -262,7 +281,7 @@ Please set file path of schema.json.
|
|
262
281
|
out:
|
263
282
|
type: bigquery
|
264
283
|
auto_create_table: true
|
265
|
-
table: table_%
|
284
|
+
table: table_%Y%m%d
|
266
285
|
schema_file: /path/to/schema.json
|
267
286
|
```
|
268
287
|
|
@@ -274,7 +293,7 @@ Plugin will try to read schema from existing table and use it as schema template
|
|
274
293
|
out:
|
275
294
|
type: bigquery
|
276
295
|
auto_create_table: true
|
277
|
-
table: table_%
|
296
|
+
table: table_%Y%m%d
|
278
297
|
template_table: existing_table_name
|
279
298
|
```
|
280
299
|
|
@@ -350,25 +369,9 @@ out:
|
|
350
369
|
payload_column_index: 0 # or, payload_column: payload
|
351
370
|
```
|
352
371
|
|
353
|
-
### Prevent Duplication
|
354
|
-
|
355
|
-
`prevent_duplicate_insert` option is used to prevent inserting same data for modes `append` or `append_direct`.
|
356
|
-
|
357
|
-
When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generate job ID from md5 hash of file and other options.
|
358
|
-
|
359
|
-
`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding + ignore_unknown_values + allow_quoted_newlines)`
|
360
|
-
|
361
|
-
[job ID must be unique(including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency) so that same data can't be inserted with same settings repeatedly.
|
362
|
-
|
363
|
-
```yaml
|
364
|
-
out:
|
365
|
-
type: bigquery
|
366
|
-
prevent_duplicate_insert: true
|
367
|
-
```
|
368
|
-
|
369
372
|
### GCS Bucket
|
370
373
|
|
371
|
-
This is useful to reduce number of consumed jobs, which is limited by [
|
374
|
+
This is useful to reduce number of consumed jobs, which is limited by [100,000 jobs per project per day](https://cloud.google.com/bigquery/quotas#load_jobs).
|
372
375
|
|
373
376
|
This plugin originally loads local files into BigQuery in parallel, that is, consumes a number of jobs, say 24 jobs on 24 CPU core machine for example (this depends on embulk parameters such as `min_output_tasks` and `max_threads`).
|
374
377
|
|
@@ -396,32 +399,31 @@ To load into a partition, specify `table` parameter with a partition decorator a
|
|
396
399
|
out:
|
397
400
|
type: bigquery
|
398
401
|
table: table_name$20160929
|
399
|
-
auto_create_table: true
|
400
402
|
```
|
401
403
|
|
402
|
-
You may configure `time_partitioning` parameter together
|
404
|
+
You may configure `time_partitioning` parameter together as:
|
403
405
|
|
404
406
|
```yaml
|
405
407
|
out:
|
406
408
|
type: bigquery
|
407
409
|
table: table_name$20160929
|
408
|
-
auto_create_table: true
|
409
410
|
time_partitioning:
|
410
411
|
type: DAY
|
411
412
|
expiration_ms: 259200000
|
412
413
|
```
|
413
414
|
|
414
415
|
You can also create column-based partitioning table as:
|
416
|
+
|
415
417
|
```yaml
|
416
418
|
out:
|
417
419
|
type: bigquery
|
418
420
|
mode: replace
|
419
|
-
auto_create_table: true
|
420
421
|
table: table_name
|
421
422
|
time_partitioning:
|
422
423
|
type: DAY
|
423
424
|
field: timestamp
|
424
425
|
```
|
426
|
+
|
425
427
|
Note the `time_partitioning.field` should be top-level `DATE` or `TIMESTAMP`.
|
426
428
|
|
427
429
|
Use [Tables: patch](https://cloud.google.com/bigquery/docs/reference/v2/tables/patch) API to update the schema of the partitioned table, embulk-output-bigquery itself does not support it, though.
|
@@ -446,8 +448,40 @@ $ embulk run -X page_size=1 -b . -l trace example/example.yml
|
|
446
448
|
|
447
449
|
Place your embulk with `.jar` extension:
|
448
450
|
|
451
|
+
|
452
|
+
```
|
453
|
+
$ curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-latest.jar"
|
454
|
+
$ chmod a+x embulk.jar
|
455
|
+
```
|
456
|
+
|
457
|
+
Investigate JRUBY\_VERSION and Bundler::VERSION included in the embulk.jar:
|
458
|
+
|
459
|
+
```
|
460
|
+
$ echo JRUBY_VERSION | ./embulk.jar irb
|
461
|
+
2019-08-10 00:59:11.866 +0900: Embulk v0.9.17
|
462
|
+
Switch to inspect mode.
|
463
|
+
JRUBY_VERSION
|
464
|
+
"X.X.X.X"
|
465
|
+
|
466
|
+
$ echo "require 'bundler'; Bundler::VERSION" | ./embulk.jar irb
|
467
|
+
2019-08-10 01:59:10.460 +0900: Embulk v0.9.17
|
468
|
+
Switch to inspect mode.
|
469
|
+
require 'bundler'; Bundler::VERSION
|
470
|
+
"Y.Y.Y"
|
471
|
+
```
|
472
|
+
|
473
|
+
Install the same version of jruby (change X.X.X.X to the version shown above) and bundler:
|
474
|
+
|
475
|
+
```
|
476
|
+
$ rbenv install jruby-X.X.X.X
|
477
|
+
$ rbenv local jruby-X.X.X.X
|
478
|
+
$ gem install bundler -v Y.Y.Y
|
479
|
+
```
|
480
|
+
|
481
|
+
Install dependencies (NOTE: Use bundler included in the embulk.jar, otherwise, `gem 'embulk'` is not found):
|
482
|
+
|
449
483
|
```
|
450
|
-
$
|
484
|
+
$ ./embulk.jar bundle install --path vendor/bundle
|
451
485
|
```
|
452
486
|
|
453
487
|
Run tests with `env RUBYOPT="-r ./embulk.jar"`:
|
@@ -466,7 +500,7 @@ $ bundle exec env RUBYOPT="-r ./embulk.jar" ruby test/test_example.rb
|
|
466
500
|
|
467
501
|
### Release gem:
|
468
502
|
|
469
|
-
|
503
|
+
Change the version of gemspec, and write CHANGELOG.md. Then,
|
470
504
|
|
471
505
|
```
|
472
506
|
$ bundle exec rake release
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.
|
3
|
+
spec.version = "0.6.1"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
@@ -8,7 +8,9 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.licenses = ["MIT"]
|
9
9
|
spec.homepage = "https://github.com/embulk/embulk-output-bigquery"
|
10
10
|
|
11
|
-
|
11
|
+
# Exclude example directory which uses symlinks from generating gem.
|
12
|
+
# Symlinks do not work properly on the Windows platform without administrator privilege.
|
13
|
+
spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"] - Dir["example/*" ]
|
12
14
|
spec.test_files = spec.files.grep(%r{^(test|spec)/})
|
13
15
|
spec.require_paths = ["lib"]
|
14
16
|
|