embulk-output-bigquery 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 71bc9b253f725436a06e183667cbc87720c3719b
-  data.tar.gz: a32e43da05a4f90ab72c5715ffdf6b08501996d4
+  metadata.gz: 1ae4bf7af71e37194f768fad9e16e415747fee70
+  data.tar.gz: 796f2c3253d5600c439597f7ca495a7d1d8bac95
 SHA512:
-  metadata.gz: bd3d8aefbc98c2f044b782f807f595603ac7b11052a06b6486803fd2f6871127058a50e9c69ffc1fac92b75de9561c57e99ad9ba3cd8899507e93085d45ed615
-  data.tar.gz: 813b6455f463940968232b4332b8553698b9ef99ad4f3f5af6800b10223c33498fde9f8915604090f85dc7c2f78d16a865cd90da2174447c7f84ab3ef80a4cf8
+  metadata.gz: 3011c4128b2ed28a0fd84d0e2a592d706434ea24746c3aaf44f2e86f588b83da80e59b2931426cba0d390906ec283690966a4eb3b092d7d8edce364fc6ecc2b2
+  data.tar.gz: e969f903a71e5bf500fc57fb7a8b2ae2e4ffa8760557c329d3013de19959016ba7ef0578df3bdb7fb7cb0f62a3cb7a58f62cdfa23bb81d46476390884461e1a3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+## 0.4.1 - 2016-10-03
+
+* [enhancement] Support `schema_update_options` option
+
 ## 0.4.0 - 2016-10-01
 
 * [enhancement] Support partitioned table
data/README.md CHANGED
@@ -100,9 +100,10 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
 | encoding | string | optional | "UTF-8" | `UTF-8` or `ISO-8859-1` |
 | ignore_unknown_values | boolean | optional | false | |
 | allow_quoted_newlines | boolean | optional | false | Set true if data contains newline characters; it may cause slow processing |
-| time_partitioning | hash | optional | nil | See [Time Partitioning](#time-partitioning) |
+| time_partitioning | hash | optional | `{"type":"DAY"}` if `table` parameter has a partition decorator, otherwise nil | See [Time Partitioning](#time-partitioning) |
 | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
 | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
+| schema_update_options | array | optional | nil | List containing `ALLOW_FIELD_ADDITION`, `ALLOW_FIELD_RELAXATION`, or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) |
 
 ### Example
 
@@ -365,7 +366,7 @@ Using `gcs_bucket` option, such strategy is enabled. You may also use `auto_crea
 out:
   type: bigquery
   gcs_bucket: bucket_name
-  auto_create_gcs_bucket: false
+  auto_create_gcs_bucket: true
 ```
 
 ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers into GCS.
@@ -391,11 +392,30 @@ out:
   type: bigquery
   table: table_name$20160929
   auto_create_table: true
-  time-partitioning:
+  time_partitioning:
     type: DAY
     expiration_ms: 259200000
 ```
 
+Use `schema_update_options` to allow the schema of the destination table to be updated as a side effect of the load job:
+
+```yaml
+out:
+  type: bigquery
+  table: table_name$20160929
+  auto_create_table: true
+  time_partitioning:
+    type: DAY
+    expiration_ms: 259200000
+  schema_update_options:
+    - ALLOW_FIELD_ADDITION
+    - ALLOW_FIELD_RELAXATION
+```
+
+It seems that only adding new columns and relaxing `REQUIRED` columns to `NULLABLE` are supported now.
+Deleting or renaming columns is not supported.
+See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) for details.
+
 ## Development
 
 ### Run example:
data/embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.4.0"
+  spec.version = "0.4.1"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -0,0 +1,31 @@
1
+ in:
2
+ type: file
3
+ path_prefix: example/example.csv
4
+ parser:
5
+ type: csv
6
+ charset: UTF-8
7
+ newline: CRLF
8
+ null_string: 'NULL'
9
+ skip_header_lines: 1
10
+ comment_line_marker: '#'
11
+ columns:
12
+ - {name: date, type: string}
13
+ - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
14
+ - {name: "null", type: string}
15
+ - {name: long, type: long}
16
+ - {name: string, type: string}
17
+ - {name: double, type: double}
18
+ - {name: boolean, type: boolean}
19
+ out:
20
+ type: bigquery
21
+ mode: append_direct
22
+ auth_method: json_key
23
+ json_keyfile: example/your-project-000.json
24
+ dataset: your_dataset_name
25
+ table: your_table_name
26
+ source_format: NEWLINE_DELIMITED_JSON
27
+ compression: NONE
28
+ auto_create_dataset: true
29
+ auto_create_table: true
30
+ schema_file: example/schema.json
31
+ schema_update_options: [ALLOW_FIELD_ADDITION, ALLOW_FIELD_RELAXATION]
data/lib/embulk/output/bigquery.rb CHANGED
@@ -86,6 +86,7 @@ module Embulk
         'ignore_unknown_values' => config.param('ignore_unknown_values', :bool, :default => false),
         'allow_quoted_newlines' => config.param('allow_quoted_newlines', :bool, :default => false),
         'time_partitioning' => config.param('time_partitioning', :hash, :default => nil),
+        'schema_update_options' => config.param('schema_update_options', :array, :default => nil),
 
         # for debug
         'skip_load' => config.param('skip_load', :bool, :default => false),
@@ -230,6 +231,14 @@ module Embulk
         task['time_partitioning'] = {'type' => 'DAY'}
       end
 
+      if task['schema_update_options']
+        task['schema_update_options'].each do |schema_update_option|
+          unless %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION].include?(schema_update_option)
+            raise ConfigError.new "`schema_update_options` must contain either of ALLOW_FIELD_ADDITION or ALLOW_FIELD_RELAXATION or both"
+          end
+        end
+      end
+
       task
     end
 
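Worth noting for reviewers: the new block validates values at configure time instead of letting BigQuery reject the load job later. A minimal standalone sketch of the same check, assuming a plain `ArgumentError` and a hypothetical method name rather than the plugin's `ConfigError`:

```ruby
# Hypothetical standalone version of the configure-time check above.
ALLOWED_SCHEMA_UPDATE_OPTIONS = %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION].freeze

def validate_schema_update_options!(options)
  return if options.nil? # the option is optional; nil skips the check
  options.each do |option|
    unless ALLOWED_SCHEMA_UPDATE_OPTIONS.include?(option)
      raise ArgumentError,
            "`schema_update_options` must contain only ALLOW_FIELD_ADDITION " \
            "and/or ALLOW_FIELD_RELAXATION, got #{option.inspect}"
    end
  end
end

validate_schema_update_options!(%w[ALLOW_FIELD_ADDITION]) # passes
begin
  validate_schema_update_options!(%w[FOO])                # raises
rescue ArgumentError => e
  puts e.message
end
```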
@@ -292,19 +301,19 @@ module Embulk
           else
             bigquery.delete_table(task['table'])
           end
-          bigquery.create_table(task['table'], options: task)
+          bigquery.create_table(task['table'])
         when 'replace', 'replace_backup', 'append'
-          bigquery.create_table(task['temp_table'], options: task)
+          bigquery.create_table(task['temp_table'])
           if task['time_partitioning']
             if task['auto_create_table']
-              bigquery.create_table(task['table'], options: task)
+              bigquery.create_table(task['table'])
             else
               bigquery.get_table(task['table']) # raises NotFoundError
             end
           end
         else # append_direct
           if task['auto_create_table']
-            bigquery.create_table(task['table'], options: task)
+            bigquery.create_table(task['table'])
           else
             bigquery.get_table(task['table']) # raises NotFoundError
           end
@@ -313,7 +322,7 @@ module Embulk
       if task['mode'] == 'replace_backup'
         if task['time_partitioning'] and Helper.has_partition_decorator?(task['table_old'])
           if task['auto_create_table']
-            bigquery.create_table(task['table_old'], dataset: task['dataset_old'], options: task)
+            bigquery.create_table(task['table_old'], dataset: task['dataset_old'])
           else
             bigquery.get_table(task['table_old'], dataset: task['dataset_old']) # raises NotFoundError
           end
data/lib/embulk/output/bigquery/bigquery_client.rb CHANGED
@@ -194,6 +194,10 @@ module Embulk
             }
           }
 
+          if @task['schema_update_options']
+            body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
+          end
+
           opts = {
             upload_source: path,
             content_type: "application/octet-stream",
@@ -254,6 +258,10 @@ module Embulk
             }
           }
 
+          if @task['schema_update_options']
+            body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
+          end
+
           opts = {}
           Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
           response = with_network_retry { client.insert_job(@project, body, opts) }
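Both load paths (GCS and direct upload) pass the array through unchanged to the `jobs.insert` load configuration. A sketch of the resulting body shape; every value except `schema_update_options` is an assumed placeholder:

```ruby
# Illustrative shape of the load job body once schema_update_options is set.
# project/dataset/table and dispositions are placeholders, not plugin defaults.
body = {
  configuration: {
    load: {
      destination_table: {
        project_id: 'your-project-000',
        dataset_id: 'your_dataset_name',
        table_id: 'your_table_name',
      },
      write_disposition: 'WRITE_APPEND',
      source_format: 'NEWLINE_DELIMITED_JSON',
      schema_update_options: %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION],
    }
  }
}
```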
@@ -371,10 +379,16 @@ module Embulk
         end
       end
 
-      def create_table(table, dataset: nil, options: {})
+      def create_table(table, dataset: nil, options: nil)
         begin
-          table = Helper.chomp_partition_decorator(table)
           dataset ||= @dataset
+          options ||= {}
+          options['time_partitioning'] ||= @task['time_partitioning']
+          if Helper.has_partition_decorator?(table)
+            options['time_partitioning'] ||= {'type' => 'DAY'}
+            table = Helper.chomp_partition_decorator(table)
+          end
+
           Embulk.logger.info { "embulk-output-bigquery: Create table... #{@project}:#{dataset}.#{table}" }
           body = {
             table_reference: {
@@ -384,12 +398,14 @@ module Embulk
               fields: fields,
             }
           }
+
           if options['time_partitioning']
             body[:time_partitioning] = {
               type: options['time_partitioning']['type'],
               expiration_ms: options['time_partitioning']['expiration_ms'],
             }
           end
+
           opts = {}
           Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{dataset}, #{body}, #{opts})" }
           with_network_retry { client.insert_table(@project, dataset, body, opts) }
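The net effect of the two hunks above: `create_table` now derives `time_partitioning` on its own, first from the task, then from a partition decorator on the table name, so callers no longer pass `options: task`. A hedged usage sketch (the `client` construction is omitted; table names follow the examples above):

```ruby
# A "$YYYYMMDD" decorator now implies time_partitioning {'type' => 'DAY'},
# so this bare call creates a day-partitioned table:
client.create_table('your_table_name$20160929')

# An explicit option still wins over the decorator-derived default,
# since resolution uses ||= :
client.create_table('your_table_name$20160929', options: {
  'time_partitioning' => {'type' => 'DAY', 'expiration_ms' => 259_200_000}
})
```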
data/test/test_bigquery_client.rb CHANGED
@@ -110,7 +110,7 @@ else
       client.delete_table('your_table_name')
       assert_nothing_raised do
         client.create_table('your_table_name$20160929', options:{
-          'time_partitioning' => {'type'=>'DAY'}
+          'time_partitioning' => {'type'=>'DAY', 'expiration_ms'=>1000}
         })
       end
     end
@@ -153,15 +153,15 @@ else
 
     sub_test_case "delete_partition" do
       def test_delete_partition
-        client.create_table('your_table_name$20160929', options:{
-          'time_partitioning' => {'type'=>'DAY'}
-        })
+        client.delete_table('your_table_name')
+        client.create_table('your_table_name$20160929')
         assert_nothing_raised { client.delete_partition('your_table_name$20160929') }
       ensure
         client.delete_table('your_table_name')
       end
 
       def test_delete_partition_of_non_partitioned_table
+        client.delete_table('your_table_name')
         client.create_table('your_table_name')
         assert_raise { client.delete_partition('your_table_name$20160929') }
       ensure
data/test/test_configure.rb CHANGED
@@ -266,6 +266,14 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         assert_equal 'DAY', task['time_partitioning']['type']
       end
+
+      def test_schema_update_options
+        config = least_config.merge('schema_update_options' => ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'])
+        assert_nothing_raised { Bigquery.configure(config, schema, processor_count) }
+
+        config = least_config.merge('schema_update_options' => ['FOO'])
+        assert_raise { Bigquery.configure(config, schema, processor_count) }
+      end
     end
   end
 end
data/test/test_transaction.rb CHANGED
@@ -55,7 +55,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).create_dataset(config['dataset'])
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
@@ -74,7 +74,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).create_dataset(config['dataset'])
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
@@ -87,7 +87,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).delete_table(config['table'])
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
@@ -98,7 +98,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).delete_partition(config['table'])
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
@@ -110,7 +110,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
           mock(obj).delete_table(config['temp_table'])
         end
@@ -122,7 +122,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).get_table(config['table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
           mock(obj).delete_table(config['temp_table'])
@@ -135,8 +135,8 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['temp_table'])
+          mock(obj).create_table(config['table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
           mock(obj).delete_table(config['temp_table'])
         end
@@ -151,7 +151,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).get_dataset(config['dataset_old'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
 
           mock(obj).get_table(task['table'])
           mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -168,7 +168,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).create_dataset(config['dataset'])
           mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
 
           mock(obj).get_table(task['table'])
           mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -185,7 +185,7 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).get_dataset(config['dataset_old'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).get_table(task['table'])
           mock(obj).get_table(task['table_old'], dataset: config['dataset_old'])
 
@@ -204,9 +204,9 @@ module Embulk
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
           mock(obj).get_dataset(config['dataset_old'])
-          mock(obj).create_table(config['temp_table'], options: task)
-          mock(obj).create_table(task['table'], options: task)
-          mock(obj).create_table(task['table_old'], dataset: config['dataset_old'], options: task)
+          mock(obj).create_table(config['temp_table'])
+          mock(obj).create_table(task['table'])
+          mock(obj).create_table(task['table_old'], dataset: config['dataset_old'])
 
           mock(obj).get_table(task['table'])
           mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -224,7 +224,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
           mock(obj).delete_table(config['temp_table'])
         end
@@ -236,7 +236,7 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
+          mock(obj).create_table(config['temp_table'])
           mock(obj).get_table(config['table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
           mock(obj).delete_table(config['temp_table'])
@@ -249,15 +249,14 @@ module Embulk
         task = Bigquery.configure(config, schema, processor_count)
         any_instance_of(BigqueryClient) do |obj|
           mock(obj).get_dataset(config['dataset'])
-          mock(obj).create_table(config['temp_table'], options: task)
-          mock(obj).create_table(config['table'], options: task)
+          mock(obj).create_table(config['temp_table'])
+          mock(obj).create_table(config['table'])
           mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
           mock(obj).delete_table(config['temp_table'])
         end
         Bigquery.transaction(config, schema, processor_count, &control)
       end
     end
-
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.4.1
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-01 00:00:00.000000000 Z
+date: 2016-10-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client
@@ -97,6 +97,7 @@ files:
 - README.md
 - Rakefile
 - embulk-output-bigquery.gemspec
+- example/config_append_direct_schema_update_options.yml
 - example/config_client_options.yml
 - example/config_csv.yml
 - example/config_delete_in_advance.yml