embulk-output-bigquery 0.4.0 → 0.4.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 71bc9b253f725436a06e183667cbc87720c3719b
- data.tar.gz: a32e43da05a4f90ab72c5715ffdf6b08501996d4
+ metadata.gz: 1ae4bf7af71e37194f768fad9e16e415747fee70
+ data.tar.gz: 796f2c3253d5600c439597f7ca495a7d1d8bac95
  SHA512:
- metadata.gz: bd3d8aefbc98c2f044b782f807f595603ac7b11052a06b6486803fd2f6871127058a50e9c69ffc1fac92b75de9561c57e99ad9ba3cd8899507e93085d45ed615
- data.tar.gz: 813b6455f463940968232b4332b8553698b9ef99ad4f3f5af6800b10223c33498fde9f8915604090f85dc7c2f78d16a865cd90da2174447c7f84ab3ef80a4cf8
+ metadata.gz: 3011c4128b2ed28a0fd84d0e2a592d706434ea24746c3aaf44f2e86f588b83da80e59b2931426cba0d390906ec283690966a4eb3b092d7d8edce364fc6ecc2b2
+ data.tar.gz: e969f903a71e5bf500fc57fb7a8b2ae2e4ffa8760557c329d3013de19959016ba7ef0578df3bdb7fb7cb0f62a3cb7a58f62cdfa23bb81d46476390884461e1a3
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## 0.4.1 - 2016-10-03
+
+ * [enhancement] Support `schema_update_options` option
+
  ## 0.4.0 - 2016-10-01

  * [enhancement] Support partitioned table
data/README.md CHANGED
@@ -100,9 +100,10 @@ Following options are same as [bq command-line tools](https://cloud.google.com/b
  | encoding | string | optional | "UTF-8" | `UTF-8` or `ISO-8859-1` |
  | ignore_unknown_values | boolean | optional | false | |
  | allow_quoted_newlines | boolean | optional | false | Set true if data contains newline characters. It may cause slow processing |
- | time_partitioning | hash | optional | nil | See [Time Partitioning](#time-partitioning) |
+ | time_partitioning | hash | optional | `{"type":"DAY"}` if `table` parameter has a partition decorator, otherwise nil | See [Time Partitioning](#time-partitioning) |
  | time_partitioning.type | string | required | nil | The only type supported is DAY, which will generate one partition per day based on data loading time. |
  | time_partitioning.expiration_ms | int | optional | nil | Number of milliseconds for which to keep the storage for a partition. |
+ | schema_update_options | array | optional | nil | List of `ALLOW_FIELD_ADDITION` or `ALLOW_FIELD_RELAXATION` or both. See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) |

  ### Example
 
@@ -365,7 +366,7 @@ Using `gcs_bucket` option, such strategy is enabled. You may also use `auto_crea
  out:
    type: bigquery
    gcs_bucket: bucket_name
-   auto_create_gcs_bucket: false
+   auto_create_gcs_bucket: true
  ```

  ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers into GCS.
@@ -391,11 +392,30 @@ out:
    type: bigquery
    table: table_name$20160929
    auto_create_table: true
-   time-partitioning:
+   time_partitioning:
      type: DAY
      expiration_ms: 259200000
  ```

+ Use `schema_update_options` to allow the schema of the destination table to be updated as a side effect of the load job:
+
+ ```yaml
+ out:
+   type: bigquery
+   table: table_name$20160929
+   auto_create_table: true
+   time_partitioning:
+     type: DAY
+     expiration_ms: 259200000
+   schema_update_options:
+     - ALLOW_FIELD_ADDITION
+     - ALLOW_FIELD_RELAXATION
+ ```
+
+ It seems that only adding a new column and relaxing a `REQUIRED` column to `NULLABLE` are supported now.
+ Deleting and renaming columns are not supported.
+ See [jobs#configuration.load.schemaUpdateOptions](https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load.schemaUpdateOptions) for details.
+
  ## Development

  ### Run example:
embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
  Gem::Specification.new do |spec|
    spec.name = "embulk-output-bigquery"
-   spec.version = "0.4.0"
+   spec.version = "0.4.1"
    spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
    spec.summary = "Google BigQuery output plugin for Embulk"
    spec.description = "Embulk plugin that insert records to Google BigQuery."
example/config_append_direct_schema_update_options.yml ADDED
@@ -0,0 +1,31 @@
+ in:
+   type: file
+   path_prefix: example/example.csv
+   parser:
+     type: csv
+     charset: UTF-8
+     newline: CRLF
+     null_string: 'NULL'
+     skip_header_lines: 1
+     comment_line_marker: '#'
+     columns:
+       - {name: date, type: string}
+       - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
+       - {name: "null", type: string}
+       - {name: long, type: long}
+       - {name: string, type: string}
+       - {name: double, type: double}
+       - {name: boolean, type: boolean}
+ out:
+   type: bigquery
+   mode: append_direct
+   auth_method: json_key
+   json_keyfile: example/your-project-000.json
+   dataset: your_dataset_name
+   table: your_table_name
+   source_format: NEWLINE_DELIMITED_JSON
+   compression: NONE
+   auto_create_dataset: true
+   auto_create_table: true
+   schema_file: example/schema.json
+   schema_update_options: [ALLOW_FIELD_ADDITION, ALLOW_FIELD_RELAXATION]
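This new example presumably runs with `embulk run example/config_append_direct_schema_update_options.yml` once `json_keyfile`, `dataset`, and `table` point at a real project; the `example/example.csv` and `example/schema.json` paths follow the gem's other example configs.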
lib/embulk/output/bigquery.rb CHANGED
@@ -86,6 +86,7 @@ module Embulk
    'ignore_unknown_values' => config.param('ignore_unknown_values', :bool, :default => false),
    'allow_quoted_newlines' => config.param('allow_quoted_newlines', :bool, :default => false),
    'time_partitioning' => config.param('time_partitioning', :hash, :default => nil),
+   'schema_update_options' => config.param('schema_update_options', :array, :default => nil),

    # for debug
    'skip_load' => config.param('skip_load', :bool, :default => false),
@@ -230,6 +231,14 @@ module Embulk
      task['time_partitioning'] = {'type' => 'DAY'}
    end

+   if task['schema_update_options']
+     task['schema_update_options'].each do |schema_update_option|
+       unless %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION].include?(schema_update_option)
+         raise ConfigError.new "`schema_update_options` must contain either of ALLOW_FIELD_ADDITION or ALLOW_FIELD_RELAXATION or both"
+       end
+     end
+   end
+
    task
  end

@@ -292,19 +301,19 @@ module Embulk
    else
      bigquery.delete_table(task['table'])
    end
-   bigquery.create_table(task['table'], options: task)
+   bigquery.create_table(task['table'])
  when 'replace', 'replace_backup', 'append'
-   bigquery.create_table(task['temp_table'], options: task)
+   bigquery.create_table(task['temp_table'])
    if task['time_partitioning']
      if task['auto_create_table']
-       bigquery.create_table(task['table'], options: task)
+       bigquery.create_table(task['table'])
      else
        bigquery.get_table(task['table']) # raises NotFoundError
      end
    end
  else # append_direct
    if task['auto_create_table']
-     bigquery.create_table(task['table'], options: task)
+     bigquery.create_table(task['table'])
    else
      bigquery.get_table(task['table']) # raises NotFoundError
    end
@@ -313,7 +322,7 @@ module Embulk
  if task['mode'] == 'replace_backup'
    if task['time_partitioning'] and Helper.has_partition_decorator?(task['table_old'])
      if task['auto_create_table']
-       bigquery.create_table(task['table_old'], dataset: task['dataset_old'], options: task)
+       bigquery.create_table(task['table_old'], dataset: task['dataset_old'])
      else
        bigquery.get_table(task['table_old'], dataset: task['dataset_old']) # raises NotFoundError
      end
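For reference, the `schema_update_options` guard added to `configure` above accepts only the two values BigQuery documents. A minimal standalone sketch of the same check (hypothetical helper; Embulk's `ConfigError` replaced by `ArgumentError` so it runs outside the plugin):

```ruby
# Standalone sketch of the validation added in the configure hunk above.
ALLOWED_SCHEMA_UPDATE_OPTIONS = %w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION]

def validate_schema_update_options!(options)
  Array(options).each do |option|
    unless ALLOWED_SCHEMA_UPDATE_OPTIONS.include?(option)
      raise ArgumentError, "`schema_update_options` must contain either of " \
                           "ALLOW_FIELD_ADDITION or ALLOW_FIELD_RELAXATION or both"
    end
  end
end

validate_schema_update_options!(%w[ALLOW_FIELD_ADDITION ALLOW_FIELD_RELAXATION]) # passes
# validate_schema_update_options!(%w[FOO])  # => raises ArgumentError
```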
lib/embulk/output/bigquery/bigquery_client.rb CHANGED
@@ -194,6 +194,10 @@ module Embulk
      }
    }

+   if @task['schema_update_options']
+     body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
+   end
+
    opts = {
      upload_source: path,
      content_type: "application/octet-stream",
@@ -254,6 +258,10 @@ module Embulk
      }
    }

+   if @task['schema_update_options']
+     body[:configuration][:load][:schema_update_options] = @task['schema_update_options']
+   end
+
    opts = {}
    Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
    response = with_network_retry { client.insert_job(@project, body, opts) }
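Both load paths in the hunks above gain the same conditional, so a job built with the option set carries one extra key in its configuration. A minimal sketch of the resulting shape, assuming `@task['schema_update_options']` holds the two-element list from the README example (other load settings elided):

```ruby
# Sketch of the insert_job body contributed by the added lines.
body = {
  configuration: {
    load: {
      # ...destination_table, schema, source_format, etc. as before...
      schema_update_options: ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'],
    }
  }
}
# body is then passed to client.insert_job(@project, body, opts) as in the hunk above.
```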
@@ -371,10 +379,16 @@ module Embulk
      end
    end

-   def create_table(table, dataset: nil, options: {})
+   def create_table(table, dataset: nil, options: nil)
      begin
-       table = Helper.chomp_partition_decorator(table)
        dataset ||= @dataset
+       options ||= {}
+       options['time_partitioning'] ||= @task['time_partitioning']
+       if Helper.has_partition_decorator?(table)
+         options['time_partitioning'] ||= {'type' => 'DAY'}
+         table = Helper.chomp_partition_decorator(table)
+       end
+
        Embulk.logger.info { "embulk-output-bigquery: Create table... #{@project}:#{dataset}.#{table}" }
        body = {
          table_reference: {
@@ -384,12 +398,14 @@ module Embulk
          fields: fields,
        }
      }
+
      if options['time_partitioning']
        body[:time_partitioning] = {
          type: options['time_partitioning']['type'],
          expiration_ms: options['time_partitioning']['expiration_ms'],
        }
      end
+
      opts = {}
      Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{dataset}, #{body}, #{opts})" }
      with_network_retry { client.insert_table(@project, dataset, body, opts) }
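Net effect of the two `create_table` hunks: callers stop threading `options: task` through, and the client now derives `time_partitioning` itself, defaulting to daily partitioning whenever the table name carries a partition decorator. A sketch of the resulting behavior (table name borrowed from the tests below; assumes `@task['time_partitioning']` is nil):

```ruby
client.create_table('your_table_name$20160929')
# 1. options defaults to {} and picks up @task['time_partitioning'] (nil here).
# 2. The '$20160929' decorator is detected, so options['time_partitioning']
#    falls back to {'type' => 'DAY'} and the decorator is chomped.
# 3. insert_table receives table_id 'your_table_name' with
#    body[:time_partitioning] = {type: 'DAY', expiration_ms: nil}.
```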
test/test_bigquery_client.rb CHANGED
@@ -110,7 +110,7 @@ else
    client.delete_table('your_table_name')
    assert_nothing_raised do
      client.create_table('your_table_name$20160929', options:{
-       'time_partitioning' => {'type'=>'DAY'}
+       'time_partitioning' => {'type'=>'DAY', 'expiration_ms'=>1000}
      })
    end
  end
@@ -153,15 +153,15 @@ else

  sub_test_case "delete_partition" do
    def test_delete_partition
-     client.create_table('your_table_name$20160929', options:{
-       'time_partitioning' => {'type'=>'DAY'}
-     })
+     client.delete_table('your_table_name')
+     client.create_table('your_table_name$20160929')
      assert_nothing_raised { client.delete_partition('your_table_name$20160929') }
    ensure
      client.delete_table('your_table_name')
    end

    def test_delete_partition_of_non_partitioned_table
+     client.delete_table('your_table_name')
      client.create_table('your_table_name')
      assert_raise { client.delete_partition('your_table_name$20160929') }
    ensure
test/test_configure.rb CHANGED
@@ -266,6 +266,14 @@ module Embulk
      task = Bigquery.configure(config, schema, processor_count)
      assert_equal 'DAY', task['time_partitioning']['type']
    end
+
+   def test_schema_update_options
+     config = least_config.merge('schema_update_options' => ['ALLOW_FIELD_ADDITION', 'ALLOW_FIELD_RELAXATION'])
+     assert_nothing_raised { Bigquery.configure(config, schema, processor_count) }
+
+     config = least_config.merge('schema_update_options' => ['FOO'])
+     assert_raise { Bigquery.configure(config, schema, processor_count) }
+   end
  end
  end
  end
test/test_transaction.rb CHANGED
@@ -55,7 +55,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).create_dataset(config['dataset'])
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -74,7 +74,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).create_dataset(config['dataset'])
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -87,7 +87,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).delete_table(config['table'])
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -98,7 +98,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).delete_partition(config['table'])
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
@@ -110,7 +110,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
      mock(obj).delete_table(config['temp_table'])
    end
@@ -122,7 +122,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).get_table(config['table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
      mock(obj).delete_table(config['temp_table'])
@@ -135,8 +135,8 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['temp_table'])
+     mock(obj).create_table(config['table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
      mock(obj).delete_table(config['temp_table'])
    end
@@ -151,7 +151,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).get_dataset(config['dataset_old'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])

      mock(obj).get_table(task['table'])
      mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -168,7 +168,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).create_dataset(config['dataset'])
      mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])

      mock(obj).get_table(task['table'])
      mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -185,7 +185,7 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).get_dataset(config['dataset_old'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).get_table(task['table'])
      mock(obj).get_table(task['table_old'], dataset: config['dataset_old'])

@@ -204,9 +204,9 @@ module Embulk
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
      mock(obj).get_dataset(config['dataset_old'])
-     mock(obj).create_table(config['temp_table'], options: task)
-     mock(obj).create_table(task['table'], options: task)
-     mock(obj).create_table(task['table_old'], dataset: config['dataset_old'], options: task)
+     mock(obj).create_table(config['temp_table'])
+     mock(obj).create_table(task['table'])
+     mock(obj).create_table(task['table_old'], dataset: config['dataset_old'])

      mock(obj).get_table(task['table'])
      mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
@@ -224,7 +224,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
      mock(obj).delete_table(config['temp_table'])
    end
@@ -236,7 +236,7 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
+     mock(obj).create_table(config['temp_table'])
      mock(obj).get_table(config['table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
      mock(obj).delete_table(config['temp_table'])
@@ -249,15 +249,14 @@ module Embulk
    task = Bigquery.configure(config, schema, processor_count)
    any_instance_of(BigqueryClient) do |obj|
      mock(obj).get_dataset(config['dataset'])
-     mock(obj).create_table(config['temp_table'], options: task)
-     mock(obj).create_table(config['table'], options: task)
+     mock(obj).create_table(config['temp_table'])
+     mock(obj).create_table(config['table'])
      mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
      mock(obj).delete_table(config['temp_table'])
    end
    Bigquery.transaction(config, schema, processor_count, &control)
  end
  end
-
  end
  end
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-bigquery
  version: !ruby/object:Gem::Version
-   version: 0.4.0
+   version: 0.4.1
  platform: ruby
  authors:
  - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-10-01 00:00:00.000000000 Z
+ date: 2016-10-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: google-api-client
@@ -97,6 +97,7 @@ files:
  - README.md
  - Rakefile
  - embulk-output-bigquery.gemspec
+ - example/config_append_direct_schema_update_options.yml
  - example/config_client_options.yml
  - example/config_csv.yml
  - example/config_delete_in_advance.yml