fluent-plugin-bigquery-custom 0.3.0 → 0.3.1
- checksums.yaml +4 -4
- data/README.md +29 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/out_bigquery.rb +38 -13
- data/test/plugin/test_out_bigquery.rb +18 -7
- metadata +2 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1d8724c6a0d0b2cfbdc48170fae356e341b70880
+  data.tar.gz: 0ce6f9e76d9cfb56a09ea34a6ad2023ec653d4bb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7fa15645710affa98c22858899a7053c598c3aa9a025ebc4c35471e21ad6f3d483d9b2a7fc6a12f4bf1ec52a926e911085680971503abb74ed7e867af8eef058
+  data.tar.gz: 4a0473a65349f20448f9ac7c1265630907ce29d036f82d1a6e153cf468a88bd0b8a241d7a7e806748b0b25c6b8649943c6bbbd9ab9b04bf79d2f1aaa6a65b0ec
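These are the SHA1 and SHA512 digests RubyGems records for the two archives packed inside the `.gem` file. A download can be sanity-checked locally; a minimal sketch, assuming the raw gem archive has already been extracted (a `.gem` is a plain tar file containing `metadata.gz` and `data.tar.gz`):

```ruby
require 'digest'

# After `tar xf fluent-plugin-bigquery-custom-0.3.1.gem`, metadata.gz and
# data.tar.gz sit in the current directory. Recompute the SHA1 of metadata.gz
# and compare it with the value recorded in checksums.yaml above.
actual   = Digest::SHA1.file('metadata.gz').hexdigest
expected = '1d8724c6a0d0b2cfbdc48170fae356e341b70880'
puts(actual == expected ? 'metadata.gz checksum OK' : 'metadata.gz checksum MISMATCH')
```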
data/README.md
CHANGED

@@ -26,6 +26,7 @@ OAuth flow for installed applications.
 - `skip_invalid_rows`
 - `max_bad_records`
 - `ignore_unknown_values`
+- `prevent_duplicate_load`
 - Improve error handling
 
 ## Configuration

@@ -407,6 +408,34 @@ You can set `insert_id_field` option to specify the field to use as `insertId` property
 </match>
 ```
 
+### Prevent duplicate load
+
+If you want to detect duplicate load jobs, set `prevent_duplicate_load` to `true`.
+`prevent_duplicate_load` makes the load job_id deterministic.
+For example, even if the fluentd process crashes while waiting for a job, fluentd can resume waiting for the same job after a restart.
+
+```apache
+<match dummy>
+  type bigquery
+
+  ...
+
+  prevent_duplicate_load true
+</match>
+```
+
+The job_id is calculated as a SHA1 digest over the following factors:
+
+- upload source path (file buffer path)
+- dataset
+- table
+- schema
+- `max_bad_records`
+- `ignore_unknown_values`
+
+NOTE: A duplicate-job error does not invoke `flush_secondary`.
+NOTE: This option takes effect only when the file buffer is used.
+
 ## TODO
 
 * Automatically configured flush/buffer options
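The resume behavior described in the README section above relies on BigQuery rejecting a `jobs.insert` that reuses an existing job id. Here is a minimal sketch of that flow, using the same `google-api-client` calls that appear in the plugin diff below (`insert_job`, `get_job`); the helper itself is illustrative and not part of the gem:

```ruby
require 'google/apis/bigquery_v2'

# With a deterministic job_id, resubmitting the same chunk after a crash trips
# BigQuery's duplicate-job check: jobs.insert answers HTTP 409 for an id that
# already exists, so the caller can go straight back to polling that job.
def submit_or_resume(client, project, configuration, upload_source, job_id)
  client.insert_job(project, configuration,
                    upload_source: upload_source,
                    content_type: 'application/octet-stream')
rescue Google::Apis::ClientError => e
  raise unless e.status_code == 409 && e.message =~ /Job/
  client.get_job(project, job_id) # the job from the previous run; keep waiting on it
end
```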
data/lib/fluent/plugin/out_bigquery.rb
CHANGED

@@ -102,7 +102,10 @@ module Fluent
     config_param :utc, :bool, default: nil
     config_param :time_field, :string, default: nil
 
+    # insert_id_field (only insert)
     config_param :insert_id_field, :string, default: nil
+    # prevent_duplicate_load (only load)
+    config_param :prevent_duplicate_load, :bool, default: false
 
     config_param :method, :string, default: 'insert' # or 'load'
 
@@ -164,6 +167,7 @@ module Fluent
       if @method == "insert"
         extend(InsertImplementation)
       elsif @method == "load"
+        require 'digest/sha1'
         extend(LoadImplementation)
       else
         raise Fluent::ConfigError, "'method' must be 'insert' or 'load'"
@@ -422,10 +426,13 @@ module Fluent
         raise "table created. send rows next time."
       end
 
+      reason = e.respond_to?(:reason) ? e.reason : nil
       log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
-
-
-
+
+      raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
+
+      # other error handling
+      if @secondary
         flush_secondary(@secondary)
       end
     end
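The branching added in this hunk is easier to read in isolation: derive a `reason` defensively (not every `Google::Apis` error class responds to `#reason`), retry only transient `backendError`s, and hand everything else to the `<secondary>` output. A condensed sketch; `handle_insert_error` is a hypothetical name, and `flush_secondary` is fluentd's standard failover hook for `<secondary>` outputs:

```ruby
# Illustrative restatement of the rescue body above, not the gem's code.
def handle_insert_error(e)
  reason = e.respond_to?(:reason) ? e.reason : nil

  # backendError is transient on BigQuery's side; raising makes fluentd's
  # buffer keep the chunk and retry it later.
  raise "failed to insert into bigquery, retry" if reason == "backendError"

  # Anything else is treated as permanent: route the chunk to <secondary>
  # (if configured) instead of retrying forever.
  flush_secondary(@secondary) if @secondary
end
```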
@@ -447,8 +454,13 @@ module Fluent
 
       def _write(chunk, table_id)
         res = nil
+        job_id = nil
+
         create_upload_source(chunk) do |upload_source|
-
+          if @prevent_duplicate_load
+            job_id = create_job_id(upload_source.path, @dataset, @table, @fields.to_a, @max_bad_records, @ignore_unknown_values)
+          end
+          configuration = {
             configuration: {
               load: {
                 destination_table: {
@@ -465,28 +477,37 @@ module Fluent
                 max_bad_records: @max_bad_records,
               }
             }
-        }
+          }
+          configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+          res = client.insert_job(@project, configuration, {upload_source: upload_source, content_type: "application/octet-stream"})
         end
-
+
+        wait_load(res.job_reference.job_id)
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         # api_error? -> client cache clear
         @cached_client = nil
 
-
-
-
-
+        reason = e.respond_to?(:reason) ? e.reason : nil
+        log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
+
+        raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
+        return wait_load(job_id) if e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+
+        # other error handling
+        if @secondary
          flush_secondary(@secondary)
         end
       end
 
       private
 
-      def wait_load(
+      def wait_load(job_id)
         wait_interval = 10
-        _response =
+        _response = client.get_job(@project, job_id)
+        table_id = _response.configuration.load.destination_table.table_id
+
         until _response.status.state == "DONE"
-          log.debug "wait for load job finish", state: _response.status.state
+          log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
           sleep wait_interval
           _response = client.get_job(@project, _response.job_reference.job_id)
         end
@@ -527,6 +548,10 @@ module Fluent
           end
         end
       end
+
+      def create_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
+        "fluentd_job_" + Digest::SHA1.hexdigest("#{upload_source_path}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
+      end
     end
 
     class FieldSchema
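`create_job_id` is a pure function of its six inputs, so the same buffer chunk aimed at the same table always produces the same job id, which is exactly what lets BigQuery deduplicate the load. A quick check with made-up values (path, dataset, and table are illustrative):

```ruby
require 'digest/sha1'

# Same derivation as in the diff above.
def create_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
  "fluentd_job_" + Digest::SHA1.hexdigest("#{upload_source_path}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
end

first = create_job_id("/var/log/fluent/buffer.b1.log", "yourdataset_id", "foo", [], 0, false)
again = create_job_id("/var/log/fluent/buffer.b1.log", "yourdataset_id", "foo", [], 0, false)
first == again # => true: a resubmission reuses the id, BigQuery answers 409,
               #    and the plugin resumes wait_load instead of loading twice
```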
data/test/plugin/test_out_bigquery.rb
CHANGED

@@ -881,13 +881,24 @@ class BigQueryOutputTest < Test::Unit::TestCase
           }
         }
       }, {upload_source: io, content_type: "application/octet-stream"}) {
-
-
-
-
-
-
-
+        Google::Apis::BigqueryV2::Job.new({
+          job_reference: Google::Apis::BigqueryV2::JobReference.new({job_id: "job_id"})
+        })
+      }
+
+      expect.get_job('yourproject_id', "job_id") {
+        Google::Apis::BigqueryV2::Job.new({
+          configuration: Google::Apis::BigqueryV2::JobConfiguration.new({
+            load: Google::Apis::BigqueryV2::JobConfigurationLoad.new({
+              destination_table: Google::Apis::BigqueryV2::TableReference.new({
+                project_id: 'yourproject_id',
+                dataset_id: 'yourdataset_id',
+                table_id: 'foo',
+              }),
+            })
+          }),
+          status: Google::Apis::BigqueryV2::JobStatus.new({state: "DONE"}),
+        })
     }
   end
 
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery-custom
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Tomohiro Hashidate
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-
+date: 2016-01-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake