embulk-output-bigquery 0.3.4 → 0.3.5

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f68ceb57a4eff6886157c585425526389623d0a2
-  data.tar.gz: b44323059a3057bb5de7fdd7b00d61ce970f3386
+  metadata.gz: b0856b220a3d9c7b78dbffe45b35edf8e10b4fba
+  data.tar.gz: 8ce985e90cfd9aa9b88c6cb7ec994d6e89184584
 SHA512:
-  metadata.gz: 5cc7b1245bda2ae8c5d581c67a09ce0685c7812658c3c47e195362290fd50c13abfb7a3e9bb2360bc01a6d6aa82009ce190bef667cfb1df2cddaeb653c162c14
-  data.tar.gz: 4f8611f292a61750568c7b15e7ae6f83bc83d09ae3f64b2359b8f6f4e4d4b7ac115e09e6e8fbb5cc2e98b89103b8d1aba0640e0d89b035dbab2e5feea0d47449
+  metadata.gz: 02c3432b3494df8ca2f1901dda75488380574a7047228434a800d1578975ce9b6abb0bd65a0a5fc95285077d6963d19160241ed96e2b42d28cd30920e8c66230
+  data.tar.gz: dd7aec6748550836b6195d5ee7d403ca126a52d194fe9a0a8e3bf7257bb7639931f3e6df7fde2908a1e8f610577b085ca30c50876a800040ff7b71aedb80e1f8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+## 0.3.5 - 2016-06-13
+
+* [enhancement] retry backendError and internalError in waiting load job
+* [enhancement] retry Broken pipe and Connection reset in inserting object to GCS
+
 ## 0.3.4 - 2016-06-01
 
 * [new feature] Add `gcs_bucket` option to load multiple files from a GCS bucket with one load job
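
Both 0.3.5 enhancements are bounded by the plugin's existing `retries` option, which shows up as `@task['retries']` in the code changes below. As a rough illustration of how a user would cap the new retry loops from an Embulk config — every value here other than `type: bigquery` and `retries` is a placeholder:

```yaml
out:
  type: bigquery
  mode: append
  auth_method: json_key            # placeholder; any supported auth method works
  json_keyfile: /path/to/key.json  # placeholder path
  project: my-project
  dataset: my_dataset
  table: my_table
  retries: 3                       # caps both the load-job retry and the GCS insert retry
```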
data/embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.4"
+  spec.version = "0.3.5"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
data/lib/embulk/output/bigquery/bigquery_client.rb CHANGED
@@ -40,59 +40,77 @@ module Embulk
           self.fields
         end
 
-        # @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
-        # @return [Array] responses
-        def load_from_gcs(object_uris, table)
+        def with_retry_job(&block)
+          retries = 0
           begin
-            # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
-            # we should generate job_id in client code, otherwise, retrying would cause duplication
-            if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
-              job_id = Helper.create_load_job_id(@task, path, fields)
+            yield
+          rescue BackendError, InternalError => e
+            if retries < @task['retries']
+              retries += 1
+              Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.message}" }
+              retry
             else
-              job_id = "embulk_load_job_#{SecureRandom.uuid}"
+              Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.message}" }
+              raise e
             end
-            Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
+          end
+        end
 
-            body = {
-              job_reference: {
-                project_id: @project,
-                job_id: job_id,
-              },
-              configuration: {
-                load: {
-                  destination_table: {
-                    project_id: @project,
-                    dataset_id: @dataset,
-                    table_id: table,
-                  },
-                  schema: {
-                    fields: fields,
-                  },
-                  write_disposition: 'WRITE_APPEND',
-                  source_format: @task['source_format'],
-                  max_bad_records: @task['max_bad_records'],
-                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
-                  encoding: @task['encoding'],
-                  ignore_unknown_values: @task['ignore_unknown_values'],
-                  allow_quoted_newlines: @task['allow_quoted_newlines'],
-                  source_uris: object_uris,
+        # @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
+        # @return [Array] responses
+        def load_from_gcs(object_uris, table)
+          with_retry_job do
+            begin
+              # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+              # we should generate job_id in client code, otherwise, retrying would cause duplication
+              if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+                job_id = Helper.create_load_job_id(@task, path, fields)
+              else
+                job_id = "embulk_load_job_#{SecureRandom.uuid}"
+              end
+              Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
+
+              body = {
+                job_reference: {
+                  project_id: @project,
+                  job_id: job_id,
+                },
+                configuration: {
+                  load: {
+                    destination_table: {
+                      project_id: @project,
+                      dataset_id: @dataset,
+                      table_id: table,
+                    },
+                    schema: {
+                      fields: fields,
+                    },
+                    write_disposition: 'WRITE_APPEND',
+                    source_format: @task['source_format'],
+                    max_bad_records: @task['max_bad_records'],
+                    field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                    encoding: @task['encoding'],
+                    ignore_unknown_values: @task['ignore_unknown_values'],
+                    allow_quoted_newlines: @task['allow_quoted_newlines'],
+                    source_uris: object_uris,
+                  }
                 }
               }
-            }
-            opts = {}
+              opts = {}
 
-            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-            response = client.insert_job(@project, body, opts)
-            unless @task['is_skip_job_result_check']
-              response = wait_load('Load', response)
+              Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+              response = client.insert_job(@project, body, opts)
+              unless @task['is_skip_job_result_check']
+                response = wait_load('Load', response)
+              end
+              [response]
+            rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+              response = {status_code: e.status_code, message: e.message, error_class: e.class}
+              Embulk.logger.error {
+                "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+              }
+              raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
             end
-            [response]
-          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-            response = {status_code: e.status_code, message: e.message, error_class: e.class}
-            Embulk.logger.error {
-              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-            }
-            raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
           end
         end
 
@@ -126,90 +144,93 @@ module Embulk
         end
 
         def load(path, table)
-          begin
-            if File.exist?(path)
-              # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
-              # we should generate job_id in client code, otherwise, retrying would cause duplication
-              if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
-                job_id = Helper.create_load_job_id(@task, path, fields)
+          with_retry_job do
+            begin
+              if File.exist?(path)
+                # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+                # we should generate job_id in client code, otherwise, retrying would cause duplication
+                if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+                  job_id = Helper.create_load_job_id(@task, path, fields)
+                else
+                  job_id = "embulk_load_job_#{SecureRandom.uuid}"
+                end
+                Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
              else
-                job_id = "embulk_load_job_#{SecureRandom.uuid}"
+                Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
+                return
              end
-              Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
-            else
-              Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
-              return
-            end
 
-            body = {
-              job_reference: {
-                project_id: @project,
-                job_id: job_id,
-              },
-              configuration: {
-                load: {
-                  destination_table: {
-                    project_id: @project,
-                    dataset_id: @dataset,
-                    table_id: table,
-                  },
-                  schema: {
-                    fields: fields,
-                  },
-                  write_disposition: 'WRITE_APPEND',
-                  source_format: @task['source_format'],
-                  max_bad_records: @task['max_bad_records'],
-                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
-                  encoding: @task['encoding'],
-                  ignore_unknown_values: @task['ignore_unknown_values'],
-                  allow_quoted_newlines: @task['allow_quoted_newlines'],
+              body = {
+                job_reference: {
+                  project_id: @project,
+                  job_id: job_id,
+                },
+                configuration: {
+                  load: {
+                    destination_table: {
+                      project_id: @project,
+                      dataset_id: @dataset,
+                      table_id: table,
+                    },
+                    schema: {
+                      fields: fields,
+                    },
+                    write_disposition: 'WRITE_APPEND',
+                    source_format: @task['source_format'],
+                    max_bad_records: @task['max_bad_records'],
+                    field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                    encoding: @task['encoding'],
+                    ignore_unknown_values: @task['ignore_unknown_values'],
+                    allow_quoted_newlines: @task['allow_quoted_newlines'],
+                  }
                 }
               }
-            }
 
-            opts = {
-              upload_source: path,
-              content_type: "application/octet-stream",
-              # options: {
-              #   retries: @task['retries'],
-              #   timeout_sec: @task['timeout_sec'],
-              #   open_timeout_sec: @task['open_timeout_sec']
-              # },
-            }
-            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-            response = client.insert_job(@project, body, opts)
-            unless @task['is_skip_job_result_check']
-              response = wait_load('Load', response)
+              opts = {
+                upload_source: path,
+                content_type: "application/octet-stream",
+                # options: {
+                #   retries: @task['retries'],
+                #   timeout_sec: @task['timeout_sec'],
+                #   open_timeout_sec: @task['open_timeout_sec']
+                # },
+              }
+              Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+              response = client.insert_job(@project, body, opts)
+              unless @task['is_skip_job_result_check']
+                response = wait_load('Load', response)
+              end
+            rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+              response = {status_code: e.status_code, message: e.message, error_class: e.class}
+              Embulk.logger.error {
+                "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+              }
+              raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
            end
-          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-            response = {status_code: e.status_code, message: e.message, error_class: e.class}
-            Embulk.logger.error {
-              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-            }
-            raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
          end
        end
 
        def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
-          begin
-            destination_dataset ||= @dataset
-            job_id = "embulk_copy_job_#{SecureRandom.uuid}"
+          with_retry_job do
+            begin
+              destination_dataset ||= @dataset
+              job_id = "embulk_copy_job_#{SecureRandom.uuid}"
 
-            Embulk.logger.info {
-              "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
-              "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
-            }
+              Embulk.logger.info {
+                "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
+                "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
+              }
 
-            body = {
-              job_reference: {
-                project_id: @project,
-                job_id: job_id,
-              },
-              configuration: {
-                copy: {
-                  create_deposition: 'CREATE_IF_NEEDED',
-                  write_disposition: write_disposition,
-                  source_table: {
+              body = {
+                job_reference: {
+                  project_id: @project,
+                  job_id: job_id,
+                },
+                configuration: {
+                  copy: {
+                    create_deposition: 'CREATE_IF_NEEDED',
+                    write_disposition: write_disposition,
+                    source_table: {
                     project_id: @project,
                     dataset_id: @dataset,
                     table_id: source_table,
@@ -219,21 +240,22 @@ module Embulk
                    dataset_id: destination_dataset,
                    table_id: destination_table,
                  },
+                  }
                }
              }
-            }
 
-            opts = {}
-            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-            response = client.insert_job(@project, body, opts)
-            wait_load('Copy', response)
-          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-            response = {status_code: e.status_code, message: e.message, error_class: e.class}
-            Embulk.logger.error {
-              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-            }
-            raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
-              "to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
+              opts = {}
+              Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+              response = client.insert_job(@project, body, opts)
+              wait_load('Copy', response)
+            rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+              response = {status_code: e.status_code, message: e.message, error_class: e.class}
+              Embulk.logger.error {
+                "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+              }
+              raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
+                "to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
+            end
          end
        end
 
@@ -273,11 +295,15 @@ module Embulk
          # `errors` returns Array<Google::Apis::BigqueryV2::ErrorProto> if any error exists.
          # Otherwise, this returns nil.
          if _errors = _response.status.errors
-            Embulk.logger.error {
-              "embulk-output-bigquery: get_job(#{@project}, #{job_id}), " \
-              "errors:#{_errors.map(&:to_h)}"
-            }
-            raise Error, "failed during waiting a #{kind} job, errors:#{_errors.map(&:to_h)}"
+            msg = "failed during waiting a #{kind} job, get_job(#{@project}, #{job_id}), errors:#{_errors.map(&:to_h)}"
+            if _errors.any? {|error| error.reason == 'backendError' }
+              raise BackendError, msg
+            elsif _errors.any? {|error| error.reason == 'internalError' }
+              raise InternalError, msg
+            else
+              Embulk.logger.error { "embulk-output-bigquery: #{msg}" }
+              raise Error, msg
+            end
          end
 
          Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
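
Read together, the bigquery_client.rb changes split failure handling into two layers: `wait_load` now classifies BigQuery job errors by their `reason` field and raises `BackendError` or `InternalError` for the transient ones, while `with_retry_job` re-runs the entire job block when it catches either, up to `@task['retries']` times. A condensed, runnable sketch of that flow — not the plugin's exact code; `MAX_RETRIES` stands in for `@task['retries']` and the fake job block stands in for `insert_job` plus `wait_load`:

```ruby
# Mirrors the BackendError class added in google_client.rb.
class BackendError < StandardError; end

MAX_RETRIES = 2  # stand-in for @task['retries']

def with_retry_job
  retries = 0
  begin
    yield
  rescue BackendError => e
    raise e unless retries < MAX_RETRIES  # retry budget exhausted
    retries += 1
    puts "embulk-output-bigquery: retry ##{retries}, #{e.message}"
    retry  # re-runs the begin block, i.e. the whole job
  end
end

attempts = 0
with_retry_job do
  attempts += 1
  # Simulate wait_load raising after finding reason 'backendError'
  # in response.status.errors on the first attempt.
  raise BackendError, "backendError while waiting load job" if attempts == 1
  puts "load job succeeded on attempt #{attempts}"
end
```

Because `retry` re-enters the block from the top, a fresh random `job_id` is generated on every attempt unless `prevent_duplicate_insert` is set, in which case `Helper.create_load_job_id` derives a deterministic id; that is why the job-id logic sits inside the retried block.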
data/lib/embulk/output/bigquery/gcs_client.rb CHANGED
@@ -1,4 +1,5 @@
 require 'uri'
+require 'java'
 require 'google/apis/storage_v1'
 require_relative 'google_client'
 require_relative 'helper'
@@ -49,6 +50,7 @@ module Embulk
           object_uri = URI.join("gs://#{bucket}", object).to_s
 
           started = Time.now
+          retries = 0
           begin
             Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
             body = {
@@ -68,6 +70,21 @@ module Embulk
               "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
             }
             raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
+          rescue ::Java::Java.net.SocketException => e
+            # I encountered `java.net.SocketException: Broken pipe` and `Connection reset` serveral times
+            # I am doubting as this is caused by Google's unstable network
+            # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
+            if e.message == 'Broken pipe' || e.message == 'Connection reset'
+              if retries < @task['retries']
+                response = {message: e.message, error_class: e.class}
+                Embulk.logger.warn {
+                  "embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
+                }
+                retries += 1 # want to share with google-api-ruby-client, but it is difficult
+                retry
+              end
+            end
+            raise e
           end
         end
 
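gcs_client.rb adds its own, narrower retry: under JRuby (Embulk's runtime), a flaky connection to GCS surfaces as `java.net.SocketException`, which google-api-ruby-client's built-in retry does not cover, so the plugin retries only the two messages it has observed in practice. A minimal sketch of that guard, with `TransientNetworkError` as a plain-Ruby stand-in for the JRuby exception and `max_retries` for `@task['retries']`:

```ruby
# Stand-in for JRuby's java.net.SocketException so the sketch runs on any Ruby.
class TransientNetworkError < RuntimeError; end

RETRYABLE_MESSAGES = ['Broken pipe', 'Connection reset']

def insert_object_with_retry(max_retries)
  retries = 0
  begin
    yield
  rescue TransientNetworkError => e
    # Only the two known-transient messages are retried, and only
    # while the retry budget lasts.
    if RETRYABLE_MESSAGES.include?(e.message) && retries < max_retries
      retries += 1
      puts "embulk-output-bigquery: RETRY ##{retries}: #{e.message}"
      retry
    end
    raise e  # anything else, or an exhausted budget, propagates unchanged
  end
end

calls = 0
insert_object_with_retry(2) do
  calls += 1
  raise TransientNetworkError, 'Connection reset' if calls == 1
  puts "insert_object succeeded on call #{calls}"
end
```

The string match is deliberately narrow: any other `SocketException` falls through to `raise e`, so genuinely new failure modes still surface instead of being silently retried.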
data/lib/embulk/output/bigquery/google_client.rb CHANGED
@@ -6,6 +6,8 @@ module Embulk
       class Error < StandardError; end
       class JobTimeoutError < Error; end
       class NotFoundError < Error; end
+      class BackendError < Error; end
+      class InternalError < Error; end
 
       class GoogleClient
         def initialize(task, scope, client_class)
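
The two new classes slot into the existing hierarchy, so rescue clauses can separate transient from fatal failures by class rather than by string matching, while any older `rescue Error` code keeps catching everything. A small self-contained illustration — the class hierarchy is from the diff; the `classify` helper around it is ours:

```ruby
class Error < StandardError; end
class JobTimeoutError < Error; end
class NotFoundError < Error; end
class BackendError < Error; end   # BigQuery error reason 'backendError'
class InternalError < Error; end  # BigQuery error reason 'internalError'

# Hypothetical helper: with_retry_job effectively makes this distinction
# via its rescue clause.
def classify(e)
  case e
  when BackendError, InternalError then :retryable
  when Error then :fatal
  end
end

puts classify(BackendError.new)    # => retryable
puts classify(JobTimeoutError.new) # => fatal
```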
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-06-01 00:00:00.000000000 Z
+date: 2016-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client