fluent-plugin-bigquery 1.2.0 → 2.0.0.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: ef9d7a814ddde267ffd9bf2e10837c162a01b12b
- data.tar.gz: 5ca29129b5dfb449abd7e1a22d6d22fcba47a862
+ metadata.gz: '07998acf05ddb3e647da13a4b5c734dc16f8cc77'
+ data.tar.gz: 1fce9fc906cbf72083a4f8132c0ac1d985a95d6d
  SHA512:
- metadata.gz: 658fb6ece7816aeb711960179fa53544fe639f97aa41bbfea1a0fe007edf1a54fe11aef4c63f0ea1d42f2b2582c9ed335c5919aaec2ec18f3d53fdfa3e9d9be1
- data.tar.gz: 5ac9138e736eaf7dcf5b9fe1ecd46b92c67979313d93862fd061640f5dc5fc8f4bef3364cb0f053ab3a5a9d7d34ffeb1fef58dbbb532f59f99befe899b9066c6
+ metadata.gz: 04cfd6d3080d9424e25bd75ae1a9600259fe94ed933adceab66c02eb11afdb49eeddc393c305f0927dd64f967d1e72835fde9566cd54b2e53805e85ffe7a1516
+ data.tar.gz: 8de74527cf12be2c6553e4a582cc25c47a1773cdc165800f212aae563f7ffa048679260515a51f55e244b641b968badeaa4349cf4369ad2363d22aff1c1cbe7d
@@ -1,20 +1,13 @@
  language: ruby

  rvm:
- - 2.1
  - 2.2
  - 2.3.3
+ - 2.4.3
+ - 2.5.0

  gemfile:
  - Gemfile
- - gemfiles/activesupport-4.gemfile
-
- matrix:
-   exclude:
-     - rvm: 2.0
-       gemfile: Gemfile
-     - rvm: 2.1
-       gemfile: Gemfile

  before_install:
  - gem update bundler
data/README.md CHANGED
@@ -1,13 +1,17 @@
  # fluent-plugin-bigquery

+ **This README is for v2.0.0.beta, which has not been released yet.**
+
  [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery.

- - **Plugin type**: BufferedOutput
+ - **Plugin type**: Output

  * insert data over streaming inserts
+   * plugin type is `bigquery_insert`
    * for continuous real-time insertions
    * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
  * load data
+   * plugin type is `bigquery_load`
    * for data loading as batch jobs, for big amount of data
    * https://developers.google.com/bigquery/loading-data-into-bigquery

@@ -31,42 +35,47 @@ Because embbeded gem dependency sometimes restricts ruby environment.

  ### Options

- | name | type | required? | placeholder? | default | description |
- | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
- | method | string | no | no | insert | `insert` (Streaming Insert) or `load` (load job) |
- | auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
- | email | string | yes (private_key) | no | nil | GCP Service Account Email |
- | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
- | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
- | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
- | project | string | yes | yes | nil | |
- | dataset | string | yes | yes | nil | |
- | table | string | yes (either `tables`) | yes | nil | |
- | tables | array(string) | yes (either `table`) | yes | nil | can set multi table names splitted by `,` |
- | template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
- | auto_create_table | bool | no | no | false | If true, creates table automatically |
- | skip_invalid_rows | bool | no | no | false | Only `insert` method. |
- | max_bad_records | integer | no | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
- | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
- | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. |
- | schema_path | string | yes (either `fetch_schema`) | no | nil | Schema Definition file path. It is formatted by JSON. |
- | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. |
- | fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored |
- | schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
- | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
- | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
- | allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
- | request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
- | request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
- | time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
- | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
-
- ### Deprecated
-
- | name | type | required? | placeholder? | default | description |
- | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
- | replace_record_key | bool | no | no | false | Use other filter plugin. |
- | replace_record_key_regexp{1-10} | string | no | no | nil | |
+ #### common
+
+ | name | type | required? | placeholder? | default | description |
+ | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+ | auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+ | email | string | yes (private_key) | no | nil | GCP Service Account Email |
+ | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
+ | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
+ | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
+ | project | string | yes | yes | nil | |
+ | dataset | string | yes | yes | nil | |
+ | table | string | yes (either `tables`) | yes | nil | |
+ | tables | array(string) | yes (either `table`) | yes | nil | can set multiple table names separated by `,` |
+ | auto_create_table | bool | no | no | false | If true, creates table automatically |
+ | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+ | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. |
+ | schema_path | string | yes (either `fetch_schema`) | no | nil | Schema Definition file path. It is formatted by JSON. |
+ | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. |
+ | fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table. If fetch_schema is false, this param is ignored. |
+ | schema_cache_expire | integer | no | no | 600 | Value is in seconds. If current time is after expiration interval, re-fetch table schema definition. |
+ | request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
+ | request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection and request timeout. If you send big data to Bigquery, set a large value. |
+ | time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature (experimental feature on BigQuery). |
+ | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
+
+ #### bigquery_insert
+
+ | name | type | required? | placeholder? | default | description |
+ | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+ | template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
+ | skip_invalid_rows | bool | no | no | false | |
+ | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. See https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor |
+ | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
+ | allow_retry_insert_errors | bool | no | no | false | Retry inserting rows when insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
+
+ #### bigquery_load
+
+ | name | type | required? | placeholder? | default | description |
+ | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+ | source_format | enum | no | no | json | Specify source format: `json`, `csv`, or `avro`. If you change this parameter, you must also change the formatter plugin via the `<format>` config section. |
+ | max_bad_records | integer | no | no | 0 | If the number of bad records exceeds this value, an invalid error is returned in the job result. |

  ### Buffer section

@@ -77,9 +86,9 @@ Because embbeded gem dependency sometimes restricts ruby environment.
  | total_limit_size | integer | no | 1GB (insert) or 32GB (load) | |
  | chunk_records_limit | integer | no | 500 (insert) or nil (load) | |
  | flush_mode | enum | no | interval | default, lazy, interval, immediate |
- | flush_interval | float | no | 0.25 (insert) or nil (load) | |
- | flush_thread_interval | float | no | 0.05 (insert) or nil (load) | |
- | flush_thread_burst_interval | float | no | 0.05 (insert) or nil (load) | |
+ | flush_interval | float | no | 1.0 (insert) or 3600 (load) | |
+ | flush_thread_interval | float | no | 0.05 (insert) or 5 (load) | |
+ | flush_thread_burst_interval | float | no | 0.05 (insert) or 5 (load) | |

  And, other params (defined by base class) are available

@@ -142,9 +151,7 @@ Configure insert specifications with target table schema, with your credentials.

  ```apache
  <match dummy>
-   @type bigquery
-
-   method insert # default
+   @type bigquery_insert

    auth_method private_key # default
    email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
@@ -181,14 +188,12 @@ For high rate inserts over streaming inserts, you should specify flush intervals

  ```apache
  <match dummy>
-   @type bigquery
-
-   method insert # default
+   @type bigquery_insert

    <buffer>
      flush_interval 0.1 # flush as frequent as possible

-     buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
+     total_limit_size 10g

      flush_thread_count 16
    </buffer>
@@ -256,16 +261,12 @@ section in the Google BigQuery document.
  ### Load
  ```apache
  <match bigquery>
-   @type bigquery
-
-   method load
+   @type bigquery_load

    <buffer>
-     @type file
-     path bigquery.*.buffer
-     flush_interval 1800
-     flush_at_shutdown true
-     timekey_use_utc
+     path bigquery.*.buffer
+     flush_at_shutdown true
+     timekey_use_utc
    </buffer>

    auth_method json_key
@@ -302,7 +303,7 @@ download its JSON key and deploy the key with fluentd.

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    auth_method json_key
    json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json
@@ -319,7 +320,7 @@ You need to only include `private_key` and `client_email` key from JSON key file

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    auth_method json_key
    json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
@@ -340,7 +341,7 @@ Compute Engine instance, then you can configure fluentd like this.

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    auth_method compute_engine

@@ -382,7 +383,7 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    ...

@@ -430,7 +431,7 @@ Use placeholder.

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    ...
    table accesslog$%Y%m%d
@@ -453,7 +454,7 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    ...

@@ -477,7 +478,7 @@ you can also specify nested fields by prefixing their belonging record fields.

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    ...

@@ -528,7 +529,7 @@ The second method is to specify a path to a BigQuery schema file instead of list

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    ...

@@ -541,7 +542,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    ...

@@ -559,10 +560,12 @@ you are still recommended to specify JSON types for TIMESTAMP fields as "time" f

  BigQuery uses `insertId` property to detect duplicate insertion requests (see [data consistency](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataconsistency) in Google BigQuery documents).
  You can set `insert_id_field` option to specify the field to use as `insertId` property.
+ `insert_id_field` can use Fluentd's record_accessor format, like `$['key1'][0]['key2']`.
+ (See https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor for details.)

  ```apache
  <match dummy>
-   @type bigquery
+   @type bigquery_insert

    ...

@@ -1,5 +1,5 @@
  module Fluent
    module BigQueryPlugin
-     VERSION = "1.2.0".freeze
+     VERSION = "2.0.0.beta".freeze
    end
  end
@@ -7,22 +7,15 @@ module Fluent
    @options = options
    @log = log
    @num_errors_per_chunk = {}
-
-   @cached_client_expiration = Time.now + 1800
  end

  def client
-   return @client if @client && @cached_client_expiration > Time.now
-
-   client = Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
+   @client ||= Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
      cl.authorization = get_auth
      cl.client_options.open_timeout_sec = @options[:open_timeout_sec] if @options[:open_timeout_sec]
      cl.client_options.read_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
      cl.client_options.send_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
    end
-
-   @cached_client_expiration = Time.now + 1800
-   @client = client
  end

  def create_table(project, dataset, table_id, record_schema)
@@ -49,10 +42,7 @@ module Fluent
    end
    client.insert_table(project, dataset, definition, {})
    log.debug "create table", project_id: project, dataset: dataset, table: table_id
-   @client = nil
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-   @client = nil
-
    message = e.message
    if e.status_code == 409 && /Already Exists:/ =~ message
      log.debug "already created table", project_id: project, dataset: dataset, table: table_id
@@ -81,7 +71,6 @@ module Fluent

    schema
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-   @client = nil
    message = e.message
    log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
    nil
@@ -111,8 +100,6 @@ module Fluent
      end
    end
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-   @client = nil
-
    reason = e.respond_to?(:reason) ? e.reason : nil
    error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason }
    wrapped = Fluent::BigQuery::Error.wrap(e)
@@ -125,7 +112,17 @@ module Fluent
    raise wrapped
  end

- def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields)
+ JobReference = Struct.new(:chunk_id, :chunk_id_hex, :project_id, :dataset_id, :table_id, :job_id) do
+   def as_hash(*keys)
+     if keys.empty?
+       to_h
+     else
+       to_h.select { |k, _| keys.include?(k) }
+     end
+   end
+ end
+
+ def create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields)
    configuration = {
      configuration: {
        load: {
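
The `as_hash` helper on `JobReference` exists so that selected identifiers can be splatted into structured log calls. A minimal sketch of its behavior (the values below are placeholders, not real identifiers):

```ruby
# Illustrative only: JobReference behaves like a plain Struct, and as_hash
# either dumps every member or filters down to the requested keys.
ref = JobReference.new("\x01", "01", "my-project", "my_dataset", "my_table", "job_123")

ref.as_hash
# => {:chunk_id=>"\x01", :chunk_id_hex=>"01", :project_id=>"my-project",
#     :dataset_id=>"my_dataset", :table_id=>"my_table", :job_id=>"job_123"}

ref.as_hash(:project_id, :dataset_id, :table_id)
# => {:project_id=>"my-project", :dataset_id=>"my_dataset", :table_id=>"my_table"}

# This is the shape fetch_load_job relies on when logging:
#   log.debug "load job fetched", id: job_id, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
```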
@@ -145,7 +142,7 @@ module Fluent
      }
    }

-   job_id = create_job_id(chunk_id, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
+   job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
    configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if @options[:time_partitioning_type]
    configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id

@@ -167,11 +164,8 @@ module Fluent
        content_type: "application/octet-stream",
      }
    )
-   wait_load_job(chunk_id, project, dataset, res.job_reference.job_id, table_id)
-   @num_errors_per_chunk.delete(chunk_id)
+   JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id)
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-   @client = nil
-
    reason = e.respond_to?(:reason) ? e.reason : nil
    log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason

@@ -187,44 +181,56 @@ module Fluent
    end

    if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
-     wait_load_job(chunk_id, project, dataset, job_id, table_id)
-     @num_errors_per_chunk.delete(chunk_id)
-     return
+     return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id)
    end

    raise Fluent::BigQuery::Error.wrap(e)
  end

- def wait_load_job(chunk_id, project, dataset, job_id, table_id)
-   wait_interval = 10
-   _response = client.get_job(project, job_id)
+ def fetch_load_job(job_reference)
+   project = job_reference.project_id
+   job_id = job_reference.job_id
+
+   res = client.get_job(project, job_id)
+   log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)

-   until _response.status.state == "DONE"
-     log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
-     sleep wait_interval
-     _response = client.get_job(project, _response.job_reference.job_id)
+   if res.status.state == "DONE"
+     res
    end
+ rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+   e = Fluent::BigQuery::Error.wrap(e)
+   raise e unless e.retryable?
+ end
+
+ def commit_load_job(chunk_id_hex, response)
+   job_id = response.id
+   project = response.configuration.load.destination_table.project_id
+   dataset = response.configuration.load.destination_table.dataset_id
+   table_id = response.configuration.load.destination_table.table_id

-   errors = _response.status.errors
+   errors = response.status.errors
    if errors
      errors.each do |e|
-       log.error "job.insert API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
+       log.error "job.load API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
      end
    end

-   error_result = _response.status.error_result
+   error_result = response.status.error_result
    if error_result
-     log.error "job.insert API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
+     log.error "job.load API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
      if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason)
-       @num_errors_per_chunk[chunk_id] = @num_errors_per_chunk[chunk_id].to_i + 1
+       @num_errors_per_chunk[chunk_id_hex] = @num_errors_per_chunk[chunk_id_hex].to_i + 1
        raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry")
      else
-       @num_errors_per_chunk.delete(chunk_id)
+       @num_errors_per_chunk.delete(chunk_id_hex)
        raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry")
      end
    end

-   log.debug "finish load job", state: _response.status.state
+   stats = response.statistics.load
+   duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0
+   log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats.input_file_bytes, input_files: stats.input_files, output_bytes: stats.output_bytes, output_rows: stats.output_rows, bad_records: stats.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id
+   @num_errors_per_chunk.delete(chunk_id_hex)
  end

  private
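
With this change, `create_load_job` no longer blocks until the job finishes: it returns a `JobReference`, `fetch_load_job` returns the job response only once its state is `DONE`, and `commit_load_job` handles errors and bookkeeping. A minimal sketch of how a caller could drive the new API (the polling loop and variable names are illustrative, not the plugin's actual scheduling code):

```ruby
# Hypothetical driver code: `writer` is an instance of the writer shown above,
# and the other variables are assumed to come from the output plugin's chunk.
job_ref = writer.create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields)

response = nil
until response
  sleep 10                                  # assumed polling interval
  response = writer.fetch_load_job(job_ref) # nil until the job state is "DONE"
end

writer.commit_load_job(job_ref.chunk_id_hex, response)
# raises RetryableError / UnRetryableError if the finished job reported errors
```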
@@ -291,8 +297,8 @@ module Fluent
    table_id.gsub(/\$\d+$/, "")
  end

- def create_job_id(chunk_id, dataset, table, schema)
-   job_id_key = "#{chunk_id}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id]}"
+ def create_job_id(chunk_id_hex, dataset, table, schema)
+   job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id_hex]}"
    @log.debug "job_id_key: #{job_id_key}"
    "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
  end
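
The deterministic job id is what makes `prevent_duplicate_load` work: resubmitting the same chunk with the same settings reproduces the same id, BigQuery rejects the duplicate with a 409, and `create_load_job` treats that as a reference to the already-submitted job. A rough sketch of the derivation (all values below are made up for illustration):

```ruby
require "digest/sha1"

# Illustrative inputs only, mirroring the arguments of create_job_id above.
chunk_id_hex          = "57aabba2c8b7e04ad2e65b9f4c3e72d1"
dataset               = "my_dataset"
table                 = "access_log"
schema                = [{name: "time", type: "TIMESTAMP"}]
max_bad_records       = 0
ignore_unknown_values = false
retry_count           = nil # @num_errors_per_chunk[chunk_id_hex]; nil on the first attempt

job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema}#{max_bad_records}#{ignore_unknown_values}#{retry_count}"
job_id = "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
# Same chunk and same settings yield the same job_id, so a duplicate submission
# comes back as a 409 and is handled as the already-running job. A retryable
# load failure bumps @num_errors_per_chunk, which changes the key and lets the
# next attempt submit a genuinely new job.
```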