fluent-plugin-bigquery 1.2.0 → 2.0.0.beta

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: ef9d7a814ddde267ffd9bf2e10837c162a01b12b
- data.tar.gz: 5ca29129b5dfb449abd7e1a22d6d22fcba47a862
+ metadata.gz: '07998acf05ddb3e647da13a4b5c734dc16f8cc77'
+ data.tar.gz: 1fce9fc906cbf72083a4f8132c0ac1d985a95d6d
  SHA512:
- metadata.gz: 658fb6ece7816aeb711960179fa53544fe639f97aa41bbfea1a0fe007edf1a54fe11aef4c63f0ea1d42f2b2582c9ed335c5919aaec2ec18f3d53fdfa3e9d9be1
- data.tar.gz: 5ac9138e736eaf7dcf5b9fe1ecd46b92c67979313d93862fd061640f5dc5fc8f4bef3364cb0f053ab3a5a9d7d34ffeb1fef58dbbb532f59f99befe899b9066c6
+ metadata.gz: 04cfd6d3080d9424e25bd75ae1a9600259fe94ed933adceab66c02eb11afdb49eeddc393c305f0927dd64f967d1e72835fde9566cd54b2e53805e85ffe7a1516
+ data.tar.gz: 8de74527cf12be2c6553e4a582cc25c47a1773cdc165800f212aae563f7ffa048679260515a51f55e244b641b968badeaa4349cf4369ad2363d22aff1c1cbe7d
data/.travis.yml CHANGED
@@ -1,20 +1,13 @@
  language: ruby
 
  rvm:
- - 2.1
  - 2.2
  - 2.3.3
+ - 2.4.3
+ - 2.5.0
 
  gemfile:
  - Gemfile
- - gemfiles/activesupport-4.gemfile
-
- matrix:
-   exclude:
-     - rvm: 2.0
-       gemfile: Gemfile
-     - rvm: 2.1
-       gemfile: Gemfile
 
  before_install:
  - gem update bundler
data/README.md CHANGED
@@ -1,13 +1,17 @@
  # fluent-plugin-bigquery
 
+ **This README is for v2.0.0.beta, which has not been released yet.**
+
  [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery.
 
- - **Plugin type**: BufferedOutput
+ - **Plugin type**: Output
 
  * insert data over streaming inserts
+   * plugin type is `bigquery_insert`
    * for continuous real-time insertions
    * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
  * load data
+   * plugin type is `bigquery_load`
    * for data loading as batch jobs, for big amount of data
    * https://developers.google.com/bigquery/loading-data-into-bigquery
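
For orientation while reading the rest of the diff: v2 replaces the single `bigquery` output (which chose its behaviour via `method insert` / `method load`) with the two plugin types listed above. A minimal hypothetical sketch of the v2 style, with placeholder project/dataset/table names:

```apache
<match dummy>
  @type bigquery_insert    # streaming inserts (was: @type bigquery + method insert)
  # @type bigquery_load    # batch load jobs   (was: @type bigquery + method load)

  project yourproject_id   # placeholder values
  dataset yourdataset_id
  table   tablename
</match>
```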
 
@@ -31,42 +35,47 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 
  ### Options
 
- | name | type | required? | placeholder? | default | description |
- | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
- | method | string | no | no | insert | `insert` (Streaming Insert) or `load` (load job) |
- | auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
- | email | string | yes (private_key) | no | nil | GCP Service Account Email |
- | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
- | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
- | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
- | project | string | yes | yes | nil | |
- | dataset | string | yes | yes | nil | |
- | table | string | yes (either `tables`) | yes | nil | |
- | tables | array(string) | yes (either `table`) | yes | nil | can set multi table names splitted by `,` |
- | template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
- | auto_create_table | bool | no | no | false | If true, creates table automatically |
- | skip_invalid_rows | bool | no | no | false | Only `insert` method. |
- | max_bad_records | integer | no | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
- | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
- | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. |
- | schema_path | string | yes (either `fetch_schema`) | no | nil | Schema Definition file path. It is formatted by JSON. |
- | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. |
- | fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored |
- | schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
- | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
- | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
- | allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
- | request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
- | request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
- | time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
- | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
-
- ### Deprecated
-
- | name | type | required? | placeholder? | default | description |
- | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
- | replace_record_key | bool | no | no | false | Use other filter plugin. |
- | replace_record_key_regexp{1-10} | string | no | no | nil | |
+ #### common
+
+ | name | type | required? | placeholder? | default | description |
+ | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+ | auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+ | email | string | yes (private_key) | no | nil | GCP Service Account Email |
+ | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
+ | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
+ | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
+ | project | string | yes | yes | nil | |
+ | dataset | string | yes | yes | nil | |
+ | table | string | yes (either `tables`) | yes | nil | |
+ | tables | array(string) | yes (either `table`) | yes | nil | can set multiple table names split by `,` |
+ | auto_create_table | bool | no | no | false | If true, creates table automatically |
+ | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+ | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema definition, formatted as JSON. |
+ | schema_path | string | yes (either `fetch_schema`) | no | nil | Schema definition file path, formatted as JSON. |
+ | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from BigQuery table automatically. |
+ | fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table. Ignored when fetch_schema is false. |
+ | schema_cache_expire | integer | no | no | 600 | Value is in seconds. If current time is after expiration interval, re-fetch table schema definition. |
+ | request_timeout_sec | integer | no | no | nil | BigQuery API response timeout |
+ | request_open_timeout_sec | integer | no | no | 60 | BigQuery API connection and request timeout. If you send large amounts of data to BigQuery, set a larger value. |
+ | time_partitioning_type | enum | no (either day) | no | nil | Type of BigQuery time partitioning feature (experimental feature on BigQuery). |
+ | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for BigQuery time partitioning (experimental feature on BigQuery). |
+
+ #### bigquery_insert
+
+ | name | type | required? | placeholder? | default | description |
+ | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+ | template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
+ | skip_invalid_rows | bool | no | no | false | |
+ | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. See https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor |
+ | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
+ | allow_retry_insert_errors | bool | no | no | false | Retry inserting rows when insertErrors occur. Rows may be inserted in duplicate. |
+
+ #### bigquery_load
+
+ | name | type | required? | placeholder? | default | description |
+ | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+ | source_format | enum | no | no | json | Specify source format: `json`, `csv`, or `avro`. If you change this parameter, you must also change the formatter plugin via the `<format>` config section. |
+ | max_bad_records | integer | no | no | 0 | If the number of bad records exceeds this value, an invalid error is returned in the job result. |
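
To see how the split tables combine in practice, here is a hedged sketch of a `bigquery_insert` section using a few of the common and insert-specific options listed above (the key path and table names are placeholders, and schema settings are omitted; see the schema examples further down):

```apache
<match dummy>
  @type bigquery_insert

  # common options
  auth_method json_key
  json_key /path/to/keyfile.json   # placeholder path
  project yourproject_id           # placeholder values
  dataset yourdataset_id
  table   tablename

  # bigquery_insert options
  insert_id_field id
  add_insert_timestamp inserted_at
</match>
```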
 
  ### Buffer section
 
@@ -77,9 +86,9 @@ Because embbeded gem dependency sometimes restricts ruby environment.
  | total_limit_size | integer | no | 1GB (insert) or 32GB (load) | |
  | chunk_records_limit | integer | no | 500 (insert) or nil (load) | |
  | flush_mode | enum | no | interval | default, lazy, interval, immediate |
- | flush_interval | float | no | 0.25 (insert) or nil (load) | |
- | flush_thread_interval | float | no | 0.05 (insert) or nil (load) | |
- | flush_thread_burst_interval | float | no | 0.05 (insert) or nil (load) | |
+ | flush_interval | float | no | 1.0 (insert) or 3600 (load) | |
+ | flush_thread_interval | float | no | 0.05 (insert) or 5 (load) | |
+ | flush_thread_burst_interval | float | no | 0.05 (insert) or 5 (load) | |
 
  And, other params (defined by base class) are available
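
As a rough illustration of the new buffer defaults above, a hypothetical `<buffer>` section for `bigquery_load` that overrides a couple of them might look like the following (the values are arbitrary examples, not recommendations):

```apache
<match bigquery>
  @type bigquery_load

  <buffer>
    @type file
    path bigquery.*.buffer   # placeholder path
    flush_interval 1800      # load default is 3600
    total_limit_size 32g     # load default
    flush_at_shutdown true
  </buffer>
</match>
```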
 
@@ -142,9 +151,7 @@ Configure insert specifications with target table schema, with your credentials.
 
  ```apache
  <match dummy>
- @type bigquery
-
- method insert # default
+ @type bigquery_insert
 
  auth_method private_key # default
  email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
@@ -181,14 +188,12 @@ For high rate inserts over streaming inserts, you should specify flush intervals
 
  ```apache
  <match dummy>
- @type bigquery
-
- method insert # default
+ @type bigquery_insert
 
  <buffer>
  flush_interval 0.1 # flush as frequent as possible
 
- buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
+ total_limit_size 10g
 
  flush_thread_count 16
  </buffer>
@@ -256,16 +261,12 @@ section in the Google BigQuery document.
  ### Load
  ```apache
  <match bigquery>
- @type bigquery
-
- method load
+ @type bigquery_load
 
  <buffer>
- @type file
- path bigquery.*.buffer
- flush_interval 1800
- flush_at_shutdown true
- timekey_use_utc
+ path bigquery.*.buffer
+ flush_at_shutdown true
+ timekey_use_utc
  </buffer>
 
  auth_method json_key
@@ -302,7 +303,7 @@ download its JSON key and deploy the key with fluentd.
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  auth_method json_key
  json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json
@@ -319,7 +320,7 @@ You need to only include `private_key` and `client_email` key from JSON key file
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  auth_method json_key
  json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
@@ -340,7 +341,7 @@ Compute Engine instance, then you can configure fluentd like this.
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  auth_method compute_engine
 
@@ -382,7 +383,7 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  ...
 
@@ -430,7 +431,7 @@ Use placeholder.
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  ...
  table accesslog$%Y%m%d
@@ -453,7 +454,7 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  ...
 
@@ -477,7 +478,7 @@ you can also specify nested fields by prefixing their belonging record fields.
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  ...
 
@@ -528,7 +529,7 @@ The second method is to specify a path to a BigQuery schema file instead of list
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  ...
 
@@ -541,7 +542,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  ...
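
The two schema-related hunks above only swap the `@type` line; for context, here are hedged sketches of the `schema_path` and `fetch_schema` variants described in the surrounding text (file path and table name are placeholders):

```apache
# schema from a JSON schema file
<match dummy>
  @type bigquery_insert
  ...
  schema_path /path/to/httpd.schema   # placeholder path
</match>

# schema fetched from an existing BigQuery table
<match dummy>
  @type bigquery_insert
  ...
  fetch_schema true
  fetch_schema_table other_table      # optional; placeholder table name
</match>
```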
 
@@ -559,10 +560,12 @@ you are still recommended to specify JSON types for TIMESTAMP fields as "time" f
 
  BigQuery uses `insertId` property to detect duplicate insertion requests (see [data consistency](https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataconsistency) in Google BigQuery documents).
  You can set `insert_id_field` option to specify the field to use as `insertId` property.
+ `insert_id_field` accepts the fluentd record_accessor syntax, e.g. `$['key1'][0]['key2']`.
+ (details: https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor)
 
  ```apache
  <match dummy>
- @type bigquery
+ @type bigquery_insert
 
  ...
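
A brief hedged sketch of `insert_id_field` with the record_accessor syntax mentioned above (the field names are hypothetical):

```apache
<match dummy>
  @type bigquery_insert
  ...
  # top-level field
  insert_id_field uuid
  # nested field, using record_accessor syntax
  # insert_id_field $['data']['uuid']
</match>
```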
 
data/lib/fluent/plugin/bigquery/version.rb CHANGED
@@ -1,5 +1,5 @@
  module Fluent
  module BigQueryPlugin
- VERSION = "1.2.0".freeze
+ VERSION = "2.0.0.beta".freeze
  end
  end
data/lib/fluent/plugin/bigquery/writer.rb CHANGED
@@ -7,22 +7,15 @@ module Fluent
  @options = options
  @log = log
  @num_errors_per_chunk = {}
-
- @cached_client_expiration = Time.now + 1800
  end
 
  def client
- return @client if @client && @cached_client_expiration > Time.now
-
- client = Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
+ @client ||= Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
  cl.authorization = get_auth
  cl.client_options.open_timeout_sec = @options[:open_timeout_sec] if @options[:open_timeout_sec]
  cl.client_options.read_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
  cl.client_options.send_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
  end
-
- @cached_client_expiration = Time.now + 1800
- @client = client
  end
 
  def create_table(project, dataset, table_id, record_schema)
@@ -49,10 +42,7 @@ module Fluent
  end
  client.insert_table(project, dataset, definition, {})
  log.debug "create table", project_id: project, dataset: dataset, table: table_id
- @client = nil
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- @client = nil
-
  message = e.message
  if e.status_code == 409 && /Already Exists:/ =~ message
  log.debug "already created table", project_id: project, dataset: dataset, table: table_id
@@ -81,7 +71,6 @@ module Fluent
 
  schema
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- @client = nil
  message = e.message
  log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
  nil
@@ -111,8 +100,6 @@ module Fluent
  end
  end
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- @client = nil
-
  reason = e.respond_to?(:reason) ? e.reason : nil
  error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason }
  wrapped = Fluent::BigQuery::Error.wrap(e)
@@ -125,7 +112,17 @@ module Fluent
  raise wrapped
  end
 
- def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields)
+ JobReference = Struct.new(:chunk_id, :chunk_id_hex, :project_id, :dataset_id, :table_id, :job_id) do
+ def as_hash(*keys)
+ if keys.empty?
+ to_h
+ else
+ to_h.select { |k, _| keys.include?(k) }
+ end
+ end
+ end
+
+ def create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields)
  configuration = {
  configuration: {
  load: {
@@ -145,7 +142,7 @@ module Fluent
  }
  }
 
- job_id = create_job_id(chunk_id, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
+ job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
  configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if @options[:time_partitioning_type]
  configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
@@ -167,11 +164,8 @@ module Fluent
  content_type: "application/octet-stream",
  }
  )
- wait_load_job(chunk_id, project, dataset, res.job_reference.job_id, table_id)
- @num_errors_per_chunk.delete(chunk_id)
+ JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id)
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- @client = nil
-
  reason = e.respond_to?(:reason) ? e.reason : nil
  log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
 
@@ -187,44 +181,56 @@ module Fluent
  end
 
  if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
- wait_load_job(chunk_id, project, dataset, job_id, table_id)
- @num_errors_per_chunk.delete(chunk_id)
- return
+ return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id)
  end
 
  raise Fluent::BigQuery::Error.wrap(e)
  end
 
- def wait_load_job(chunk_id, project, dataset, job_id, table_id)
- wait_interval = 10
- _response = client.get_job(project, job_id)
+ def fetch_load_job(job_reference)
+ project = job_reference.project_id
+ job_id = job_reference.job_id
+
+ res = client.get_job(project, job_id)
+ log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
 
- until _response.status.state == "DONE"
- log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
- sleep wait_interval
- _response = client.get_job(project, _response.job_reference.job_id)
+ if res.status.state == "DONE"
+ res
  end
+ rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+ e = Fluent::BigQuery::Error.wrap(e)
+ raise e unless e.retryable?
+ end
+
+ def commit_load_job(chunk_id_hex, response)
+ job_id = response.id
+ project = response.configuration.load.destination_table.project_id
+ dataset = response.configuration.load.destination_table.dataset_id
+ table_id = response.configuration.load.destination_table.table_id
 
- errors = _response.status.errors
+ errors = response.status.errors
  if errors
  errors.each do |e|
- log.error "job.insert API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
+ log.error "job.load API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
  end
  end
 
- error_result = _response.status.error_result
+ error_result = response.status.error_result
  if error_result
- log.error "job.insert API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
+ log.error "job.load API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
  if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason)
- @num_errors_per_chunk[chunk_id] = @num_errors_per_chunk[chunk_id].to_i + 1
+ @num_errors_per_chunk[chunk_id_hex] = @num_errors_per_chunk[chunk_id_hex].to_i + 1
  raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry")
  else
- @num_errors_per_chunk.delete(chunk_id)
+ @num_errors_per_chunk.delete(chunk_id_hex)
  raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry")
  end
  end
 
- log.debug "finish load job", state: _response.status.state
+ stats = response.statistics.load
+ duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0
+ log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats.input_file_bytes, input_files: stats.input_files, output_bytes: stats.output_bytes, output_rows: stats.output_rows, bad_records: stats.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id
+ @num_errors_per_chunk.delete(chunk_id_hex)
  end
 
  private
@@ -291,8 +297,8 @@ module Fluent
  table_id.gsub(/\$\d+$/, "")
  end
 
- def create_job_id(chunk_id, dataset, table, schema)
- job_id_key = "#{chunk_id}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id]}"
+ def create_job_id(chunk_id_hex, dataset, table, schema)
+ job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id_hex]}"
  @log.debug "job_id_key: #{job_id_key}"
  "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
  end