fluent-plugin-bigquery-custom 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9f634996c0de109e264c651d08ac1e118a9694d2
4
- data.tar.gz: edfb078ea6100688d83c5bcce0f7f5298a4e7d84
3
+ metadata.gz: 1d8724c6a0d0b2cfbdc48170fae356e341b70880
4
+ data.tar.gz: 0ce6f9e76d9cfb56a09ea34a6ad2023ec653d4bb
5
5
  SHA512:
6
- metadata.gz: d960bd5956b8ae9da5522f1372e698afaa0807e35ce67d2fe2cdc56837c626d822d43938e4162aa43c151affd52ff5cf02f8cf8fac2c50303aa0f4af0712b232
7
- data.tar.gz: c8a4e351374c459aebd6ec3a72970c96f8895f9f54140ebf6dc1cadd828ba093407ddd2995a75f95e00bd7d2ac245ef36cf05e8152702bce0723bfa26bd8003d
6
+ metadata.gz: 7fa15645710affa98c22858899a7053c598c3aa9a025ebc4c35471e21ad6f3d483d9b2a7fc6a12f4bf1ec52a926e911085680971503abb74ed7e867af8eef058
7
+ data.tar.gz: 4a0473a65349f20448f9ac7c1265630907ce29d036f82d1a6e153cf468a88bd0b8a241d7a7e806748b0b25c6b8649943c6bbbd9ab9b04bf79d2f1aaa6a65b0ec
data/README.md CHANGED
@@ -26,6 +26,7 @@ OAuth flow for installed applications.
26
26
  - `skip_invalid_rows`
27
27
  - `max_bad_records`
28
28
  - `ignore_unknown_values`
29
+ - `prevent_duplicate_load`
29
30
  - Improve error handling
30
31
 
31
32
  ## Configuration
@@ -407,6 +408,34 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
407
408
  </match>
408
409
  ```
409
410
 
411
+ ### Prevent duplicate load
412
+
413
+ If you want to detect duplicate load jobs, set `prevent_duplicate_load` to `true`.
414
+ `prevent_duplicate_load` makes load job\_id consistent.
415
+ For example, even if the fluentd process crashes while waiting for a job, fluentd can resume waiting for the same job.
416
+
417
+ ```apache
418
+ <match dummy>
419
+ type bigquery
420
+
421
+ ...
422
+
423
+ prevent_duplicate_load true
424
+ </match>
425
+ ```
426
+
427
+ job\_id is calculated by SHA1. The factors are ...
428
+
429
+ - upload source path (file buffer path)
430
+ - dataset
431
+ - table
432
+ - schema
433
+ - `max_bad_records`
434
+ - `ignore_unknown_values`
435
+
436
+ NOTE: Duplicate job error does not invoke `flush_secondary`.
437
+ NOTE: This option takes effect only when the file buffer is used.
438
+
410
439
  ## TODO
411
440
 
412
441
  * Automatically configured flush/buffer options
@@ -1,6 +1,6 @@
1
1
  module Fluent
2
2
  module BigQueryPlugin
3
- VERSION = "0.3.0"
3
+ VERSION = "0.3.1"
4
4
  end
5
5
  end
6
6
 
@@ -102,7 +102,10 @@ module Fluent
102
102
  config_param :utc, :bool, default: nil
103
103
  config_param :time_field, :string, default: nil
104
104
 
105
+ # insert_id_field (only insert)
105
106
  config_param :insert_id_field, :string, default: nil
107
+ # prevent_duplicate_load (only load)
108
+ config_param :prevent_duplicate_load, :bool, default: false
106
109
 
107
110
  config_param :method, :string, default: 'insert' # or 'load'
108
111
 
@@ -164,6 +167,7 @@ module Fluent
164
167
  if @method == "insert"
165
168
  extend(InsertImplementation)
166
169
  elsif @method == "load"
170
+ require 'digest/sha1'
167
171
  extend(LoadImplementation)
168
172
  else
169
173
  raise Fluend::ConfigError "'method' must be 'insert' or 'load'"
@@ -422,10 +426,13 @@ module Fluent
422
426
  raise "table created. send rows next time."
423
427
  end
424
428
 
429
+ reason = e.respond_to?(:reason) ? e.reason : nil
425
430
  log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
426
- if e.reason == "backendError"
427
- raise "failed to insert into bigquery, retry" # TODO: error class
428
- elsif @secondary
431
+
432
+ raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
433
+
434
+ # other error handling
435
+ if @secondary
429
436
  flush_secondary(@secondary)
430
437
  end
431
438
  end
@@ -447,8 +454,13 @@ module Fluent
447
454
 
448
455
  def _write(chunk, table_id)
449
456
  res = nil
457
+ job_id = nil
458
+
450
459
  create_upload_source(chunk) do |upload_source|
451
- res = client.insert_job(@project, {
460
+ if @prevent_duplicate_load
461
+ job_id = create_job_id(upload_source.path, @dataset, @table, @fields.to_a, @max_bad_records, @ignore_unknown_values)
462
+ end
463
+ configuration = {
452
464
  configuration: {
453
465
  load: {
454
466
  destination_table: {
@@ -465,28 +477,37 @@ module Fluent
465
477
  max_bad_records: @max_bad_records,
466
478
  }
467
479
  }
468
- }, {upload_source: upload_source, content_type: "application/octet-stream"})
480
+ }
481
+ configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
482
+ res = client.insert_job(@project, configuration, {upload_source: upload_source, content_type: "application/octet-stream"})
469
483
  end
470
- wait_load(res, table_id)
484
+
485
+ wait_load(res.job_reference.job_id)
471
486
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
472
487
  # api_error? -> client cache clear
473
488
  @cached_client = nil
474
489
 
475
- log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
476
- if e.reason == "backendError"
477
- raise "failed to insert into bigquery, retry" # TODO: error class
478
- elsif @secondary
490
+ reason = e.respond_to?(:reason) ? e.reason : nil
491
+ log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
492
+
493
+ raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
494
+ return wait_load(job_id) if e.status_code == 409 && e.message =~ /Job/ # duplicate load job
495
+
496
+ # other error handling
497
+ if @secondary
479
498
  flush_secondary(@secondary)
480
499
  end
481
500
  end
482
501
 
483
502
  private
484
503
 
485
- def wait_load(res, table_id)
504
+ def wait_load(job_id)
486
505
  wait_interval = 10
487
- _response = res
506
+ _response = client.get_job(@project, job_id)
507
+ table_id = _response.configuration.load.destination_table.table_id
508
+
488
509
  until _response.status.state == "DONE"
489
- log.debug "wait for load job finish", state: _response.status.state
510
+ log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
490
511
  sleep wait_interval
491
512
  _response = client.get_job(@project, _response.job_reference.job_id)
492
513
  end
@@ -527,6 +548,10 @@ module Fluent
527
548
  end
528
549
  end
529
550
  end
551
+
552
+ def create_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
553
+ "fluentd_job_" + Digest::SHA1.hexdigest("#{upload_source_path}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
554
+ end
530
555
  end
531
556
 
532
557
  class FieldSchema
@@ -881,13 +881,24 @@ class BigQueryOutputTest < Test::Unit::TestCase
881
881
  }
882
882
  }
883
883
  }, {upload_source: io, content_type: "application/octet-stream"}) {
884
- s = stub!
885
- status_stub = stub!
886
- s.status { status_stub }
887
- status_stub.state { "DONE" }
888
- status_stub.error_result { nil }
889
- status_stub.errors { nil }
890
- s
884
+ Google::Apis::BigqueryV2::Job.new({
885
+ job_reference: Google::Apis::BigqueryV2::JobReference.new({job_id: "job_id"})
886
+ })
887
+ }
888
+
889
+ expect.get_job('yourproject_id', "job_id") {
890
+ Google::Apis::BigqueryV2::Job.new({
891
+ configuration: Google::Apis::BigqueryV2::JobConfiguration.new({
892
+ load: Google::Apis::BigqueryV2::JobConfigurationLoad.new({
893
+ destination_table: Google::Apis::BigqueryV2::TableReference.new({
894
+ project_id: 'yourproject_id',
895
+ dataset_id: 'yourdataset_id',
896
+ table_id: 'foo',
897
+ }),
898
+ })
899
+ }),
900
+ status: Google::Apis::BigqueryV2::JobStatus.new({state: "DONE"}),
901
+ })
891
902
  }
892
903
  end
893
904
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-bigquery-custom
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomohiro Hashidate
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-11 00:00:00.000000000 Z
11
+ date: 2016-01-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake