fluent-plugin-bigquery-custom 0.3.0 → 0.3.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 9f634996c0de109e264c651d08ac1e118a9694d2
-  data.tar.gz: edfb078ea6100688d83c5bcce0f7f5298a4e7d84
+  metadata.gz: 1d8724c6a0d0b2cfbdc48170fae356e341b70880
+  data.tar.gz: 0ce6f9e76d9cfb56a09ea34a6ad2023ec653d4bb
 SHA512:
-  metadata.gz: d960bd5956b8ae9da5522f1372e698afaa0807e35ce67d2fe2cdc56837c626d822d43938e4162aa43c151affd52ff5cf02f8cf8fac2c50303aa0f4af0712b232
-  data.tar.gz: c8a4e351374c459aebd6ec3a72970c96f8895f9f54140ebf6dc1cadd828ba093407ddd2995a75f95e00bd7d2ac245ef36cf05e8152702bce0723bfa26bd8003d
+  metadata.gz: 7fa15645710affa98c22858899a7053c598c3aa9a025ebc4c35471e21ad6f3d483d9b2a7fc6a12f4bf1ec52a926e911085680971503abb74ed7e867af8eef058
+  data.tar.gz: 4a0473a65349f20448f9ac7c1265630907ce29d036f82d1a6e153cf468a88bd0b8a241d7a7e806748b0b25c6b8649943c6bbbd9ab9b04bf79d2f1aaa6a65b0ec
data/README.md CHANGED
@@ -26,6 +26,7 @@ OAuth flow for installed applications.
 - `skip_invalid_rows`
 - `max_bad_records`
 - `ignore_unknown_values`
+- `prevent_duplicate_load`
 - Improve error handling
 
 ## Configuration
@@ -407,6 +408,34 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 </match>
 ```
 
+### Prevent duplicate load
+
+If you want to detect duplicate load jobs, set `prevent_duplicate_load` to `true`.
+`prevent_duplicate_load` makes the load job_id deterministic.
+For example, even if the fluentd process crashes while waiting for a job, fluentd can resume waiting for the same job after restart.
+
+```apache
+<match dummy>
+  type bigquery
+
+  ...
+
+  prevent_duplicate_load true
+</match>
+```
+
+The job_id is calculated with SHA1 from the following factors:
+
+- upload source path (file buffer path)
+- dataset
+- table
+- schema
+- `max_bad_records`
+- `ignore_unknown_values`
+
+NOTE: A duplicate job error does not invoke `flush_secondary`.
+NOTE: This option takes effect only when the file buffer is used.
+
 ## TODO
 
 * Automatically configured flush/buffer options
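
For reference, the derivation of the deterministic job ID described above (it mirrors the `create_job_id` helper added in this release; the standalone method name `deterministic_job_id` is illustrative only):

```ruby
require 'digest/sha1'

# Standalone sketch of the job ID derivation behind prevent_duplicate_load.
# Identical inputs (same buffer file, destination, schema, and load options)
# always hash to the same ID, so a crashed-and-restarted fluentd resubmits
# the load under the same job ID and BigQuery recognizes it as a duplicate.
def deterministic_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
  "fluentd_job_" + Digest::SHA1.hexdigest(
    "#{upload_source_path}#{dataset}#{table}#{schema}#{max_bad_records}#{ignore_unknown_values}"
  )
end

# Example: repeated calls with the same buffer chunk return the same ID.
deterministic_job_id("/var/log/fluent/buffer.b1", "yourdataset_id", "foo", [], 0, false)
```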
@@ -1,6 +1,6 @@
 module Fluent
   module BigQueryPlugin
-    VERSION = "0.3.0"
+    VERSION = "0.3.1"
   end
 end
@@ -102,7 +102,10 @@ module Fluent
     config_param :utc, :bool, default: nil
     config_param :time_field, :string, default: nil
 
+    # insert_id_field (only insert)
     config_param :insert_id_field, :string, default: nil
+    # prevent_duplicate_load (only load)
+    config_param :prevent_duplicate_load, :bool, default: false
 
     config_param :method, :string, default: 'insert' # or 'load'
@@ -164,6 +167,7 @@ module Fluent
       if @method == "insert"
         extend(InsertImplementation)
       elsif @method == "load"
+        require 'digest/sha1'
         extend(LoadImplementation)
       else
         raise Fluent::ConfigError, "'method' must be 'insert' or 'load'"
@@ -422,10 +426,13 @@ module Fluent
         raise "table created. send rows next time."
       end
 
+      reason = e.respond_to?(:reason) ? e.reason : nil
       log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
-      if e.reason == "backendError"
-        raise "failed to insert into bigquery, retry" # TODO: error class
-      elsif @secondary
+
+      raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
+
+      # other error handling
+      if @secondary
         flush_secondary(@secondary)
       end
     end
@@ -447,8 +454,13 @@ module Fluent
     def _write(chunk, table_id)
       res = nil
+      job_id = nil
+
       create_upload_source(chunk) do |upload_source|
-        res = client.insert_job(@project, {
+        if @prevent_duplicate_load
+          job_id = create_job_id(upload_source.path, @dataset, @table, @fields.to_a, @max_bad_records, @ignore_unknown_values)
+        end
+        configuration = {
           configuration: {
             load: {
               destination_table: {
@@ -465,28 +477,37 @@ module Fluent
               max_bad_records: @max_bad_records,
             }
           }
-        }, {upload_source: upload_source, content_type: "application/octet-stream"})
+        }
+        configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+        res = client.insert_job(@project, configuration, {upload_source: upload_source, content_type: "application/octet-stream"})
       end
-      wait_load(res, table_id)
+
+      wait_load(res.job_reference.job_id)
     rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
       # api_error? -> client cache clear
       @cached_client = nil
 
-      log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
-      if e.reason == "backendError"
-        raise "failed to insert into bigquery, retry" # TODO: error class
-      elsif @secondary
+      reason = e.respond_to?(:reason) ? e.reason : nil
+      log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
+
+      raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
+      return wait_load(job_id) if e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+
+      # other error handling
+      if @secondary
         flush_secondary(@secondary)
       end
     end
 
     private
 
-    def wait_load(res, table_id)
+    def wait_load(job_id)
       wait_interval = 10
-      _response = res
+      _response = client.get_job(@project, job_id)
+      table_id = _response.configuration.load.destination_table.table_id
+
       until _response.status.state == "DONE"
-        log.debug "wait for load job finish", state: _response.status.state
+        log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
         sleep wait_interval
         _response = client.get_job(@project, _response.job_reference.job_id)
       end
@@ -527,6 +548,10 @@ module Fluent
         end
       end
     end
+
+    def create_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
+      "fluentd_job_" + Digest::SHA1.hexdigest("#{upload_source_path}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
+    end
   end
 
   class FieldSchema
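
Putting the pieces together, the duplicate-handling flow in `_write` works roughly as follows. This is a condensed sketch of the diff above, not the verbatim plugin code; it assumes `client` is a configured `Google::Apis::BigqueryV2::BigqueryService` and `configuration` is the job hash built in `_write`:

```ruby
# Condensed sketch of the load-with-dedup flow added in this release.
def load_with_dedup(client, project, configuration, upload_source, job_id)
  # Attaching a client-supplied job_reference makes BigQuery reject a second
  # submission of the same job with HTTP 409 instead of loading the data twice.
  configuration[:job_reference] = {project_id: project, job_id: job_id} if job_id
  res = client.insert_job(project, configuration,
                          upload_source: upload_source,
                          content_type: "application/octet-stream")
  res.job_reference.job_id
rescue Google::Apis::ClientError => e
  # A 409 duplicate-job error means the job already exists (e.g. submitted by
  # a run that crashed afterwards): return its ID and poll it instead of failing.
  raise unless e.status_code == 409 && e.message =~ /Job/
  job_id
end
```

The returned job ID is then handed to `wait_load`, which fetches the job via `get_job` and polls until its state reaches `DONE`; that is what allows a restarted fluentd to resume waiting for a job it did not itself start.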
@@ -881,13 +881,24 @@ class BigQueryOutputTest < Test::Unit::TestCase
           }
         }
       }, {upload_source: io, content_type: "application/octet-stream"}) {
-        s = stub!
-        status_stub = stub!
-        s.status { status_stub }
-        status_stub.state { "DONE" }
-        status_stub.error_result { nil }
-        status_stub.errors { nil }
-        s
+        Google::Apis::BigqueryV2::Job.new({
+          job_reference: Google::Apis::BigqueryV2::JobReference.new({job_id: "job_id"})
+        })
+      }
+
+      expect.get_job('yourproject_id', "job_id") {
+        Google::Apis::BigqueryV2::Job.new({
+          configuration: Google::Apis::BigqueryV2::JobConfiguration.new({
+            load: Google::Apis::BigqueryV2::JobConfigurationLoad.new({
+              destination_table: Google::Apis::BigqueryV2::TableReference.new({
+                project_id: 'yourproject_id',
+                dataset_id: 'yourdataset_id',
+                table_id: 'foo',
+              }),
+            })
+          }),
+          status: Google::Apis::BigqueryV2::JobStatus.new({state: "DONE"}),
+        })
       }
     end
 
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery-custom
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Tomohiro Hashidate
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-11 00:00:00.000000000 Z
+date: 2016-01-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake