fluent-plugin-bigquery-custom 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +29 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/out_bigquery.rb +38 -13
- data/test/plugin/test_out_bigquery.rb +18 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1d8724c6a0d0b2cfbdc48170fae356e341b70880
+  data.tar.gz: 0ce6f9e76d9cfb56a09ea34a6ad2023ec653d4bb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7fa15645710affa98c22858899a7053c598c3aa9a025ebc4c35471e21ad6f3d483d9b2a7fc6a12f4bf1ec52a926e911085680971503abb74ed7e867af8eef058
+  data.tar.gz: 4a0473a65349f20448f9ac7c1265630907ce29d036f82d1a6e153cf468a88bd0b8a241d7a7e806748b0b25c6b8649943c6bbbd9ab9b04bf79d2f1aaa6a65b0ec
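These digests cover the `metadata.gz` and `data.tar.gz` entries inside the `.gem` archive, not the `.gem` file itself. A minimal verification sketch in Ruby, assuming a locally downloaded gem (the file path is hypothetical, for illustration only):

```ruby
require "digest"
require "rubygems/package"

# A .gem file is a tar archive; checksums.yaml records digests of the
# metadata.gz and data.tar.gz entries inside it.
File.open("fluent-plugin-bigquery-custom-0.3.1.gem", "rb") do |io|
  Gem::Package::TarReader.new(io).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    # Compare against the SHA512 values published above.
    puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
  end
end
```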
data/README.md
CHANGED
@@ -26,6 +26,7 @@ OAuth flow for installed applications.
 - `skip_invalid_rows`
 - `max_bad_records`
 - `ignore_unknown_values`
+- `prevent_duplicate_load`
 - Improve error handling
 
 ## Configuration
@@ -407,6 +408,34 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 </match>
 ```
 
+### Prevent duplicate load
+
+If you want to detect duplicate load jobs, set `prevent_duplicate_load` to `true`.
+`prevent_duplicate_load` makes the load job\_id deterministic.
+For example, even if the fluentd process crashes while waiting for a job, it can resume waiting for the same job after a restart.
+
+```apache
+<match dummy>
+  type bigquery
+
+  ...
+
+  prevent_duplicate_load true
+</match>
+```
+
+The job\_id is a SHA1 digest of the following factors:
+
+- upload source path (file buffer path)
+- dataset
+- table
+- schema
+- `max_bad_records`
+- `ignore_unknown_values`
+
+NOTE: A duplicate job error does not invoke `flush_secondary`.
+NOTE: This option takes effect only when the file buffer is used.
+
 ## TODO
 
 * Automatically configured flush/buffer options
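The deterministic job ID described above is produced by the `create_job_id` helper added in `out_bigquery.rb` (shown in the next diff). A standalone sketch of the same derivation, with illustrative argument values:

```ruby
require "digest/sha1"

# Same derivation as the create_job_id helper added in this release:
# concatenate the load factors and SHA1-hash them.
def create_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
  "fluentd_job_" + Digest::SHA1.hexdigest(
    "#{upload_source_path}#{dataset}#{table}#{schema}#{max_bad_records}#{ignore_unknown_values}"
  )
end

# Illustrative values: identical inputs always yield the same job_id, so a
# restarted fluentd re-submits (or resumes) the same BigQuery load job.
id_a = create_job_id("/var/buf/q1.log", "yourdataset_id", "foo", [], 0, false)
id_b = create_job_id("/var/buf/q1.log", "yourdataset_id", "foo", [], 0, false)
raise "job_id should be deterministic" unless id_a == id_b
puts id_a
```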
data/lib/fluent/plugin/out_bigquery.rb
CHANGED

@@ -102,7 +102,10 @@ module Fluent
     config_param :utc, :bool, default: nil
     config_param :time_field, :string, default: nil
 
+    # insert_id_field (only insert)
     config_param :insert_id_field, :string, default: nil
+    # prevent_duplicate_load (only load)
+    config_param :prevent_duplicate_load, :bool, default: false
 
     config_param :method, :string, default: 'insert' # or 'load'
 
@@ -164,6 +167,7 @@ module Fluent
       if @method == "insert"
        extend(InsertImplementation)
      elsif @method == "load"
+       require 'digest/sha1'
        extend(LoadImplementation)
      else
        raise Fluend::ConfigError "'method' must be 'insert' or 'load'"
@@ -422,10 +426,13 @@ module Fluent
           raise "table created. send rows next time."
         end
 
+        reason = e.respond_to?(:reason) ? e.reason : nil
         log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
-
-
-
+
+        raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
+
+        # other error handling
+        if @secondary
          flush_secondary(@secondary)
        end
      end
@@ -447,8 +454,13 @@ module Fluent
 
      def _write(chunk, table_id)
        res = nil
+       job_id = nil
+
        create_upload_source(chunk) do |upload_source|
-
+         if @prevent_duplicate_load
+           job_id = create_job_id(upload_source.path, @dataset, @table, @fields.to_a, @max_bad_records, @ignore_unknown_values)
+         end
+         configuration = {
            configuration: {
              load: {
                destination_table: {
@@ -465,28 +477,37 @@ module Fluent
                max_bad_records: @max_bad_records,
              }
            }
-         }
+         }
+         configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+         res = client.insert_job(@project, configuration, {upload_source: upload_source, content_type: "application/octet-stream"})
        end
-
+
+       wait_load(res.job_reference.job_id)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        # api_error? -> client cache clear
        @cached_client = nil
 
-
-
-
-
+       reason = e.respond_to?(:reason) ? e.reason : nil
+       log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
+
+       raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
+       return wait_load(job_id) if e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+
+       # other error handling
+       if @secondary
          flush_secondary(@secondary)
        end
      end
 
      private
 
-     def wait_load(
+     def wait_load(job_id)
        wait_interval = 10
-       _response =
+       _response = client.get_job(@project, job_id)
+       table_id = _response.configuration.load.destination_table.table_id
+
        until _response.status.state == "DONE"
-         log.debug "wait for load job finish", state: _response.status.state
+         log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
          sleep wait_interval
          _response = client.get_job(@project, _response.job_reference.job_id)
        end
@@ -527,6 +548,10 @@ module Fluent
          end
        end
      end
+
+     def create_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
+       "fluentd_job_" + Digest::SHA1.hexdigest("#{upload_source_path}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
+     end
    end
 
    class FieldSchema
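To summarize the new rescue logic in `LoadImplementation#_write`: a `backendError` is re-raised so fluentd retries the chunk; an HTTP 409 on the job insert means the deterministic job already exists, so the plugin resumes polling it instead of failing; only other errors reach the secondary output. A condensed, self-contained sketch; `handle_insert_job_error` and the two stubs are hypothetical names for illustration, not plugin API:

```ruby
require "ostruct"

# Stubs standing in for the plugin's real methods, so the flow can be traced.
def wait_load(job_id)
  puts "resume polling existing job #{job_id}"
end

def flush_secondary(output)
  puts "handing chunk to secondary output"
end

def handle_insert_job_error(e, job_id, secondary)
  reason = e.respond_to?(:reason) ? e.reason : nil

  # 1. Transient backend failure: raise so fluentd retries the whole chunk.
  raise "failed to insert into bigquery, retry" if reason == "backendError"

  # 2. HTTP 409 on the job insert: with prevent_duplicate_load the job_id is
  #    deterministic, so the job already exists; resume waiting on it.
  #    Note the early return: this path never invokes flush_secondary.
  return wait_load(job_id) if e.status_code == 409 && e.message =~ /Job/

  # 3. Anything else: hand the chunk to the secondary output, if configured.
  flush_secondary(secondary) if secondary
end

# Simulated 409 "duplicate job" error (attribute names mirror the API error).
dup_error = OpenStruct.new(status_code: 409, message: "Job already exists", reason: "duplicate")
handle_insert_job_error(dup_error, "fluentd_job_abc123", nil)
```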
data/test/plugin/test_out_bigquery.rb
CHANGED

@@ -881,13 +881,24 @@ class BigQueryOutputTest < Test::Unit::TestCase
           }
         }
       }, {upload_source: io, content_type: "application/octet-stream"}) {
-
-
-
-
-
-
-
+      Google::Apis::BigqueryV2::Job.new({
+        job_reference: Google::Apis::BigqueryV2::JobReference.new({job_id: "job_id"})
+      })
+    }
+
+    expect.get_job('yourproject_id', "job_id") {
+      Google::Apis::BigqueryV2::Job.new({
+        configuration: Google::Apis::BigqueryV2::JobConfiguration.new({
+          load: Google::Apis::BigqueryV2::JobConfigurationLoad.new({
+            destination_table: Google::Apis::BigqueryV2::TableReference.new({
+              project_id: 'yourproject_id',
+              dataset_id: 'yourdataset_id',
+              table_id: 'foo',
+            }),
+          })
+        }),
+        status: Google::Apis::BigqueryV2::JobStatus.new({state: "DONE"}),
+      })
     }
   end
 
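The stubbed expectations above exercise the two API calls `_write` now makes: `insert_job` returns a `Job` whose `job_reference` carries `"job_id"`, and `get_job('yourproject_id', "job_id")` immediately reports `state: "DONE"`, so `wait_load` exits without sleeping. A self-contained imitation of that polling loop, using `OpenStruct` as a stand-in for the `Google::Apis::BigqueryV2` response objects (all names illustrative):

```ruby
require "ostruct"

# Fake job response: OpenStruct mimics the attribute readers that
# wait_load touches (job_reference.job_id and status.state).
done_job = OpenStruct.new(
  job_reference: OpenStruct.new(job_id: "job_id"),
  status: OpenStruct.new(state: "DONE")
)

client = Object.new
client.define_singleton_method(:get_job) { |_project, _job_id| done_job }

# The polling loop from wait_load, reduced to its essentials: because the
# fake job is already DONE, the loop body (sleep + re-poll) never runs.
_response = client.get_job("yourproject_id", "job_id")
until _response.status.state == "DONE"
  sleep 10
  _response = client.get_job("yourproject_id", _response.job_reference.job_id)
end
puts "load job #{_response.job_reference.job_id} finished"
```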
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery-custom
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Tomohiro Hashidate
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-
+date: 2016-01-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake