fluent-plugin-bigquery-custom 0.3.0 → 0.3.1
- checksums.yaml +4 -4
- data/README.md +29 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/out_bigquery.rb +38 -13
- data/test/plugin/test_out_bigquery.rb +18 -7
- metadata +2 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1d8724c6a0d0b2cfbdc48170fae356e341b70880
+  data.tar.gz: 0ce6f9e76d9cfb56a09ea34a6ad2023ec653d4bb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7fa15645710affa98c22858899a7053c598c3aa9a025ebc4c35471e21ad6f3d483d9b2a7fc6a12f4bf1ec52a926e911085680971503abb74ed7e867af8eef058
+  data.tar.gz: 4a0473a65349f20448f9ac7c1265630907ce29d036f82d1a6e153cf468a88bd0b8a241d7a7e806748b0b25c6b8649943c6bbbd9ab9b04bf79d2f1aaa6a65b0ec
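These are the SHA1 and SHA512 digests RubyGems records for the two archives packed inside the `.gem` file. A download can be sanity-checked locally; a minimal sketch, assuming the raw gem archive has already been extracted (a `.gem` is a plain tar file containing `metadata.gz` and `data.tar.gz`):

```ruby
require 'digest'

# After `tar xf fluent-plugin-bigquery-custom-0.3.1.gem`, metadata.gz and
# data.tar.gz sit in the current directory. Recompute the SHA1 of metadata.gz
# and compare it with the value recorded in checksums.yaml above.
actual   = Digest::SHA1.file('metadata.gz').hexdigest
expected = '1d8724c6a0d0b2cfbdc48170fae356e341b70880'
puts(actual == expected ? 'metadata.gz checksum OK' : 'metadata.gz checksum MISMATCH')
```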
data/README.md
CHANGED

@@ -26,6 +26,7 @@ OAuth flow for installed applications.
 - `skip_invalid_rows`
 - `max_bad_records`
 - `ignore_unknown_values`
+- `prevent_duplicate_load`
 - Improve error handling
 
 ## Configuration

@@ -407,6 +408,34 @@ You can set `insert_id_field` option to specify the field to use as `insertId` property
 </match>
 ```
 
+### Prevent duplicate load
+
+If you want to detect duplicate load jobs, set `prevent_duplicate_load` to `true`.
+`prevent_duplicate_load` makes the load job_id deterministic.
+For example, even if the fluentd process crashes while waiting for a job, fluentd can resume waiting for the same job after a restart.
+
+```apache
+<match dummy>
+  type bigquery
+
+  ...
+
+  prevent_duplicate_load true
+</match>
+```
+
+The job_id is calculated as a SHA1 digest over the following factors:
+
+- upload source path (file buffer path)
+- dataset
+- table
+- schema
+- `max_bad_records`
+- `ignore_unknown_values`
+
+NOTE: A duplicate-job error does not invoke `flush_secondary`.
+NOTE: This option takes effect only when the file buffer is used.
+
 ## TODO
 
 * Automatically configured flush/buffer options
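The resume behavior described in the README section above relies on BigQuery rejecting a `jobs.insert` that reuses an existing job id. Here is a minimal sketch of that flow, using the same `google-api-client` calls that appear in the plugin diff below (`insert_job`, `get_job`); the helper itself is illustrative and not part of the gem:

```ruby
require 'google/apis/bigquery_v2'

# With a deterministic job_id, resubmitting the same chunk after a crash trips
# BigQuery's duplicate-job check: jobs.insert answers HTTP 409 for an id that
# already exists, so the caller can go straight back to polling that job.
def submit_or_resume(client, project, configuration, upload_source, job_id)
  client.insert_job(project, configuration,
                    upload_source: upload_source,
                    content_type: 'application/octet-stream')
rescue Google::Apis::ClientError => e
  raise unless e.status_code == 409 && e.message =~ /Job/
  client.get_job(project, job_id) # the job from the previous run; keep waiting on it
end
```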
data/lib/fluent/plugin/out_bigquery.rb
CHANGED

@@ -102,7 +102,10 @@ module Fluent
     config_param :utc, :bool, default: nil
     config_param :time_field, :string, default: nil
 
+    # insert_id_field (only insert)
     config_param :insert_id_field, :string, default: nil
+    # prevent_duplicate_load (only load)
+    config_param :prevent_duplicate_load, :bool, default: false
 
     config_param :method, :string, default: 'insert' # or 'load'
 
@@ -164,6 +167,7 @@ module Fluent
       if @method == "insert"
         extend(InsertImplementation)
       elsif @method == "load"
+        require 'digest/sha1'
         extend(LoadImplementation)
       else
         raise Fluent::ConfigError, "'method' must be 'insert' or 'load'"
@@ -422,10 +426,13 @@ module Fluent
         raise "table created. send rows next time."
       end
 
+      reason = e.respond_to?(:reason) ? e.reason : nil
       log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
-
-
-
+
+      raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
+
+      # other error handling
+      if @secondary
         flush_secondary(@secondary)
       end
     end
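The branching added in this hunk is easier to read in isolation: derive a `reason` defensively (not every `Google::Apis` error class responds to `#reason`), retry only transient `backendError`s, and hand everything else to the `<secondary>` output. A condensed sketch; `handle_insert_error` is a hypothetical name, and `flush_secondary` is fluentd's standard failover hook for `<secondary>` outputs:

```ruby
# Illustrative restatement of the rescue body above, not the gem's code.
def handle_insert_error(e)
  reason = e.respond_to?(:reason) ? e.reason : nil

  # backendError is transient on BigQuery's side; raising makes fluentd's
  # buffer keep the chunk and retry it later.
  raise "failed to insert into bigquery, retry" if reason == "backendError"

  # Anything else is treated as permanent: route the chunk to <secondary>
  # (if configured) instead of retrying forever.
  flush_secondary(@secondary) if @secondary
end
```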
@@ -447,8 +454,13 @@ module Fluent
 
       def _write(chunk, table_id)
         res = nil
+        job_id = nil
+
         create_upload_source(chunk) do |upload_source|
-
+          if @prevent_duplicate_load
+            job_id = create_job_id(upload_source.path, @dataset, @table, @fields.to_a, @max_bad_records, @ignore_unknown_values)
+          end
+          configuration = {
             configuration: {
               load: {
                 destination_table: {
@@ -465,28 +477,37 @@ module Fluent
                 max_bad_records: @max_bad_records,
               }
             }
-        }
+          }
+          configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+          res = client.insert_job(@project, configuration, {upload_source: upload_source, content_type: "application/octet-stream"})
         end
-
+
+        wait_load(res.job_reference.job_id)
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         # api_error? -> client cache clear
         @cached_client = nil
 
-
-
-
-
+        reason = e.respond_to?(:reason) ? e.reason : nil
+        log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
+
+        raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
+        return wait_load(job_id) if e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+
+        # other error handling
+        if @secondary
          flush_secondary(@secondary)
         end
       end
 
       private
 
-      def wait_load(
+      def wait_load(job_id)
         wait_interval = 10
-        _response =
+        _response = client.get_job(@project, job_id)
+        table_id = _response.configuration.load.destination_table.table_id
+
         until _response.status.state == "DONE"
-          log.debug "wait for load job finish", state: _response.status.state
+          log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
           sleep wait_interval
           _response = client.get_job(@project, _response.job_reference.job_id)
         end
@@ -527,6 +548,10 @@ module Fluent
           end
         end
       end
+
+      def create_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
+        "fluentd_job_" + Digest::SHA1.hexdigest("#{upload_source_path}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
+      end
     end
 
     class FieldSchema
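`create_job_id` is a pure function of its six inputs, so the same buffer chunk aimed at the same table always produces the same job id, which is exactly what lets BigQuery deduplicate the load. A quick check with made-up values (path, dataset, and table are illustrative):

```ruby
require 'digest/sha1'

# Same derivation as in the diff above.
def create_job_id(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
  "fluentd_job_" + Digest::SHA1.hexdigest("#{upload_source_path}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
end

first = create_job_id("/var/log/fluent/buffer.b1.log", "yourdataset_id", "foo", [], 0, false)
again = create_job_id("/var/log/fluent/buffer.b1.log", "yourdataset_id", "foo", [], 0, false)
first == again # => true: a resubmission reuses the id, BigQuery answers 409,
               #    and the plugin resumes wait_load instead of loading twice
```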
data/test/plugin/test_out_bigquery.rb
CHANGED

@@ -881,13 +881,24 @@ class BigQueryOutputTest < Test::Unit::TestCase
           }
         }
       }, {upload_source: io, content_type: "application/octet-stream"}) {
-
-
-
-
-
-
-
+        Google::Apis::BigqueryV2::Job.new({
+          job_reference: Google::Apis::BigqueryV2::JobReference.new({job_id: "job_id"})
+        })
+      }
+
+      expect.get_job('yourproject_id', "job_id") {
+        Google::Apis::BigqueryV2::Job.new({
+          configuration: Google::Apis::BigqueryV2::JobConfiguration.new({
+            load: Google::Apis::BigqueryV2::JobConfigurationLoad.new({
+              destination_table: Google::Apis::BigqueryV2::TableReference.new({
+                project_id: 'yourproject_id',
+                dataset_id: 'yourdataset_id',
+                table_id: 'foo',
+              }),
+            })
+          }),
+          status: Google::Apis::BigqueryV2::JobStatus.new({state: "DONE"}),
+        })
     }
   end
 
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery-custom
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Tomohiro Hashidate
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-
+date: 2016-01-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake