fluent-plugin-bigquery 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fb46d9ded6ca44476f2a241a4a08f5abff3e99f4
|
4
|
+
data.tar.gz: 2729484cdd6de6edbd9636f0c01eeb69c9b0368b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aa84153cb3e53c093cc888f93ea211e1f6852f2f6a08ad7eab875438d7e7c0a5be8ab9b1c8b9c181d3655c981b47e756ed9adf06fbe71142f98bb9f128f773e2
|
7
|
+
data.tar.gz: a0fd64ab52abe46eccde000d364ce79dca01a3ae3d9dde48d36963ae4ca03bfe9e17dc913b10d5ea6706765dff2cbc0a8bc34df7b1ba1a3345accd60283478e0
|
@@ -4,7 +4,7 @@ module Fluent
|
|
4
4
|
class Error < StandardError
|
5
5
|
RETRYABLE_ERROR_REASON = %w(backendError internalError rateLimitExceeded tableUnavailable).freeze
|
6
6
|
RETRYABLE_INSERT_ERRORS_REASON = %w(timeout).freeze
|
7
|
-
RETRYABLE_STATUS_CODE = [500, 503]
|
7
|
+
RETRYABLE_STATUS_CODE = [500, 502, 503, 504]
|
8
8
|
|
9
9
|
class << self
|
10
10
|
def wrap(google_api_error, message = nil, force_unretryable: false)
|
@@ -1,10 +1,10 @@
|
|
1
1
|
module Fluent
|
2
2
|
module BigQuery
|
3
3
|
class Writer
|
4
|
-
def initialize(log, auth_method,
|
4
|
+
def initialize(log, auth_method, options = {})
|
5
5
|
@auth_method = auth_method
|
6
6
|
@scope = "https://www.googleapis.com/auth/bigquery"
|
7
|
-
@
|
7
|
+
@options = options
|
8
8
|
@log = log
|
9
9
|
@num_errors_per_chunk = {}
|
10
10
|
|
@@ -22,7 +22,7 @@ module Fluent
|
|
22
22
|
@client = client
|
23
23
|
end
|
24
24
|
|
25
|
-
def create_table(project, dataset, table_id, record_schema
|
25
|
+
def create_table(project, dataset, table_id, record_schema)
|
26
26
|
create_table_retry_limit = 3
|
27
27
|
create_table_retry_wait = 1
|
28
28
|
create_table_retry_count = 0
|
@@ -38,10 +38,10 @@ module Fluent
|
|
38
38
|
}
|
39
39
|
}
|
40
40
|
|
41
|
-
if time_partitioning_type
|
41
|
+
if @options[:time_partitioning_type]
|
42
42
|
definition[:time_partitioning] = {
|
43
|
-
type: time_partitioning_type.to_s.upcase,
|
44
|
-
expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
|
43
|
+
type: @options[:time_partitioning_type].to_s.upcase,
|
44
|
+
expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil
|
45
45
|
}.compact
|
46
46
|
end
|
47
47
|
client.insert_table(project, dataset, definition, {})
|
@@ -84,21 +84,21 @@ module Fluent
|
|
84
84
|
nil
|
85
85
|
end
|
86
86
|
|
87
|
-
def insert_rows(project, dataset, table_id, rows,
|
87
|
+
def insert_rows(project, dataset, table_id, rows, template_suffix: nil)
|
88
88
|
body = {
|
89
89
|
rows: rows,
|
90
|
-
skip_invalid_rows: skip_invalid_rows,
|
91
|
-
ignore_unknown_values: ignore_unknown_values,
|
90
|
+
skip_invalid_rows: @options[:skip_invalid_rows],
|
91
|
+
ignore_unknown_values: @options[:ignore_unknown_values],
|
92
92
|
}
|
93
93
|
body.merge!(template_suffix: template_suffix) if template_suffix
|
94
94
|
res = client.insert_all_table_data(project, dataset, table_id, body, {
|
95
|
-
options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
|
95
|
+
options: {timeout_sec: @options[:timeout_sec], open_timeout_sec: @options[:open_timeout_sec]}
|
96
96
|
})
|
97
97
|
log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
|
98
98
|
|
99
99
|
if res.insert_errors && !res.insert_errors.empty?
|
100
100
|
log.warn "insert errors", project_id: project, dataset: dataset, table: table_id, insert_errors: res.insert_errors.to_s
|
101
|
-
if allow_retry_insert_errors
|
101
|
+
if @options[:allow_retry_insert_errors]
|
102
102
|
is_included_any_retryable_insert_error = res.insert_errors.any? do |insert_error|
|
103
103
|
insert_error.errors.any? { |error| Fluent::BigQuery::Error.retryable_insert_errors_reason?(error.reason) }
|
104
104
|
end
|
@@ -118,7 +118,7 @@ module Fluent
|
|
118
118
|
raise Fluent::BigQuery::Error.wrap(e)
|
119
119
|
end
|
120
120
|
|
121
|
-
def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields
|
121
|
+
def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields)
|
122
122
|
configuration = {
|
123
123
|
configuration: {
|
124
124
|
load: {
|
@@ -132,14 +132,14 @@ module Fluent
|
|
132
132
|
},
|
133
133
|
write_disposition: "WRITE_APPEND",
|
134
134
|
source_format: "NEWLINE_DELIMITED_JSON",
|
135
|
-
ignore_unknown_values: ignore_unknown_values,
|
136
|
-
max_bad_records: max_bad_records,
|
135
|
+
ignore_unknown_values: @options[:ignore_unknown_values],
|
136
|
+
max_bad_records: @options[:max_bad_records],
|
137
137
|
}
|
138
138
|
}
|
139
139
|
}
|
140
140
|
|
141
|
-
job_id = create_job_id(chunk_id, dataset, table_id, fields.to_a
|
142
|
-
configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if time_partitioning_type
|
141
|
+
job_id = create_job_id(chunk_id, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
|
142
|
+
configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if @options[:time_partitioning_type]
|
143
143
|
configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
|
144
144
|
|
145
145
|
# If target table is already exist, omit schema configuration.
|
@@ -159,8 +159,8 @@ module Fluent
|
|
159
159
|
upload_source: upload_source,
|
160
160
|
content_type: "application/octet-stream",
|
161
161
|
options: {
|
162
|
-
timeout_sec: timeout_sec,
|
163
|
-
open_timeout_sec: open_timeout_sec,
|
162
|
+
timeout_sec: @options[:timeout_sec],
|
163
|
+
open_timeout_sec: @options[:open_timeout_sec],
|
164
164
|
}
|
165
165
|
}
|
166
166
|
)
|
@@ -172,14 +172,19 @@ module Fluent
|
|
172
172
|
reason = e.respond_to?(:reason) ? e.reason : nil
|
173
173
|
log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
|
174
174
|
|
175
|
-
if auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
|
175
|
+
if @options[:auto_create_table] && e.status_code == 404 && /Not Found: Table/i =~ e.message
|
176
176
|
# Table Not Found: Auto Create Table
|
177
|
-
create_table(
|
177
|
+
create_table(
|
178
|
+
project,
|
179
|
+
dataset,
|
180
|
+
table_id,
|
181
|
+
fields,
|
182
|
+
)
|
178
183
|
raise "table created. send rows next time."
|
179
184
|
end
|
180
185
|
|
181
186
|
if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
|
182
|
-
wait_load_job(chunk_id, project, dataset, job_id, table_id)
|
187
|
+
wait_load_job(chunk_id, project, dataset, job_id, table_id)
|
183
188
|
@num_errors_per_chunk.delete(chunk_id)
|
184
189
|
return
|
185
190
|
end
|
@@ -242,9 +247,9 @@ module Fluent
|
|
242
247
|
|
243
248
|
def get_auth_from_private_key
|
244
249
|
require 'google/api_client/auth/key_utils'
|
245
|
-
private_key_path = @
|
246
|
-
private_key_passphrase = @
|
247
|
-
email = @
|
250
|
+
private_key_path = @options[:private_key_path]
|
251
|
+
private_key_passphrase = @options[:private_key_passphrase]
|
252
|
+
email = @options[:email]
|
248
253
|
|
249
254
|
key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase)
|
250
255
|
Signet::OAuth2::Client.new(
|
@@ -261,7 +266,7 @@ module Fluent
|
|
261
266
|
end
|
262
267
|
|
263
268
|
def get_auth_from_json_key
|
264
|
-
json_key = @
|
269
|
+
json_key = @options[:json_key]
|
265
270
|
|
266
271
|
begin
|
267
272
|
JSON.parse(json_key)
|
@@ -283,8 +288,8 @@ module Fluent
|
|
283
288
|
table_id.gsub(/\$\d+$/, "")
|
284
289
|
end
|
285
290
|
|
286
|
-
def create_job_id(chunk_id, dataset, table, schema
|
287
|
-
job_id_key = "#{chunk_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}#{@num_errors_per_chunk[chunk_id]}"
|
291
|
+
def create_job_id(chunk_id, dataset, table, schema)
|
292
|
+
job_id_key = "#{chunk_id}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id]}"
|
288
293
|
@log.debug "job_id_key: #{job_id_key}"
|
289
294
|
"fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
|
290
295
|
end
|
@@ -288,6 +288,16 @@ module Fluent
|
|
288
288
|
private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
|
289
289
|
email: @email,
|
290
290
|
json_key: @json_key,
|
291
|
+
skip_invalid_rows: @skip_invalid_rows,
|
292
|
+
ignore_unknown_values: @ignore_unknown_values,
|
293
|
+
max_bad_records: @max_bad_records,
|
294
|
+
allow_retry_insert_errors: @allow_retry_insert_errors,
|
295
|
+
prevent_duplicate_load: @prevent_duplicate_load,
|
296
|
+
auto_create_table: @auto_create_table,
|
297
|
+
time_partitioning_type: @time_partitioning_type,
|
298
|
+
time_partitioning_expiration: @time_partitioning_expiration,
|
299
|
+
timeout_sec: @request_timeout_sec,
|
300
|
+
open_timeout_sec: @request_open_timeout_sec,
|
291
301
|
})
|
292
302
|
end
|
293
303
|
|
@@ -427,11 +437,11 @@ module Fluent
|
|
427
437
|
end
|
428
438
|
|
429
439
|
def insert(table_id, rows, template_suffix)
|
430
|
-
writer.insert_rows(@project, @dataset, table_id, rows,
|
440
|
+
writer.insert_rows(@project, @dataset, table_id, rows, template_suffix: template_suffix)
|
431
441
|
rescue Fluent::BigQuery::Error => e
|
432
442
|
if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
|
433
443
|
# Table Not Found: Auto Create Table
|
434
|
-
writer.create_table(@project, @dataset, table_id, @fields
|
444
|
+
writer.create_table(@project, @dataset, table_id, @fields)
|
435
445
|
raise "table created. send rows next time."
|
436
446
|
end
|
437
447
|
|
@@ -473,12 +483,7 @@ module Fluent
|
|
473
483
|
res = nil
|
474
484
|
|
475
485
|
create_upload_source(chunk) do |upload_source|
|
476
|
-
res = writer.create_load_job(chunk.unique_id, @project, @dataset, table_id, upload_source, @fields
|
477
|
-
prevent_duplicate_load: @prevent_duplicate_load,
|
478
|
-
ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
|
479
|
-
timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec, auto_create_table: @auto_create_table,
|
480
|
-
time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration
|
481
|
-
})
|
486
|
+
res = writer.create_load_job(chunk.unique_id, @project, @dataset, table_id, upload_source, @fields)
|
482
487
|
end
|
483
488
|
rescue Fluent::BigQuery::Error => e
|
484
489
|
if e.retryable?
|
@@ -754,10 +754,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
|
|
754
754
|
driver = create_driver
|
755
755
|
|
756
756
|
writer = stub_writer(driver)
|
757
|
-
mock.proxy(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', entry,
|
758
|
-
skip_invalid_rows: false,
|
759
|
-
ignore_unknown_values: false
|
760
|
-
))
|
757
|
+
mock.proxy(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', entry, template_suffix: nil)
|
761
758
|
mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', {
|
762
759
|
rows: entry,
|
763
760
|
skip_invalid_rows: false,
|
@@ -780,62 +777,71 @@ class BigQueryOutputTest < Test::Unit::TestCase
|
|
780
777
|
|
781
778
|
def test_write_with_retryable_error
|
782
779
|
entry = {json: {a: "b"}}, {json: {b: "c"}}
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
time_format %s
|
791
|
-
time_field time
|
792
|
-
|
793
|
-
schema [
|
794
|
-
{"name": "time", "type": "INTEGER"},
|
795
|
-
{"name": "status", "type": "INTEGER"},
|
796
|
-
{"name": "bytes", "type": "INTEGER"},
|
797
|
-
{"name": "vhost", "type": "STRING"},
|
798
|
-
{"name": "path", "type": "STRING"},
|
799
|
-
{"name": "method", "type": "STRING"},
|
800
|
-
{"name": "protocol", "type": "STRING"},
|
801
|
-
{"name": "agent", "type": "STRING"},
|
802
|
-
{"name": "referer", "type": "STRING"},
|
803
|
-
{"name": "remote", "type": "RECORD", "fields": [
|
804
|
-
{"name": "host", "type": "STRING"},
|
805
|
-
{"name": "ip", "type": "STRING"},
|
806
|
-
{"name": "user", "type": "STRING"}
|
807
|
-
]},
|
808
|
-
{"name": "requesttime", "type": "FLOAT"},
|
809
|
-
{"name": "bot_access", "type": "BOOLEAN"},
|
810
|
-
{"name": "loginsession", "type": "BOOLEAN"}
|
811
|
-
]
|
812
|
-
<secondary>
|
813
|
-
type file
|
814
|
-
path error
|
815
|
-
utc
|
816
|
-
</secondary>
|
817
|
-
CONFIG
|
780
|
+
data_input = [
|
781
|
+
{ "status_code" => 500 },
|
782
|
+
{ "status_code" => 502 },
|
783
|
+
{ "status_code" => 503 },
|
784
|
+
{ "status_code" => 504 },
|
785
|
+
]
|
818
786
|
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
raise ex
|
827
|
-
end
|
787
|
+
data_input.each do |d|
|
788
|
+
driver = create_driver(<<-CONFIG)
|
789
|
+
table foo
|
790
|
+
email foo@bar.example
|
791
|
+
private_key_path /path/to/key
|
792
|
+
project yourproject_id
|
793
|
+
dataset yourdataset_id
|
828
794
|
|
829
|
-
|
830
|
-
|
831
|
-
chunk << e.to_msgpack
|
832
|
-
end
|
795
|
+
time_format %s
|
796
|
+
time_field time
|
833
797
|
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
798
|
+
schema [
|
799
|
+
{"name": "time", "type": "INTEGER"},
|
800
|
+
{"name": "status", "type": "INTEGER"},
|
801
|
+
{"name": "bytes", "type": "INTEGER"},
|
802
|
+
{"name": "vhost", "type": "STRING"},
|
803
|
+
{"name": "path", "type": "STRING"},
|
804
|
+
{"name": "method", "type": "STRING"},
|
805
|
+
{"name": "protocol", "type": "STRING"},
|
806
|
+
{"name": "agent", "type": "STRING"},
|
807
|
+
{"name": "referer", "type": "STRING"},
|
808
|
+
{"name": "remote", "type": "RECORD", "fields": [
|
809
|
+
{"name": "host", "type": "STRING"},
|
810
|
+
{"name": "ip", "type": "STRING"},
|
811
|
+
{"name": "user", "type": "STRING"}
|
812
|
+
]},
|
813
|
+
{"name": "requesttime", "type": "FLOAT"},
|
814
|
+
{"name": "bot_access", "type": "BOOLEAN"},
|
815
|
+
{"name": "loginsession", "type": "BOOLEAN"}
|
816
|
+
]
|
817
|
+
<secondary>
|
818
|
+
type file
|
819
|
+
path error
|
820
|
+
utc
|
821
|
+
</secondary>
|
822
|
+
CONFIG
|
823
|
+
|
824
|
+
writer = stub_writer(driver)
|
825
|
+
mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', {
|
826
|
+
rows: entry,
|
827
|
+
skip_invalid_rows: false,
|
828
|
+
ignore_unknown_values: false
|
829
|
+
}, {options: {timeout_sec: nil, open_timeout_sec: 60}}) do
|
830
|
+
ex = Google::Apis::ServerError.new("error", status_code: d["status_code"])
|
831
|
+
raise ex
|
832
|
+
end
|
833
|
+
|
834
|
+
chunk = Fluent::MemoryBufferChunk.new("my.tag")
|
835
|
+
entry.each do |e|
|
836
|
+
chunk << e.to_msgpack
|
837
|
+
end
|
838
|
+
|
839
|
+
driver.instance.start
|
840
|
+
assert_raise Fluent::BigQuery::RetryableError do
|
841
|
+
driver.instance.write(chunk)
|
842
|
+
end
|
843
|
+
driver.instance.shutdown
|
844
|
+
end
|
839
845
|
end
|
840
846
|
|
841
847
|
def test_write_with_not_retryable_error
|
@@ -1455,11 +1461,8 @@ class BigQueryOutputTest < Test::Unit::TestCase
|
|
1455
1461
|
schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
|
1456
1462
|
CONFIG
|
1457
1463
|
writer = stub_writer(driver)
|
1458
|
-
mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message],
|
1459
|
-
|
1460
|
-
ignore_unknown_values: false,
|
1461
|
-
)) { raise Fluent::BigQuery::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
|
1462
|
-
mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: nil, time_partitioning_expiration: nil)
|
1464
|
+
mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], template_suffix: nil) { raise Fluent::BigQuery::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
|
1465
|
+
mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))
|
1463
1466
|
|
1464
1467
|
chunk = Fluent::MemoryBufferChunk.new("my.tag")
|
1465
1468
|
chunk << message.to_msgpack
|
@@ -1517,11 +1520,8 @@ class BigQueryOutputTest < Test::Unit::TestCase
|
|
1517
1520
|
time_partitioning_expiration 1h
|
1518
1521
|
CONFIG
|
1519
1522
|
writer = stub_writer(driver)
|
1520
|
-
mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message],
|
1521
|
-
|
1522
|
-
ignore_unknown_values: false,
|
1523
|
-
)) { raise Fluent::BigQuery::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
|
1524
|
-
mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: :day, time_partitioning_expiration: 3600)
|
1523
|
+
mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], template_suffix: nil) { raise Fluent::BigQuery::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
|
1524
|
+
mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))
|
1525
1525
|
|
1526
1526
|
chunk = Fluent::MemoryBufferChunk.new("my.tag")
|
1527
1527
|
chunk << message.to_msgpack
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naoya Ito
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-03-
|
12
|
+
date: 2017-03-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|