fluent-plugin-bigquery 0.4.1 → 0.4.2
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fb46d9ded6ca44476f2a241a4a08f5abff3e99f4
+  data.tar.gz: 2729484cdd6de6edbd9636f0c01eeb69c9b0368b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: aa84153cb3e53c093cc888f93ea211e1f6852f2f6a08ad7eab875438d7e7c0a5be8ab9b1c8b9c181d3655c981b47e756ed9adf06fbe71142f98bb9f128f773e2
+  data.tar.gz: a0fd64ab52abe46eccde000d364ce79dca01a3ae3d9dde48d36963ae4ca03bfe9e17dc913b10d5ea6706765dff2cbc0a8bc34df7b1ba1a3345accd60283478e0
@@ -4,7 +4,7 @@ module Fluent
     class Error < StandardError
       RETRYABLE_ERROR_REASON = %w(backendError internalError rateLimitExceeded tableUnavailable).freeze
       RETRYABLE_INSERT_ERRORS_REASON = %w(timeout).freeze
-      RETRYABLE_STATUS_CODE = [500, 503]
+      RETRYABLE_STATUS_CODE = [500, 502, 503, 504]

      class << self
        def wrap(google_api_error, message = nil, force_unretryable: false)
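The substantive change above: HTTP 502 (Bad Gateway) and 504 (Gateway Timeout) now count as retryable server errors alongside 500 and 503. A minimal sketch of the effect, assuming classification is a simple membership test (the helper name below is illustrative, not the plugin's actual API):

    RETRYABLE_STATUS_CODE = [500, 502, 503, 504].freeze

    # Hypothetical helper: transient gateway/server failures are retried.
    def retryable_status?(status_code)
      RETRYABLE_STATUS_CODE.include?(status_code)
    end

    retryable_status?(502) # => true (0.4.1 treated 502 as unretryable)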
@@ -1,10 +1,10 @@
 module Fluent
   module BigQuery
     class Writer
-      def initialize(log, auth_method,
+      def initialize(log, auth_method, options = {})
         @auth_method = auth_method
         @scope = "https://www.googleapis.com/auth/bigquery"
-        @
+        @options = options
         @log = log
         @num_errors_per_chunk = {}

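Writer's long parameter list collapses into a single options hash kept in @options; the methods below read their settings from it. A sketch of the new construction, with placeholder values (the option keys come from the out_bigquery.rb hunk further down):

    # Placeholder auth method and values, for illustration only.
    writer = Fluent::BigQuery::Writer.new(log, 'private_key', {
      private_key_path: "/path/to/key.p12",
      email: "foo@bar.example",
      skip_invalid_rows: false,
      ignore_unknown_values: false,
      timeout_sec: nil,
      open_timeout_sec: 60,
    })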
@@ -22,7 +22,7 @@ module Fluent
         @client = client
       end

-      def create_table(project, dataset, table_id, record_schema
+      def create_table(project, dataset, table_id, record_schema)
         create_table_retry_limit = 3
         create_table_retry_wait = 1
         create_table_retry_count = 0
@@ -38,10 +38,10 @@ module Fluent
           }
         }

-        if time_partitioning_type
+        if @options[:time_partitioning_type]
           definition[:time_partitioning] = {
-            type: time_partitioning_type.to_s.upcase,
-            expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
+            type: @options[:time_partitioning_type].to_s.upcase,
+            expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil
           }.compact
         end
         client.insert_table(project, dataset, definition, {})
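create_table reads the partitioning settings from @options as well. Worth noting: time_partitioning_expiration is configured in seconds while BigQuery expects milliseconds, and .compact drops the key when no expiration is set. For example, with a type of :day and an expiration of 3600 seconds:

    # @options = { time_partitioning_type: :day, time_partitioning_expiration: 3600 }
    definition[:time_partitioning] = {
      type: :day.to_s.upcase,      # => "DAY"
      expiration_ms: 3600 * 1000,  # => 3_600_000
    }.compact
    # With no expiration configured, expiration_ms is nil and .compact removes it.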
@@ -84,21 +84,21 @@ module Fluent
         nil
       end

-      def insert_rows(project, dataset, table_id, rows,
+      def insert_rows(project, dataset, table_id, rows, template_suffix: nil)
         body = {
           rows: rows,
-          skip_invalid_rows: skip_invalid_rows,
-          ignore_unknown_values: ignore_unknown_values,
+          skip_invalid_rows: @options[:skip_invalid_rows],
+          ignore_unknown_values: @options[:ignore_unknown_values],
         }
         body.merge!(template_suffix: template_suffix) if template_suffix
         res = client.insert_all_table_data(project, dataset, table_id, body, {
-          options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
+          options: {timeout_sec: @options[:timeout_sec], open_timeout_sec: @options[:open_timeout_sec]}
         })
         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size

         if res.insert_errors && !res.insert_errors.empty?
           log.warn "insert errors", project_id: project, dataset: dataset, table: table_id, insert_errors: res.insert_errors.to_s
-          if allow_retry_insert_errors
+          if @options[:allow_retry_insert_errors]
             is_included_any_retryable_insert_error = res.insert_errors.any? do |insert_error|
               insert_error.errors.any? { |error| Fluent::BigQuery::Error.retryable_insert_errors_reason?(error.reason) }
             end
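insert_rows likewise drops its per-call settings in favor of @options; only template_suffix survives as a keyword argument, since it varies per call and is merged into the request body only when present. A sketch of the two call shapes (table name and suffix value are hypothetical):

    # Plain insert; skip_invalid_rows and ignore_unknown_values come from @options.
    writer.insert_rows(project, dataset, "access_log", rows)

    # Insert into a template table; "_20170327" is an illustrative suffix.
    writer.insert_rows(project, dataset, "access_log", rows, template_suffix: "_20170327")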
@@ -118,7 +118,7 @@ module Fluent
         raise Fluent::BigQuery::Error.wrap(e)
       end

-      def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields
+      def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields)
         configuration = {
           configuration: {
             load: {
@@ -132,14 +132,14 @@ module Fluent
               },
               write_disposition: "WRITE_APPEND",
               source_format: "NEWLINE_DELIMITED_JSON",
-              ignore_unknown_values: ignore_unknown_values,
-              max_bad_records: max_bad_records,
+              ignore_unknown_values: @options[:ignore_unknown_values],
+              max_bad_records: @options[:max_bad_records],
             }
           }
         }

-        job_id = create_job_id(chunk_id, dataset, table_id, fields.to_a
-        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if time_partitioning_type
+        job_id = create_job_id(chunk_id, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
+        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if @options[:time_partitioning_type]
         configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id

         # If target table is already exist, omit schema configuration.
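The load configuration pulls ignore_unknown_values and max_bad_records from @options, and the deterministic job ID is generated only when prevent_duplicate_load is enabled; without a job_id, no job_reference is attached and BigQuery assigns its own. A sketch of the gating and its payoff:

    # Resubmitting the same chunk with the same settings yields the same
    # job_id, so BigQuery rejects the duplicate load with HTTP 409 and the
    # plugin waits on the original job instead (see the 409 branch in a
    # later hunk).
    job_id = create_job_id(chunk_id, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
    configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id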
@@ -159,8 +159,8 @@ module Fluent
           upload_source: upload_source,
           content_type: "application/octet-stream",
           options: {
-            timeout_sec: timeout_sec,
-            open_timeout_sec: open_timeout_sec,
+            timeout_sec: @options[:timeout_sec],
+            open_timeout_sec: @options[:open_timeout_sec],
           }
         }
       )
@@ -172,14 +172,19 @@ module Fluent
         reason = e.respond_to?(:reason) ? e.reason : nil
         log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason

-        if auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+        if @options[:auto_create_table] && e.status_code == 404 && /Not Found: Table/i =~ e.message
           # Table Not Found: Auto Create Table
-          create_table(
+          create_table(
+            project,
+            dataset,
+            table_id,
+            fields,
+          )
           raise "table created. send rows next time."
         end

         if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
-          wait_load_job(chunk_id, project, dataset, job_id, table_id)
+          wait_load_job(chunk_id, project, dataset, job_id, table_id)
           @num_errors_per_chunk.delete(chunk_id)
           return
         end
@@ -242,9 +247,9 @@ module Fluent

       def get_auth_from_private_key
         require 'google/api_client/auth/key_utils'
-        private_key_path = @
-        private_key_passphrase = @
-        email = @
+        private_key_path = @options[:private_key_path]
+        private_key_passphrase = @options[:private_key_passphrase]
+        email = @options[:email]

         key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase)
         Signet::OAuth2::Client.new(
@@ -261,7 +266,7 @@ module Fluent
       end

       def get_auth_from_json_key
-        json_key = @
+        json_key = @options[:json_key]

         begin
           JSON.parse(json_key)
@@ -283,8 +288,8 @@ module Fluent
         table_id.gsub(/\$\d+$/, "")
       end

-      def create_job_id(chunk_id, dataset, table, schema
-        job_id_key = "#{chunk_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}#{@num_errors_per_chunk[chunk_id]}"
+      def create_job_id(chunk_id, dataset, table, schema)
+        job_id_key = "#{chunk_id}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id]}"
         @log.debug "job_id_key: #{job_id_key}"
         "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
       end
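create_job_id loses its trailing parameters; max_bad_records and ignore_unknown_values now come from @options but still feed the hash, so the ID stays deterministic for a given chunk, destination, schema, and settings. A standalone sketch of the scheme with made-up inputs:

    require 'digest/sha1'

    # @num_errors_per_chunk[chunk_id] is part of the key, so a chunk that has
    # already failed gets a fresh job ID on its next attempt.
    job_id_key = "chunk1" + "yourdataset_id" + "foo" + "[]" + "0" + "false" + "0"
    job_id = "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
    # => a stable "fluentd_job_<40 hex chars>" for identical inputs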
@@ -288,6 +288,16 @@ module Fluent
         private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
         email: @email,
         json_key: @json_key,
+        skip_invalid_rows: @skip_invalid_rows,
+        ignore_unknown_values: @ignore_unknown_values,
+        max_bad_records: @max_bad_records,
+        allow_retry_insert_errors: @allow_retry_insert_errors,
+        prevent_duplicate_load: @prevent_duplicate_load,
+        auto_create_table: @auto_create_table,
+        time_partitioning_type: @time_partitioning_type,
+        time_partitioning_expiration: @time_partitioning_expiration,
+        timeout_sec: @request_timeout_sec,
+        open_timeout_sec: @request_open_timeout_sec,
       })
     end

@@ -427,11 +437,11 @@ module Fluent
     end

     def insert(table_id, rows, template_suffix)
-      writer.insert_rows(@project, @dataset, table_id, rows,
+      writer.insert_rows(@project, @dataset, table_id, rows, template_suffix: template_suffix)
     rescue Fluent::BigQuery::Error => e
       if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
         # Table Not Found: Auto Create Table
-        writer.create_table(@project, @dataset, table_id, @fields
+        writer.create_table(@project, @dataset, table_id, @fields)
         raise "table created. send rows next time."
       end

@@ -473,12 +483,7 @@ module Fluent
       res = nil

       create_upload_source(chunk) do |upload_source|
-        res = writer.create_load_job(chunk.unique_id, @project, @dataset, table_id, upload_source, @fields
-          prevent_duplicate_load: @prevent_duplicate_load,
-          ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
-          timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec, auto_create_table: @auto_create_table,
-          time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration
-        })
+        res = writer.create_load_job(chunk.unique_id, @project, @dataset, table_id, upload_source, @fields)
       end
     rescue Fluent::BigQuery::Error => e
       if e.retryable?
@@ -754,10 +754,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
     driver = create_driver

     writer = stub_writer(driver)
-    mock.proxy(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', entry,
-      skip_invalid_rows: false,
-      ignore_unknown_values: false
-    ))
+    mock.proxy(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', entry, template_suffix: nil)
     mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', {
       rows: entry,
       skip_invalid_rows: false,
@@ -780,62 +777,71 @@ class BigQueryOutputTest < Test::Unit::TestCase

   def test_write_with_retryable_error
     entry = {json: {a: "b"}}, {json: {b: "c"}}
-
-
-
-
-
-
-
-      time_format %s
-      time_field  time
-
-      schema [
-        {"name": "time", "type": "INTEGER"},
-        {"name": "status", "type": "INTEGER"},
-        {"name": "bytes", "type": "INTEGER"},
-        {"name": "vhost", "type": "STRING"},
-        {"name": "path", "type": "STRING"},
-        {"name": "method", "type": "STRING"},
-        {"name": "protocol", "type": "STRING"},
-        {"name": "agent", "type": "STRING"},
-        {"name": "referer", "type": "STRING"},
-        {"name": "remote", "type": "RECORD", "fields": [
-          {"name": "host", "type": "STRING"},
-          {"name": "ip", "type": "STRING"},
-          {"name": "user", "type": "STRING"}
-        ]},
-        {"name": "requesttime", "type": "FLOAT"},
-        {"name": "bot_access", "type": "BOOLEAN"},
-        {"name": "loginsession", "type": "BOOLEAN"}
-      ]
-      <secondary>
-        type file
-        path error
-        utc
-      </secondary>
-    CONFIG
+    data_input = [
+      { "status_code" => 500 },
+      { "status_code" => 502 },
+      { "status_code" => 503 },
+      { "status_code" => 504 },
+    ]

-
-
-
-
-
-
-
-        raise ex
-      end
+    data_input.each do |d|
+      driver = create_driver(<<-CONFIG)
+        table foo
+        email foo@bar.example
+        private_key_path /path/to/key
+        project yourproject_id
+        dataset yourdataset_id

-
-
-        chunk << e.to_msgpack
-      end
+        time_format %s
+        time_field  time

-
-
-
-
-
+        schema [
+          {"name": "time", "type": "INTEGER"},
+          {"name": "status", "type": "INTEGER"},
+          {"name": "bytes", "type": "INTEGER"},
+          {"name": "vhost", "type": "STRING"},
+          {"name": "path", "type": "STRING"},
+          {"name": "method", "type": "STRING"},
+          {"name": "protocol", "type": "STRING"},
+          {"name": "agent", "type": "STRING"},
+          {"name": "referer", "type": "STRING"},
+          {"name": "remote", "type": "RECORD", "fields": [
+            {"name": "host", "type": "STRING"},
+            {"name": "ip", "type": "STRING"},
+            {"name": "user", "type": "STRING"}
+          ]},
+          {"name": "requesttime", "type": "FLOAT"},
+          {"name": "bot_access", "type": "BOOLEAN"},
+          {"name": "loginsession", "type": "BOOLEAN"}
+        ]
+        <secondary>
+          type file
+          path error
+          utc
+        </secondary>
+      CONFIG
+
+      writer = stub_writer(driver)
+      mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', {
+        rows: entry,
+        skip_invalid_rows: false,
+        ignore_unknown_values: false
+      }, {options: {timeout_sec: nil, open_timeout_sec: 60}}) do
+        ex = Google::Apis::ServerError.new("error", status_code: d["status_code"])
+        raise ex
+      end
+
+      chunk = Fluent::MemoryBufferChunk.new("my.tag")
+      entry.each do |e|
+        chunk << e.to_msgpack
+      end
+
+      driver.instance.start
+      assert_raise Fluent::BigQuery::RetryableError do
+        driver.instance.write(chunk)
+      end
+      driver.instance.shutdown
+    end
   end

   def test_write_with_not_retryable_error
@@ -1455,11 +1461,8 @@ class BigQueryOutputTest < Test::Unit::TestCase
     schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
     CONFIG
     writer = stub_writer(driver)
-    mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message],
-
-      ignore_unknown_values: false,
-    )) { raise Fluent::BigQuery::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
-    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: nil, time_partitioning_expiration: nil)
+    mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], template_suffix: nil) { raise Fluent::BigQuery::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))

     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     chunk << message.to_msgpack
@@ -1517,11 +1520,8 @@ class BigQueryOutputTest < Test::Unit::TestCase
     time_partitioning_expiration 1h
     CONFIG
     writer = stub_writer(driver)
-    mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message],
-
-      ignore_unknown_values: false,
-    )) { raise Fluent::BigQuery::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
-    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: :day, time_partitioning_expiration: 3600)
+    mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], template_suffix: nil) { raise Fluent::BigQuery::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))

     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     chunk << message.to_msgpack
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.4.2
 platform: ruby
 authors:
 - Naoya Ito
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-03-
+date: 2017-03-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake