fluent-plugin-bigquery 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -5
- data/lib/fluent/plugin/bigquery/schema.rb +11 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +16 -2
- data/lib/fluent/plugin/out_bigquery_base.rb +8 -2
- data/test/plugin/test_out_bigquery_insert.rb +90 -1
- data/test/plugin/test_out_bigquery_load.rb +2 -2
- data/test/plugin/test_record_schema.rb +17 -2
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 36b950bf0783d3ce350d7c7514f5b7946b10fe4b867aec015c9331656e86eb48
|
4
|
+
data.tar.gz: b4b8e92f41008043b09822b20698a7e29ca8daf9ba69c2a5c38c696553e86d71
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 01d3d39d9247134ca9059b990d0d6a52f308b27711d8cd989de30dfeb4e91a1673f1047d4e9269d24447169d9ec4bbac1d0d9b9f7d93b08b7be5d6c170593f1f
|
7
|
+
data.tar.gz: f226de7925fb048ba5533bf9b7c626f43e4b63eeb92c119d700737d1ae44611fb6fe6294e1ed5f989456de2ee3e1f98334c2d4cd1d89c49b52ef945a3674c8ce
|
data/README.md
CHANGED
@@ -44,6 +44,7 @@ Because embbeded gem dependency sometimes restricts ruby environment.
|
|
44
44
|
| private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
|
45
45
|
| private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
|
46
46
|
| json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
|
47
|
+
| location | string | no | no | nil | BigQuery Data Location. The geographic location of the job. Required except for US and EU. |
|
47
48
|
| project | string | yes | yes | nil | |
|
48
49
|
| dataset | string | yes | yes | nil | |
|
49
50
|
| table | string | yes (either `tables`) | yes | nil | |
|
@@ -57,10 +58,10 @@ Because embbeded gem dependency sometimes restricts ruby environment.
|
|
57
58
|
| schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
|
58
59
|
| request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
|
59
60
|
| request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
|
60
|
-
| time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature
|
61
|
-
| time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition
|
62
|
-
| time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning.
|
63
|
-
|
|
61
|
+
| time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature. |
|
62
|
+
| time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition. |
|
63
|
+
| time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. |
|
64
|
+
| clustering_fields | array(string) | no | no | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
|
64
65
|
|
65
66
|
#### bigquery_insert
|
66
67
|
|
@@ -433,7 +434,7 @@ Use placeholder.
|
|
433
434
|
|
434
435
|
```apache
|
435
436
|
<match dummy>
|
436
|
-
@type
|
437
|
+
@type bigquery_load
|
437
438
|
|
438
439
|
...
|
439
440
|
table accesslog$%Y%m%d
|
@@ -446,6 +447,8 @@ Use placeholder.
|
|
446
447
|
```
|
447
448
|
|
448
449
|
But, Dynamic table creating doesn't support date partitioned table yet.
|
450
|
+
And streaming insert is not allowed to insert with `$%Y%m%d` suffix.
|
451
|
+
If you use date partitioned table with streaming insert, Please omit `$%Y%m%d` suffix from `table`.
|
449
452
|
|
450
453
|
### Dynamic table creating
|
451
454
|
|
@@ -467,6 +470,8 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
|
|
467
470
|
</match>
|
468
471
|
```
|
469
472
|
|
473
|
+
Also, you can create clustered table by using `clustering_fields`.
|
474
|
+
|
470
475
|
### Table schema
|
471
476
|
|
472
477
|
There are three methods to describe the schema of the target table.
|
data/lib/fluent/plugin/bigquery/schema.rb
CHANGED
@@ -86,6 +86,16 @@ module Fluent
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
+
class NumericFieldSchema < FieldSchema
|
90
|
+
def type
|
91
|
+
:numeric
|
92
|
+
end
|
93
|
+
|
94
|
+
def format_one(value)
|
95
|
+
value.to_s
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
89
99
|
class BooleanFieldSchema < FieldSchema
|
90
100
|
def type
|
91
101
|
:boolean
|
@@ -169,6 +179,7 @@ module Fluent
|
|
169
179
|
string: StringFieldSchema,
|
170
180
|
integer: IntegerFieldSchema,
|
171
181
|
float: FloatFieldSchema,
|
182
|
+
numeric: NumericFieldSchema,
|
172
183
|
boolean: BooleanFieldSchema,
|
173
184
|
timestamp: TimestampFieldSchema,
|
174
185
|
date: DateFieldSchema,
|
data/lib/fluent/plugin/bigquery/writer.rb
CHANGED
@@ -35,6 +35,7 @@ module Fluent
|
|
35
35
|
}
|
36
36
|
|
37
37
|
definition.merge!(time_partitioning: time_partitioning) if time_partitioning
|
38
|
+
definition.merge!(clustering: clustering) if clustering
|
38
39
|
client.insert_table(project, dataset, definition, {})
|
39
40
|
log.debug "create table", project_id: project, dataset: dataset, table: table_id
|
40
41
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
@@ -149,6 +150,7 @@ module Fluent
|
|
149
150
|
raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
|
150
151
|
configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
|
151
152
|
configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
|
153
|
+
configuration[:configuration][:load].merge!(clustering: clustering) if clustering
|
152
154
|
end
|
153
155
|
end
|
154
156
|
|
@@ -174,8 +176,9 @@ module Fluent
|
|
174
176
|
def fetch_load_job(job_reference)
|
175
177
|
project = job_reference.project_id
|
176
178
|
job_id = job_reference.job_id
|
179
|
+
location = @options[:location]
|
177
180
|
|
178
|
-
res = client.get_job(project, job_id)
|
181
|
+
res = client.get_job(project, job_id, location: location)
|
179
182
|
log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
|
180
183
|
|
181
184
|
if res.status.state == "DONE"
|
@@ -309,13 +312,24 @@ module Fluent
|
|
309
312
|
type: @options[:time_partitioning_type].to_s.upcase,
|
310
313
|
field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
|
311
314
|
expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
|
312
|
-
require_partition_filter: @options[:time_partitioning_require_partition_filter],
|
313
315
|
}.reject { |_, v| v.nil? }
|
314
316
|
else
|
315
317
|
@time_partitioning
|
316
318
|
end
|
317
319
|
end
|
318
320
|
|
321
|
+
def clustering
|
322
|
+
return @clustering if instance_variable_defined?(:@clustering)
|
323
|
+
|
324
|
+
if @options[:clustering_fields]
|
325
|
+
@clustering = {
|
326
|
+
fields: @options[:clustering_fields]
|
327
|
+
}
|
328
|
+
else
|
329
|
+
@clustering
|
330
|
+
end
|
331
|
+
end
|
332
|
+
|
319
333
|
def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
|
320
334
|
try_count ||= 1
|
321
335
|
res = client.insert_all_table_data(project, dataset, table_id, body, {})
|
data/lib/fluent/plugin/out_bigquery_base.rb
CHANGED
@@ -29,6 +29,9 @@ module Fluent
|
|
29
29
|
config_param :private_key_path, :string, default: nil
|
30
30
|
config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
|
31
31
|
config_param :json_key, default: nil, secret: true
|
32
|
+
# The geographic location of the job. Required except for US and EU.
|
33
|
+
# https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
|
34
|
+
config_param :location, :string, default: nil
|
32
35
|
|
33
36
|
# see as simple reference
|
34
37
|
# https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
|
@@ -69,7 +72,9 @@ module Fluent
|
|
69
72
|
config_param :time_partitioning_type, :enum, list: [:day], default: nil
|
70
73
|
config_param :time_partitioning_field, :string, default: nil
|
71
74
|
config_param :time_partitioning_expiration, :time, default: nil
|
72
|
-
|
75
|
+
|
76
|
+
## Clustering
|
77
|
+
config_param :clustering_fields, :array, default: nil
|
73
78
|
|
74
79
|
## Formatter
|
75
80
|
config_section :format do
|
@@ -132,6 +137,7 @@ module Fluent
|
|
132
137
|
private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
|
133
138
|
email: @email,
|
134
139
|
json_key: @json_key,
|
140
|
+
location: @location,
|
135
141
|
source_format: @source_format,
|
136
142
|
skip_invalid_rows: @skip_invalid_rows,
|
137
143
|
ignore_unknown_values: @ignore_unknown_values,
|
@@ -142,7 +148,7 @@ module Fluent
|
|
142
148
|
time_partitioning_type: @time_partitioning_type,
|
143
149
|
time_partitioning_field: @time_partitioning_field,
|
144
150
|
time_partitioning_expiration: @time_partitioning_expiration,
|
145
|
-
|
151
|
+
clustering_fields: @clustering_fields,
|
146
152
|
timeout_sec: @request_timeout_sec,
|
147
153
|
open_timeout_sec: @request_open_timeout_sec,
|
148
154
|
})
|
data/test/plugin/test_out_bigquery_insert.rb
CHANGED
@@ -400,6 +400,85 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
|
|
400
400
|
}
|
401
401
|
}
|
402
402
|
|
403
|
+
driver = create_driver(<<-CONFIG)
|
404
|
+
table foo
|
405
|
+
email foo@bar.example
|
406
|
+
private_key_path /path/to/key
|
407
|
+
project yourproject_id
|
408
|
+
dataset yourdataset_id
|
409
|
+
|
410
|
+
time_format %s
|
411
|
+
time_field time
|
412
|
+
|
413
|
+
auto_create_table true
|
414
|
+
schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
|
415
|
+
|
416
|
+
time_partitioning_type day
|
417
|
+
time_partitioning_field time
|
418
|
+
time_partitioning_expiration 1h
|
419
|
+
CONFIG
|
420
|
+
|
421
|
+
stub_writer do |writer|
|
422
|
+
body = {
|
423
|
+
rows: [message],
|
424
|
+
skip_invalid_rows: false,
|
425
|
+
ignore_unknown_values: false,
|
426
|
+
}
|
427
|
+
mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
|
428
|
+
raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
|
429
|
+
end.at_least(1)
|
430
|
+
mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
|
431
|
+
|
432
|
+
mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
|
433
|
+
table_reference: {
|
434
|
+
table_id: 'foo',
|
435
|
+
},
|
436
|
+
schema: {
|
437
|
+
fields: driver.instance.instance_variable_get(:@table_schema).to_a,
|
438
|
+
},
|
439
|
+
time_partitioning: {
|
440
|
+
type: 'DAY',
|
441
|
+
field: 'time',
|
442
|
+
expiration_ms: 3600000,
|
443
|
+
},
|
444
|
+
}, {})
|
445
|
+
end
|
446
|
+
|
447
|
+
assert_raise(RuntimeError) do
|
448
|
+
driver.run do
|
449
|
+
driver.feed("tag", Fluent::EventTime.now, message[:json])
|
450
|
+
end
|
451
|
+
end
|
452
|
+
end
|
453
|
+
|
454
|
+
def test_auto_create_clustered_table_by_bigquery_api
|
455
|
+
now = Time.now
|
456
|
+
message = {
|
457
|
+
json: {
|
458
|
+
time: now.to_i,
|
459
|
+
request: {
|
460
|
+
vhost: "bar",
|
461
|
+
path: "/path/to/baz",
|
462
|
+
method: "GET",
|
463
|
+
protocol: "HTTP/1.0",
|
464
|
+
agent: "libwww",
|
465
|
+
referer: "http://referer.example",
|
466
|
+
time: (now - 1).to_f,
|
467
|
+
bot_access: true,
|
468
|
+
loginsession: false,
|
469
|
+
},
|
470
|
+
remote: {
|
471
|
+
host: "remote.example",
|
472
|
+
ip: "192.168.1.1",
|
473
|
+
user: "nagachika",
|
474
|
+
},
|
475
|
+
response: {
|
476
|
+
status: 200,
|
477
|
+
bytes: 72,
|
478
|
+
},
|
479
|
+
}
|
480
|
+
}
|
481
|
+
|
403
482
|
driver = create_driver(<<-CONFIG)
|
404
483
|
table foo
|
405
484
|
email foo@bar.example
|
@@ -417,6 +496,11 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
|
|
417
496
|
time_partitioning_field time
|
418
497
|
time_partitioning_expiration 1h
|
419
498
|
time_partitioning_require_partition_filter true
|
499
|
+
|
500
|
+
clustering_fields [
|
501
|
+
"time",
|
502
|
+
"vhost"
|
503
|
+
]
|
420
504
|
CONFIG
|
421
505
|
|
422
506
|
stub_writer do |writer|
|
@@ -441,7 +525,12 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
|
|
441
525
|
type: 'DAY',
|
442
526
|
field: 'time',
|
443
527
|
expiration_ms: 3600000,
|
444
|
-
|
528
|
+
},
|
529
|
+
clustering: {
|
530
|
+
fields: [
|
531
|
+
'time',
|
532
|
+
'vhost',
|
533
|
+
],
|
445
534
|
},
|
446
535
|
}, {})
|
447
536
|
end
|
data/test/plugin/test_out_bigquery_load.rb
CHANGED
@@ -158,7 +158,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
|
|
158
158
|
stub!.job_reference.stub!.job_id { "dummy_job_id" }
|
159
159
|
end
|
160
160
|
|
161
|
-
mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
|
161
|
+
mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
|
162
162
|
stub! do |s|
|
163
163
|
s.id { 'dummy_job_id' }
|
164
164
|
s.configuration.stub! do |_s|
|
@@ -241,7 +241,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
|
|
241
241
|
stub!.job_reference.stub!.job_id { "dummy_job_id" }
|
242
242
|
end
|
243
243
|
|
244
|
-
mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
|
244
|
+
mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
|
245
245
|
stub! do |s|
|
246
246
|
s.id { 'dummy_job_id' }
|
247
247
|
s.configuration.stub! do |_s|
|
data/test/plugin/test_record_schema.rb
CHANGED
@@ -27,6 +27,11 @@ class RecordSchemaTest < Test::Unit::TestCase
|
|
27
27
|
"name" => "argv",
|
28
28
|
"type" => "STRING",
|
29
29
|
"mode" => "REPEATED"
|
30
|
+
},
|
31
|
+
{
|
32
|
+
"name" => "utilisation",
|
33
|
+
"type" => "NUMERIC",
|
34
|
+
"mode" => "NULLABLE"
|
30
35
|
}
|
31
36
|
]
|
32
37
|
end
|
@@ -58,6 +63,11 @@ class RecordSchemaTest < Test::Unit::TestCase
|
|
58
63
|
"type" => "STRING",
|
59
64
|
"mode" => "REPEATED"
|
60
65
|
},
|
66
|
+
{
|
67
|
+
"name" => "utilisation",
|
68
|
+
"type" => "NUMERIC",
|
69
|
+
"mode" => "NULLABLE"
|
70
|
+
},
|
61
71
|
{
|
62
72
|
"name" => "new_column",
|
63
73
|
"type" => "STRING",
|
@@ -93,6 +103,11 @@ class RecordSchemaTest < Test::Unit::TestCase
|
|
93
103
|
"type" => "STRING",
|
94
104
|
"mode" => "REPEATED"
|
95
105
|
},
|
106
|
+
{
|
107
|
+
"name" => "utilisation",
|
108
|
+
"type" => "NUMERIC",
|
109
|
+
"mode" => "NULLABLE"
|
110
|
+
}
|
96
111
|
]
|
97
112
|
end
|
98
113
|
|
@@ -142,12 +157,12 @@ class RecordSchemaTest < Test::Unit::TestCase
|
|
142
157
|
time = Time.local(2016, 2, 7, 19, 0, 0).utc
|
143
158
|
|
144
159
|
formatted = fields.format_one({
|
145
|
-
"time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42]
|
160
|
+
"time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42], "utilisation" => "0.837"
|
146
161
|
})
|
147
162
|
assert_equal(
|
148
163
|
formatted,
|
149
164
|
{
|
150
|
-
"time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"]
|
165
|
+
"time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"], "utilisation" => "0.837"
|
151
166
|
}
|
152
167
|
)
|
153
168
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naoya Ito
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2019-08-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -183,8 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
183
183
|
- !ruby/object:Gem::Version
|
184
184
|
version: '0'
|
185
185
|
requirements: []
|
186
|
-
|
187
|
-
rubygems_version: 2.7.7
|
186
|
+
rubygems_version: 3.0.3
|
188
187
|
signing_key:
|
189
188
|
specification_version: 4
|
190
189
|
summary: Fluentd plugin to store data on Google BigQuery
|