fluent-plugin-bigquery 2.1.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +10 -5
- data/lib/fluent/plugin/bigquery/schema.rb +11 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +16 -2
- data/lib/fluent/plugin/out_bigquery_base.rb +8 -2
- data/test/plugin/test_out_bigquery_insert.rb +90 -1
- data/test/plugin/test_out_bigquery_load.rb +2 -2
- data/test/plugin/test_record_schema.rb +17 -2
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 36b950bf0783d3ce350d7c7514f5b7946b10fe4b867aec015c9331656e86eb48
|
4
|
+
data.tar.gz: b4b8e92f41008043b09822b20698a7e29ca8daf9ba69c2a5c38c696553e86d71
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 01d3d39d9247134ca9059b990d0d6a52f308b27711d8cd989de30dfeb4e91a1673f1047d4e9269d24447169d9ec4bbac1d0d9b9f7d93b08b7be5d6c170593f1f
|
7
|
+
data.tar.gz: f226de7925fb048ba5533bf9b7c626f43e4b63eeb92c119d700737d1ae44611fb6fe6294e1ed5f989456de2ee3e1f98334c2d4cd1d89c49b52ef945a3674c8ce
|
data/README.md
CHANGED
@@ -44,6 +44,7 @@ Because embedded gem dependency sometimes restricts ruby environment.
|
|
44
44
|
| private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
|
45
45
|
| private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
|
46
46
|
| json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
|
47
|
+
| location | string | no | no | nil | BigQuery Data Location. The geographic location of the job. Required except for US and EU. |
|
47
48
|
| project | string | yes | yes | nil | |
|
48
49
|
| dataset | string | yes | yes | nil | |
|
49
50
|
| table | string | yes (either `tables`) | yes | nil | |
|
@@ -57,10 +58,10 @@ Because embedded gem dependency sometimes restricts ruby environment.
|
|
57
58
|
| schema_cache_expire | integer | no | no | 600 | Value is in seconds. If the current time is past the expiration interval, the table schema definition is re-fetched. |
|
58
59
|
| request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
|
59
60
|
| request_open_timeout_sec | integer | no | no | 60 | BigQuery API connection and request timeout. If you send big data to BigQuery, set a large value. |
|
60
|
-
| time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature
|
61
|
-
| time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition
|
62
|
-
| time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning.
|
63
|
-
|
|
61
|
+
| time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature. |
|
62
|
+
| time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition. |
|
63
|
+
| time_partitioning_expiration | time | no | no | nil | Expiration for BigQuery time partitioning, given as a time value (e.g. `1h`); it is converted to milliseconds when sent to BigQuery. |
|
64
|
+
| clustering_fields | array(string) | no | no | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
|
64
65
|
|
65
66
|
#### bigquery_insert
|
66
67
|
|
@@ -433,7 +434,7 @@ Use placeholder.
|
|
433
434
|
|
434
435
|
```apache
|
435
436
|
<match dummy>
|
436
|
-
@type
|
437
|
+
@type bigquery_load
|
437
438
|
|
438
439
|
...
|
439
440
|
table accesslog$%Y%m%d
|
@@ -446,6 +447,8 @@ Use placeholder.
|
|
446
447
|
```
|
447
448
|
|
448
449
|
However, dynamic table creation doesn't support date-partitioned tables yet.
|
450
|
+
Also, streaming insert is not allowed to insert into a table specified with the `$%Y%m%d` suffix.
|
451
|
+
If you use a date-partitioned table with streaming insert, please omit the `$%Y%m%d` suffix from `table`.
|
449
452
|
|
450
453
|
### Dynamic table creating
|
451
454
|
|
@@ -467,6 +470,8 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
|
|
467
470
|
</match>
|
468
471
|
```
|
469
472
|
|
473
|
+
You can also create a clustered table by using `clustering_fields`.
|
474
|
+
|
470
475
|
### Table schema
|
471
476
|
|
472
477
|
There are three methods to describe the schema of the target table.
|
@@ -86,6 +86,16 @@ module Fluent
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
+
class NumericFieldSchema < FieldSchema
|
90
|
+
def type
|
91
|
+
:numeric
|
92
|
+
end
|
93
|
+
|
94
|
+
def format_one(value)
|
95
|
+
value.to_s
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
89
99
|
class BooleanFieldSchema < FieldSchema
|
90
100
|
def type
|
91
101
|
:boolean
|
@@ -169,6 +179,7 @@ module Fluent
|
|
169
179
|
string: StringFieldSchema,
|
170
180
|
integer: IntegerFieldSchema,
|
171
181
|
float: FloatFieldSchema,
|
182
|
+
numeric: NumericFieldSchema,
|
172
183
|
boolean: BooleanFieldSchema,
|
173
184
|
timestamp: TimestampFieldSchema,
|
174
185
|
date: DateFieldSchema,
|
@@ -35,6 +35,7 @@ module Fluent
|
|
35
35
|
}
|
36
36
|
|
37
37
|
definition.merge!(time_partitioning: time_partitioning) if time_partitioning
|
38
|
+
definition.merge!(clustering: clustering) if clustering
|
38
39
|
client.insert_table(project, dataset, definition, {})
|
39
40
|
log.debug "create table", project_id: project, dataset: dataset, table: table_id
|
40
41
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
@@ -149,6 +150,7 @@ module Fluent
|
|
149
150
|
raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
|
150
151
|
configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
|
151
152
|
configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
|
153
|
+
configuration[:configuration][:load].merge!(clustering: clustering) if clustering
|
152
154
|
end
|
153
155
|
end
|
154
156
|
|
@@ -174,8 +176,9 @@ module Fluent
|
|
174
176
|
def fetch_load_job(job_reference)
|
175
177
|
project = job_reference.project_id
|
176
178
|
job_id = job_reference.job_id
|
179
|
+
location = @options[:location]
|
177
180
|
|
178
|
-
res = client.get_job(project, job_id)
|
181
|
+
res = client.get_job(project, job_id, location: location)
|
179
182
|
log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
|
180
183
|
|
181
184
|
if res.status.state == "DONE"
|
@@ -309,13 +312,24 @@ module Fluent
|
|
309
312
|
type: @options[:time_partitioning_type].to_s.upcase,
|
310
313
|
field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
|
311
314
|
expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
|
312
|
-
require_partition_filter: @options[:time_partitioning_require_partition_filter],
|
313
315
|
}.reject { |_, v| v.nil? }
|
314
316
|
else
|
315
317
|
@time_partitioning
|
316
318
|
end
|
317
319
|
end
|
318
320
|
|
321
|
+
def clustering
|
322
|
+
return @clustering if instance_variable_defined?(:@clustering)
|
323
|
+
|
324
|
+
if @options[:clustering_fields]
|
325
|
+
@clustering = {
|
326
|
+
fields: @options[:clustering_fields]
|
327
|
+
}
|
328
|
+
else
|
329
|
+
@clustering
|
330
|
+
end
|
331
|
+
end
|
332
|
+
|
319
333
|
def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
|
320
334
|
try_count ||= 1
|
321
335
|
res = client.insert_all_table_data(project, dataset, table_id, body, {})
|
@@ -29,6 +29,9 @@ module Fluent
|
|
29
29
|
config_param :private_key_path, :string, default: nil
|
30
30
|
config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
|
31
31
|
config_param :json_key, default: nil, secret: true
|
32
|
+
# The geographic location of the job. Required except for US and EU.
|
33
|
+
# https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
|
34
|
+
config_param :location, :string, default: nil
|
32
35
|
|
33
36
|
# see as simple reference
|
34
37
|
# https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
|
@@ -69,7 +72,9 @@ module Fluent
|
|
69
72
|
config_param :time_partitioning_type, :enum, list: [:day], default: nil
|
70
73
|
config_param :time_partitioning_field, :string, default: nil
|
71
74
|
config_param :time_partitioning_expiration, :time, default: nil
|
72
|
-
|
75
|
+
|
76
|
+
## Clustering
|
77
|
+
config_param :clustering_fields, :array, default: nil
|
73
78
|
|
74
79
|
## Formatter
|
75
80
|
config_section :format do
|
@@ -132,6 +137,7 @@ module Fluent
|
|
132
137
|
private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
|
133
138
|
email: @email,
|
134
139
|
json_key: @json_key,
|
140
|
+
location: @location,
|
135
141
|
source_format: @source_format,
|
136
142
|
skip_invalid_rows: @skip_invalid_rows,
|
137
143
|
ignore_unknown_values: @ignore_unknown_values,
|
@@ -142,7 +148,7 @@ module Fluent
|
|
142
148
|
time_partitioning_type: @time_partitioning_type,
|
143
149
|
time_partitioning_field: @time_partitioning_field,
|
144
150
|
time_partitioning_expiration: @time_partitioning_expiration,
|
145
|
-
|
151
|
+
clustering_fields: @clustering_fields,
|
146
152
|
timeout_sec: @request_timeout_sec,
|
147
153
|
open_timeout_sec: @request_open_timeout_sec,
|
148
154
|
})
|
@@ -400,6 +400,85 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
|
|
400
400
|
}
|
401
401
|
}
|
402
402
|
|
403
|
+
driver = create_driver(<<-CONFIG)
|
404
|
+
table foo
|
405
|
+
email foo@bar.example
|
406
|
+
private_key_path /path/to/key
|
407
|
+
project yourproject_id
|
408
|
+
dataset yourdataset_id
|
409
|
+
|
410
|
+
time_format %s
|
411
|
+
time_field time
|
412
|
+
|
413
|
+
auto_create_table true
|
414
|
+
schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
|
415
|
+
|
416
|
+
time_partitioning_type day
|
417
|
+
time_partitioning_field time
|
418
|
+
time_partitioning_expiration 1h
|
419
|
+
CONFIG
|
420
|
+
|
421
|
+
stub_writer do |writer|
|
422
|
+
body = {
|
423
|
+
rows: [message],
|
424
|
+
skip_invalid_rows: false,
|
425
|
+
ignore_unknown_values: false,
|
426
|
+
}
|
427
|
+
mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
|
428
|
+
raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
|
429
|
+
end.at_least(1)
|
430
|
+
mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
|
431
|
+
|
432
|
+
mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
|
433
|
+
table_reference: {
|
434
|
+
table_id: 'foo',
|
435
|
+
},
|
436
|
+
schema: {
|
437
|
+
fields: driver.instance.instance_variable_get(:@table_schema).to_a,
|
438
|
+
},
|
439
|
+
time_partitioning: {
|
440
|
+
type: 'DAY',
|
441
|
+
field: 'time',
|
442
|
+
expiration_ms: 3600000,
|
443
|
+
},
|
444
|
+
}, {})
|
445
|
+
end
|
446
|
+
|
447
|
+
assert_raise(RuntimeError) do
|
448
|
+
driver.run do
|
449
|
+
driver.feed("tag", Fluent::EventTime.now, message[:json])
|
450
|
+
end
|
451
|
+
end
|
452
|
+
end
|
453
|
+
|
454
|
+
def test_auto_create_clustered_table_by_bigquery_api
|
455
|
+
now = Time.now
|
456
|
+
message = {
|
457
|
+
json: {
|
458
|
+
time: now.to_i,
|
459
|
+
request: {
|
460
|
+
vhost: "bar",
|
461
|
+
path: "/path/to/baz",
|
462
|
+
method: "GET",
|
463
|
+
protocol: "HTTP/1.0",
|
464
|
+
agent: "libwww",
|
465
|
+
referer: "http://referer.example",
|
466
|
+
time: (now - 1).to_f,
|
467
|
+
bot_access: true,
|
468
|
+
loginsession: false,
|
469
|
+
},
|
470
|
+
remote: {
|
471
|
+
host: "remote.example",
|
472
|
+
ip: "192.168.1.1",
|
473
|
+
user: "nagachika",
|
474
|
+
},
|
475
|
+
response: {
|
476
|
+
status: 200,
|
477
|
+
bytes: 72,
|
478
|
+
},
|
479
|
+
}
|
480
|
+
}
|
481
|
+
|
403
482
|
driver = create_driver(<<-CONFIG)
|
404
483
|
table foo
|
405
484
|
email foo@bar.example
|
@@ -417,6 +496,11 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
|
|
417
496
|
time_partitioning_field time
|
418
497
|
time_partitioning_expiration 1h
|
419
498
|
time_partitioning_require_partition_filter true
|
499
|
+
|
500
|
+
clustering_fields [
|
501
|
+
"time",
|
502
|
+
"vhost"
|
503
|
+
]
|
420
504
|
CONFIG
|
421
505
|
|
422
506
|
stub_writer do |writer|
|
@@ -441,7 +525,12 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
|
|
441
525
|
type: 'DAY',
|
442
526
|
field: 'time',
|
443
527
|
expiration_ms: 3600000,
|
444
|
-
|
528
|
+
},
|
529
|
+
clustering: {
|
530
|
+
fields: [
|
531
|
+
'time',
|
532
|
+
'vhost',
|
533
|
+
],
|
445
534
|
},
|
446
535
|
}, {})
|
447
536
|
end
|
@@ -158,7 +158,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
|
|
158
158
|
stub!.job_reference.stub!.job_id { "dummy_job_id" }
|
159
159
|
end
|
160
160
|
|
161
|
-
mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
|
161
|
+
mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
|
162
162
|
stub! do |s|
|
163
163
|
s.id { 'dummy_job_id' }
|
164
164
|
s.configuration.stub! do |_s|
|
@@ -241,7 +241,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
|
|
241
241
|
stub!.job_reference.stub!.job_id { "dummy_job_id" }
|
242
242
|
end
|
243
243
|
|
244
|
-
mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
|
244
|
+
mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
|
245
245
|
stub! do |s|
|
246
246
|
s.id { 'dummy_job_id' }
|
247
247
|
s.configuration.stub! do |_s|
|
@@ -27,6 +27,11 @@ class RecordSchemaTest < Test::Unit::TestCase
|
|
27
27
|
"name" => "argv",
|
28
28
|
"type" => "STRING",
|
29
29
|
"mode" => "REPEATED"
|
30
|
+
},
|
31
|
+
{
|
32
|
+
"name" => "utilisation",
|
33
|
+
"type" => "NUMERIC",
|
34
|
+
"mode" => "NULLABLE"
|
30
35
|
}
|
31
36
|
]
|
32
37
|
end
|
@@ -58,6 +63,11 @@ class RecordSchemaTest < Test::Unit::TestCase
|
|
58
63
|
"type" => "STRING",
|
59
64
|
"mode" => "REPEATED"
|
60
65
|
},
|
66
|
+
{
|
67
|
+
"name" => "utilisation",
|
68
|
+
"type" => "NUMERIC",
|
69
|
+
"mode" => "NULLABLE"
|
70
|
+
},
|
61
71
|
{
|
62
72
|
"name" => "new_column",
|
63
73
|
"type" => "STRING",
|
@@ -93,6 +103,11 @@ class RecordSchemaTest < Test::Unit::TestCase
|
|
93
103
|
"type" => "STRING",
|
94
104
|
"mode" => "REPEATED"
|
95
105
|
},
|
106
|
+
{
|
107
|
+
"name" => "utilisation",
|
108
|
+
"type" => "NUMERIC",
|
109
|
+
"mode" => "NULLABLE"
|
110
|
+
}
|
96
111
|
]
|
97
112
|
end
|
98
113
|
|
@@ -142,12 +157,12 @@ class RecordSchemaTest < Test::Unit::TestCase
|
|
142
157
|
time = Time.local(2016, 2, 7, 19, 0, 0).utc
|
143
158
|
|
144
159
|
formatted = fields.format_one({
|
145
|
-
"time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42]
|
160
|
+
"time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42], "utilisation" => "0.837"
|
146
161
|
})
|
147
162
|
assert_equal(
|
148
163
|
formatted,
|
149
164
|
{
|
150
|
-
"time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"]
|
165
|
+
"time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"], "utilisation" => "0.837"
|
151
166
|
}
|
152
167
|
)
|
153
168
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fluent-plugin-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Naoya Ito
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2019-08-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -183,8 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
183
183
|
- !ruby/object:Gem::Version
|
184
184
|
version: '0'
|
185
185
|
requirements: []
|
186
|
-
|
187
|
-
rubygems_version: 2.7.7
|
186
|
+
rubygems_version: 3.0.3
|
188
187
|
signing_key:
|
189
188
|
specification_version: 4
|
190
189
|
summary: Fluentd plugin to store data on Google BigQuery
|