fluent-plugin-bigquery 2.2.0 → 2.3.0
This diff compares the contents of two publicly released versions of the package, as published to their respective registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.github/workflows/linux.yml +31 -0
- data/.github/workflows/windows.yml +27 -0
- data/README.md +10 -1
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +11 -0
- data/lib/fluent/plugin/out_bigquery_base.rb +21 -3
- data/lib/fluent/plugin/out_bigquery_insert.rb +3 -0
- data/test/plugin/test_out_bigquery_base.rb +22 -27
- data/test/plugin/test_out_bigquery_insert.rb +15 -5
- data/test/plugin/test_out_bigquery_load.rb +4 -4
- metadata +5 -4
- data/.travis.yml +0 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4209a2b6eaaf0b6f8ba315b6f5de6690e28fb47890aeea777bdb31889e4785ab
+  data.tar.gz: b0983fb4fa16d72059b0e679ea4ee627d19e805779fa010888fa1723354896a5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a6fc6891eda12bbc1272af7af9c4e8d48e588bc7ef65153b3a7524e39468baebb8fdb925856d1850bbda12fed5d33865faa56542503f76fdf724a18937c7d56e
+  data.tar.gz: fff0599b6a838cb4ff233ba9585b558ff733eed8063c1cf36ee08aaacb9b3c2ca1bce4d13db2a51ecc72c398ba751a18b2856a6348f43738ee8ca366becdea61
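
For readers who want to verify downloaded artifacts against these digests: a minimal sketch using Ruby's standard library, assuming metadata.gz and data.tar.gz have already been extracted from the .gem archive (a gem is a tar containing both).

    # Recompute the SHA256 digests recorded in checksums.yaml and print
    # them for comparison with the values above.
    require "digest"

    %w[metadata.gz data.tar.gz].each do |name|
      puts "#{name}: #{Digest::SHA256.file(name).hexdigest}"
    end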
data/.github/workflows/linux.yml
ADDED
@@ -0,0 +1,31 @@
+name: Testing on Ubuntu
+on:
+  - push
+  - pull_request
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - 2.6
+          - 2.7
+          - 3.0
+          - 3.1
+        os:
+          - ubuntu-latest
+    name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v2
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: unit testing
+        env:
+          CI: true
+        run: |
+          ruby -v
+          gem install bundler rake
+          bundle install --jobs 4 --retry 3
+          bundle exec rake test
data/.github/workflows/windows.yml
ADDED
@@ -0,0 +1,27 @@
+name: Testing on Windows
+on:
+  - push
+  - pull_request
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby: [ '2.6', '2.7', '3.0', '3.1' ]
+        os:
+          - windows-latest
+    name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v2
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: unit testing
+        env:
+          CI: true
+        run: |
+          ruby -v
+          gem install bundler rake
+          bundle install --jobs 4 --retry 3
+          bundle exec rake test
data/README.md
CHANGED
@@ -1,5 +1,13 @@
 # fluent-plugin-bigquery
 
+## Notice
+
+We will transfer fluent-plugin-bigquery repository to [fluent-plugins-nursery](https://github.com/fluent-plugins-nursery) organization.
+It does not change maintenance plan.
+The main purpose is that it solves mismatch between maintainers and current organization.
+
+---
+
 [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery.
 
 - **Plugin type**: Output
@@ -52,7 +60,7 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 | auto_create_table | bool | no | no | false | If true, creates table automatically |
 | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
 | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. |
-| schema_path | string | yes (either `fetch_schema`) |
+| schema_path | string | yes (either `fetch_schema`) | yes | nil | Schema Definition file path. It is formatted by JSON. |
 | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. |
 | fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored |
 | schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
@@ -72,6 +80,7 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. see. https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor |
 | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
 | allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
+| require_partition_filter | bool | no | no | false | If true, queries over this table require a partition filter that can be used for partition elimination to be specified. |
 
 #### bigquery_load
 
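The two documented additions above are wired up in the source and test changes that follow. A minimal, illustrative config sketch combining them, written in the heredoc style the tests use (the project, dataset, key path, and schema file names are placeholders, not taken from this diff):

    # Illustrative only: ${tag} in schema_path is resolved per chunk, so tag
    # "foo" loads "foo.schema"; require_partition_filter takes effect when
    # the plugin auto-creates the table.
    config = <<-CONFIG
      table foo_%Y%m%d
      auth_method json_key
      json_key /path/to/key.json
      project yourproject_id
      dataset yourdataset_id
      auto_create_table true
      schema_path ${tag}.schema
      require_partition_filter true

      <buffer tag, time>
        timekey 1d
      </buffer>
    CONFIG
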
data/lib/fluent/plugin/bigquery/writer.rb
CHANGED
@@ -35,6 +35,7 @@ module Fluent
         }
 
         definition.merge!(time_partitioning: time_partitioning) if time_partitioning
+        definition.merge!(require_partition_filter: require_partition_filter) if require_partition_filter
         definition.merge!(clustering: clustering) if clustering
         client.insert_table(project, dataset, definition, {})
         log.debug "create table", project_id: project, dataset: dataset, table: table_id
@@ -318,6 +319,16 @@ module Fluent
         end
       end
 
+      def require_partition_filter
+        return @require_partition_filter if instance_variable_defined?(:@require_partition_filter)
+
+        if @options[:require_partition_filter]
+          @require_partition_filter = @options[:require_partition_filter]
+        else
+          @require_partition_filter
+        end
+      end
+
       def clustering
         return @clustering if instance_variable_defined?(:@clustering)
 
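The new accessor reuses the guard idiom of the clustering method below it: checking instance_variable_defined? instead of `@x ||= ...` is the usual way to memoize a result that may itself be falsy. A distilled standalone sketch of the idiom (not plugin code):

    # `@flag ||= compute` would re-run compute whenever the cached value is
    # false or nil; the defined? guard consults @options at most once.
    class Options
      def initialize(options)
        @options = options
      end

      def flag
        return @flag if instance_variable_defined?(:@flag)

        @flag = @options[:flag]
      end
    end

    opts = Options.new(flag: false)
    opts.flag # => false, computed and cached
    opts.flag # => false, served from @flag without re-reading @options
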
@@ -111,9 +111,6 @@ module Fluent
|
|
111
111
|
if @schema
|
112
112
|
@table_schema.load_schema(@schema)
|
113
113
|
end
|
114
|
-
if @schema_path
|
115
|
-
@table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
|
116
|
-
end
|
117
114
|
|
118
115
|
formatter_config = conf.elements("format")[0]
|
119
116
|
@formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
|
@@ -126,6 +123,7 @@ module Fluent
         @tables_mutex = Mutex.new
         @fetched_schemas = {}
         @last_fetch_schema_time = Hash.new(0)
+        @read_schemas = {}
       end
 
       def multi_workers_ready?
@@ -148,6 +146,7 @@ module Fluent
           time_partitioning_type: @time_partitioning_type,
           time_partitioning_field: @time_partitioning_field,
           time_partitioning_expiration: @time_partitioning_expiration,
+          require_partition_filter: @require_partition_filter,
           clustering_fields: @clustering_fields,
           timeout_sec: @request_timeout_sec,
           open_timeout_sec: @request_open_timeout_sec,
@@ -161,6 +160,8 @@ module Fluent
         schema =
           if @fetch_schema
             fetch_schema(meta)
+          elsif @schema_path
+            read_schema(meta)
           else
             @table_schema
           end
@@ -209,9 +210,26 @@ module Fluent
         extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
       end
 
+      def read_schema(metadata)
+        schema_path = read_schema_target_path(metadata)
+
+        unless @read_schemas[schema_path]
+          table_schema = Fluent::BigQuery::RecordSchema.new("record")
+          table_schema.load_schema(MultiJson.load(File.read(schema_path)))
+          @read_schemas[schema_path] = table_schema
+        end
+        @read_schemas[schema_path]
+      end
+
+      def read_schema_target_path(metadata)
+        extract_placeholders(@schema_path, metadata)
+      end
+
       def get_schema(project, dataset, metadata)
         if @fetch_schema
           @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+        elsif @schema_path
+          @read_schemas[read_schema_target_path(metadata)] || read_schema(metadata)
         else
           @table_schema
         end
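Net effect of these hunks: the schema file is no longer read once at configure time; instead the configured path is resolved per chunk via extract_placeholders, and each distinct resolved file is parsed a single time into @read_schemas. A rough standalone sketch of that cache behavior (assuming foo.schema and bar.schema exist on disk):

    # Stand-in for the plugin's per-path schema cache; gsub plays the role
    # of extract_placeholders here.
    require "multi_json"

    schema_path_template = "${tag}.schema"
    read_schemas = {} # resolved path => parsed schema

    %w[foo bar foo].each do |tag|
      path = schema_path_template.gsub("${tag}", tag)
      read_schemas[path] ||= MultiJson.load(File.read(path))
    end
    # "foo.schema" was parsed once even though tag "foo" appeared twice.
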
data/lib/fluent/plugin/out_bigquery_insert.rb
CHANGED
@@ -29,6 +29,9 @@ module Fluent
       # If insert_id_field is not specified, true means to allow duplicate rows
       config_param :allow_retry_insert_errors, :bool, default: false
 
+      ## RequirePartitionFilter
+      config_param :require_partition_filter, :bool, default: false
+
       ## Buffer
       config_section :buffer do
         config_set_default :@type, "memory"
data/test/plugin/test_out_bigquery_base.rb
CHANGED
@@ -147,33 +147,6 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
     assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
   end
 
-  def test_configure_auth_json_key_as_file_raise_permission_error
-    json_key_path = 'test/plugin/testdata/json_key.json'
-    json_key_path_dir = File.dirname(json_key_path)
-
-    begin
-      File.chmod(0000, json_key_path_dir)
-
-      driver = create_driver(%[
-        table foo
-        auth_method json_key
-        json_key #{json_key_path}
-        project yourproject_id
-        dataset yourdataset_id
-        schema [
-          {"name": "time", "type": "INTEGER"},
-          {"name": "status", "type": "INTEGER"},
-          {"name": "bytes", "type": "INTEGER"}
-        ]
-      ])
-      assert_raises(Errno::EACCES) do
-        driver.instance.writer.client
-      end
-    ensure
-      File.chmod(0755, json_key_path_dir)
-    end
-  end
-
   def test_configure_auth_json_key_as_string
     json_key = '{"private_key": "X", "client_email": "' + 'x' * 255 + '@developer.gserviceaccount.com"}'
     json_key_io = StringIO.new(json_key)
@@ -199,6 +172,8 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
   end
 
   def test_configure_auth_application_default
+    omit "This testcase depends on some environment variables." if ENV["CI"] == "true"
+
     driver = create_driver(%[
       table foo
       auth_method application_default
@@ -576,4 +551,24 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
     assert_equal :string, table_schema["argv"].type
     assert_equal :repeated, table_schema["argv"].mode
   end
+
+  def test_resolve_schema_path_with_placeholder
+    now = Time.now.to_i
+    driver = create_driver(<<-CONFIG)
+      table ${tag}_%Y%m%d
+      auth_method json_key
+      json_key jsonkey.josn
+      project yourproject_id
+      dataset yourdataset_id
+      schema_path ${tag}.schema
+
+      <buffer tag, time>
+        timekey 1d
+      </buffer>
+    CONFIG
+
+    metadata = Fluent::Plugin::Buffer::Metadata.new(now, "foo", {})
+
+    assert_equal "foo.schema", driver.instance.read_schema_target_path(metadata)
+  end
 end
data/test/plugin/test_out_bigquery_insert.rb
CHANGED
@@ -5,6 +5,8 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
     Fluent::Test.setup
   end
 
+  SCHEMA_PATH = File.join(File.dirname(__FILE__), "testdata", "apache.schema")
+
   CONFIG = %[
     table foo
     email foo@bar.example
@@ -260,7 +262,7 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
 
     driver.instance_start
    tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
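The recurring `metadata = ...` fix in these tests constructs the buffer metadata explicitly. To the best of my knowledge, Fluentd v1 defines Fluent::Plugin::Buffer::Metadata as a plain three-field struct, and the chunk tests only need it as a unique buffer key, so the exact field values do not matter; a hedged sketch:

    # Assumes Buffer::Metadata is Struct.new(:timekey, :tag, :variables),
    # as in Fluentd v1; the tests care only that the object is a valid key.
    require "fluent/plugin/buffer"

    metadata = Fluent::Plugin::Buffer::Metadata.new(Time.now.to_i, "tag", {})
    metadata.tag # => "tag"
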
@@ -344,6 +346,8 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
     CONFIG
 
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
     stub_writer do |writer|
       body = {
         rows: [{json: Fluent::BigQuery::Helper.deep_symbolize_keys(message)}],
@@ -360,7 +364,7 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
           table_id: 'foo',
         },
         schema: {
-          fields:
+          fields: schema_fields,
         },
       }, {})
     end
@@ -416,8 +420,12 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       time_partitioning_type day
       time_partitioning_field time
       time_partitioning_expiration 1h
+
+      require_partition_filter true
     CONFIG
 
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
     stub_writer do |writer|
       body = {
         rows: [message],
@@ -434,13 +442,14 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
           table_id: 'foo',
         },
         schema: {
-          fields:
+          fields: schema_fields,
         },
         time_partitioning: {
           type: 'DAY',
           field: 'time',
           expiration_ms: 3600000,
         },
+        require_partition_filter: true,
       }, {})
     end
 
@@ -495,7 +504,6 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       time_partitioning_type day
       time_partitioning_field time
       time_partitioning_expiration 1h
-      time_partitioning_require_partition_filter true
 
       clustering_fields [
         "time",
@@ -503,6 +511,8 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       ]
     CONFIG
 
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
     stub_writer do |writer|
       body = {
         rows: [message],
@@ -519,7 +529,7 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
           table_id: 'foo',
         },
         schema: {
-          fields:
+          fields: schema_fields,
         },
         time_partitioning: {
           type: 'DAY',
data/test/plugin/test_out_bigquery_load.rb
CHANGED
@@ -132,7 +132,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
 
     driver.instance_start
     tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
@@ -158,7 +158,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       stub!.job_reference.stub!.job_id { "dummy_job_id" }
     end
 
-    mock(writer.client).get_job('yourproject_id', 'dummy_job_id',
+    mock(writer.client).get_job('yourproject_id', 'dummy_job_id', :location=>nil) do
       stub! do |s|
         s.id { 'dummy_job_id' }
         s.configuration.stub! do |_s|
@@ -215,7 +215,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
 
     driver.instance_start
     tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
@@ -241,7 +241,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       stub!.job_reference.stub!.job_id { "dummy_job_id" }
     end
 
-    mock(writer.client).get_job('yourproject_id', 'dummy_job_id',
+    mock(writer.client).get_job('yourproject_id', 'dummy_job_id', :location=>nil) do
       stub! do |s|
         s.id { 'dummy_job_id' }
         s.configuration.stub! do |_s|
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 2.
+  version: 2.3.0
 platform: ruby
 authors:
 - Naoya Ito
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2022-02-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -139,8 +139,9 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".github/ISSUE_TEMPLATE.md"
+- ".github/workflows/linux.yml"
+- ".github/workflows/windows.yml"
 - ".gitignore"
-- ".travis.yml"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -183,7 +184,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Fluentd plugin to store data on Google BigQuery