fluent-plugin-bigquery 2.0.0.beta → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/workflows/linux.yml +31 -0
- data/.github/workflows/windows.yml +27 -0
- data/README.md +44 -28
- data/lib/fluent/plugin/bigquery/errors.rb +6 -10
- data/lib/fluent/plugin/bigquery/schema.rb +11 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +85 -38
- data/lib/fluent/plugin/out_bigquery_base.rb +32 -4
- data/lib/fluent/plugin/out_bigquery_insert.rb +4 -7
- data/lib/fluent/plugin/out_bigquery_load.rb +1 -0
- data/test/plugin/test_out_bigquery_base.rb +22 -27
- data/test/plugin/test_out_bigquery_insert.rb +143 -9
- data/test/plugin/test_out_bigquery_load.rb +60 -22
- data/test/plugin/test_record_schema.rb +17 -2
- metadata +7 -7
- data/.travis.yml +0 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4209a2b6eaaf0b6f8ba315b6f5de6690e28fb47890aeea777bdb31889e4785ab
+  data.tar.gz: b0983fb4fa16d72059b0e679ea4ee627d19e805779fa010888fa1723354896a5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a6fc6891eda12bbc1272af7af9c4e8d48e588bc7ef65153b3a7524e39468baebb8fdb925856d1850bbda12fed5d33865faa56542503f76fdf724a18937c7d56e
+  data.tar.gz: fff0599b6a838cb4ff233ba9585b558ff733eed8063c1cf36ee08aaacb9b3c2ca1bce4d13db2a51ecc72c398ba751a18b2856a6348f43738ee8ca366becdea61
data/.github/workflows/linux.yml
ADDED
@@ -0,0 +1,31 @@
+name: Testing on Ubuntu
+on:
+  - push
+  - pull_request
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - 2.6
+          - 2.7
+          - 3.0
+          - 3.1
+        os:
+          - ubuntu-latest
+    name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v2
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: unit testing
+        env:
+          CI: true
+        run: |
+          ruby -v
+          gem install bundler rake
+          bundle install --jobs 4 --retry 3
+          bundle exec rake test
data/.github/workflows/windows.yml
ADDED
@@ -0,0 +1,27 @@
+name: Testing on Windows
+on:
+  - push
+  - pull_request
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby: [ '2.6', '2.7', '3.0', '3.1' ]
+        os:
+          - windows-latest
+    name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v2
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: unit testing
+        env:
+          CI: true
+        run: |
+          ruby -v
+          gem install bundler rake
+          bundle install --jobs 4 --retry 3
+          bundle exec rake test
data/README.md
CHANGED
@@ -1,6 +1,12 @@
 # fluent-plugin-bigquery
 
-
+## Notice
+
+We will transfer fluent-plugin-bigquery repository to [fluent-plugins-nursery](https://github.com/fluent-plugins-nursery) organization.
+It does not change maintenance plan.
+The main purpose is that it solves mismatch between maintainers and current organization.
+
+---
 
 [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery.
 
@@ -18,11 +24,13 @@
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 
-## Version
-v1.0.0 or later supports fluentd-0.14.0 or later.
-If you use fluentd-0.12.x, please use v0.4.x.
+## Support Version
 
-
+| plugin version | fluentd version | ruby version |
+| :----------- | :----------- | :----------- |
+| v0.4.x | 0.12.x | 2.0 or later |
+| v1.x.x | 0.14.x or later | 2.2 or later |
+| v2.x.x | 0.14.x or later | 2.3 or later |
 
 ## With docker image
 If you use official alpine based fluentd docker image (https://github.com/fluent/fluentd-docker-image),
@@ -37,28 +45,31 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 
 #### common
 
-| name
-| auth_method
-| email
-| private_key_path
-| private_key_passphrase
-| json_key
+| name | type | required? | placeholder? | default | description |
+| :-------------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+| auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+| email | string | yes (private_key) | no | nil | GCP Service Account Email |
+| private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
+| private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
+| json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
+| location | string | no | no | nil | BigQuery Data Location. The geographic location of the job. Required except for US and EU. |
+| project | string | yes | yes | nil | |
+| dataset | string | yes | yes | nil | |
+| table | string | yes (either `tables`) | yes | nil | |
+| tables | array(string) | yes (either `table`) | yes | nil | can set multi table names splitted by `,` |
+| auto_create_table | bool | no | no | false | If true, creates table automatically |
+| ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+| schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. |
+| schema_path | string | yes (either `fetch_schema`) | yes | nil | Schema Definition file path. It is formatted by JSON. |
+| fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. |
+| fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored |
+| schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
+| request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
+| request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
+| time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature. |
+| time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition. |
+| time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. |
+| clustering_fields | array(string) | no | no | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
 
 #### bigquery_insert
 
@@ -69,6 +80,7 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. see. https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor |
 | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
 | allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
+| require_partition_filter | bool | no | no | false | If true, queries over this table require a partition filter that can be used for partition elimination to be specified. |
 
 #### bigquery_load
 
@@ -431,7 +443,7 @@ Use placeholder.
 
 ```apache
 <match dummy>
-  @type
+  @type bigquery_load
 
   ...
   table accesslog$%Y%m%d
@@ -444,6 +456,8 @@ Use placeholder.
 ```
 
 But, Dynamic table creating doesn't support date partitioned table yet.
+And streaming insert is not allowed to insert with `$%Y%m%d` suffix.
+If you use date partitioned table with streaming insert, Please omit `$%Y%m%d` suffix from `table`.
 
 ### Dynamic table creating
 
@@ -465,6 +479,8 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
 </match>
 ```
 
+Also, you can create clustered table by using `clustering_fields`.
+
 ### Table schema
 
 There are three methods to describe the schema of the target table.
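The `schema` and `schema_path` options above take a JSON array of BigQuery field definitions. As an illustration only (the field names below are examples, not values from the gem), such a file could be produced like this:

```ruby
require "json"

# Illustrative sketch: the JSON shape expected by `schema` / `schema_path`
# is an array of objects with "name", "type" and, optionally, "mode"/"fields".
schema = [
  { "name" => "time",   "type" => "TIMESTAMP" },
  { "name" => "status", "type" => "INTEGER" },
  { "name" => "bytes",  "type" => "INTEGER" }
]

# Write the definition to a file that `schema_path` can point at.
File.write("apache.schema", JSON.pretty_generate(schema))
```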
data/lib/fluent/plugin/bigquery/errors.rb
CHANGED
@@ -7,10 +7,9 @@ module Fluent
       RETRYABLE_STATUS_CODE = [500, 502, 503, 504]
 
       class << self
-
-
-
-
+        # @param e [Google::Apis::Error]
+        # @param message [String]
+        def wrap(e, message = nil)
           if retryable_error?(e)
             RetryableError.new(message, e)
           else
@@ -18,12 +17,9 @@ module Fluent
           end
         end
 
-
-
-
-
-          retryable_error_reason?(reason) ||
-            (e.is_a?(Google::Apis::ServerError) && RETRYABLE_STATUS_CODE.include?(e.status_code))
+        # @param e [Google::Apis::Error]
+        def retryable_error?(e)
+          e.is_a?(Google::Apis::ServerError) && RETRYABLE_STATUS_CODE.include?(e.status_code)
         end
 
         def retryable_error_reason?(reason)
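For context, a minimal sketch (not part of the gem) of how a caller might combine `Fluent::BigQuery::Error.wrap` with `retryable?` to decide whether a failed API call should be retried; the method name, logger and rescue site are illustrative assumptions:

```ruby
require "fluent/plugin/bigquery/errors" # assumes the gem is on the load path

# Sketch: classify a Google API failure and decide whether to retry.
# Only Google::Apis::ServerError with status 500/502/503/504 is treated as retryable.
def handle_bigquery_failure(error, log)
  wrapped = Fluent::BigQuery::Error.wrap(error)
  if wrapped.retryable?
    log.warn("retryable BigQuery error, the chunk will be retried: #{wrapped.message}")
    raise wrapped
  else
    log.error("unretryable BigQuery error: #{wrapped.message}")
  end
end
```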
data/lib/fluent/plugin/bigquery/schema.rb
CHANGED
@@ -86,6 +86,16 @@ module Fluent
         end
       end
 
+      class NumericFieldSchema < FieldSchema
+        def type
+          :numeric
+        end
+
+        def format_one(value)
+          value.to_s
+        end
+      end
+
       class BooleanFieldSchema < FieldSchema
         def type
           :boolean
@@ -169,6 +179,7 @@ module Fluent
           string: StringFieldSchema,
           integer: IntegerFieldSchema,
           float: FloatFieldSchema,
+          numeric: NumericFieldSchema,
           boolean: BooleanFieldSchema,
           timestamp: TimestampFieldSchema,
           date: DateFieldSchema,
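A small illustration of the new NUMERIC support: `NumericFieldSchema#format_one` simply calls `to_s`, so a NUMERIC column value is serialized as a decimal string. The sketch below uses `RecordSchema#load_schema` and `#format_one` as they appear elsewhere in this diff; the field name is illustrative and the gem plus its dependencies are assumed to be installed:

```ruby
require "fluent/plugin/bigquery/schema"

# Sketch: a schema with one NUMERIC field, then format a single record.
schema = Fluent::BigQuery::RecordSchema.new("record")
schema.load_schema([
  { "name" => "utilisation", "type" => "NUMERIC", "mode" => "NULLABLE" }
])

p schema.format_one({ "utilisation" => 0.837 })
# expected: {"utilisation"=>"0.837"} -- the Float is coerced with #to_s
```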
data/lib/fluent/plugin/bigquery/writer.rb
CHANGED
@@ -34,12 +34,9 @@ module Fluent
           }
         }
 
-        if
-
-
-          expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil
-          }.select { |_, value| !value.nil? }
-        end
+        definition.merge!(time_partitioning: time_partitioning) if time_partitioning
+        definition.merge!(require_partition_filter: require_partition_filter) if require_partition_filter
+        definition.merge!(clustering: clustering) if clustering
         client.insert_table(project, dataset, definition, {})
         log.debug "create table", project_id: project, dataset: dataset, table: table_id
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
@@ -50,10 +47,9 @@ module Fluent
           return
         end
 
-
-        log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message, reason: reason
+        log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
 
-        if
+        if create_table_retry_count < create_table_retry_limit
           sleep create_table_retry_wait
           create_table_retry_wait *= 2
           create_table_retry_count += 1
@@ -76,14 +72,19 @@ module Fluent
         nil
       end
 
-      def insert_rows(project, dataset, table_id, rows, template_suffix: nil)
+      def insert_rows(project, dataset, table_id, rows, schema, template_suffix: nil)
         body = {
           rows: rows,
           skip_invalid_rows: @options[:skip_invalid_rows],
           ignore_unknown_values: @options[:ignore_unknown_values],
         }
         body.merge!(template_suffix: template_suffix) if template_suffix
-
+
+        if @options[:auto_create_table]
+          res = insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+        else
+          res = client.insert_all_table_data(project, dataset, table_id, body, {})
+        end
         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
 
         if res.insert_errors && !res.insert_errors.empty?
@@ -100,8 +101,7 @@ module Fluent
           end
         end
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-
-        error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason }
+        error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message }
         wrapped = Fluent::BigQuery::Error.wrap(e)
         if wrapped.retryable?
           log.warn "tabledata.insertAll API", error_data
@@ -131,9 +131,6 @@ module Fluent
               dataset_id: dataset,
               table_id: table_id,
             },
-            schema: {
-              fields: fields.to_a,
-            },
             write_disposition: "WRITE_APPEND",
             source_format: source_format,
             ignore_unknown_values: @options[:ignore_unknown_values],
@@ -143,17 +140,19 @@ module Fluent
         }
 
         job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
-        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if @options[:time_partitioning_type]
         configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
-        # If target table is already exist, omit schema configuration.
-        # Because schema changing is easier.
         begin
-
-
+          # Check table existance
+          client.get_table(project, dataset, table_id)
+        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+          if e.status_code == 404 && /Not Found: Table/i =~ e.message
+            raise Fluent::BigQuery::UnRetryableError.new("Table is not found") unless @options[:auto_create_table]
+            raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
+            configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
+            configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
+            configuration[:configuration][:load].merge!(clustering: clustering) if clustering
           end
-        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError
-          raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
         end
 
         res = client.insert_job(
@@ -166,19 +165,7 @@ module Fluent
         )
         JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id)
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-
-        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
-
-        if @options[:auto_create_table] && e.status_code == 404 && /Not Found: Table/i =~ e.message
-          # Table Not Found: Auto Create Table
-          create_table(
-            project,
-            dataset,
-            table_id,
-            fields,
-          )
-          raise "table created. send rows next time."
-        end
+        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message
 
         if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
           return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id)
@@ -190,8 +177,9 @@ module Fluent
       def fetch_load_job(job_reference)
         project = job_reference.project_id
         job_id = job_reference.job_id
+        location = @options[:location]
 
-        res = client.get_job(project, job_id)
+        res = client.get_job(project, job_id, location: location)
         log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
 
         if res.status.state == "DONE"
@@ -227,9 +215,10 @@ module Fluent
           end
         end
 
+        # `stats` can be nil if we receive a warning like "Warning: Load job succeeded with data imported, however statistics may be lost due to internal error."
         stats = response.statistics.load
         duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0
-        log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats
+        log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats&.input_file_bytes, input_files: stats&.input_files, output_bytes: stats&.output_bytes, output_rows: stats&.output_rows, bad_records: stats&.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id
         @num_errors_per_chunk.delete(chunk_id_hex)
       end
 
@@ -315,6 +304,64 @@ module Fluent
           "NEWLINE_DELIMITED_JSON"
         end
       end
+
+      def time_partitioning
+        return @time_partitioning if instance_variable_defined?(:@time_partitioning)
+
+        if @options[:time_partitioning_type]
+          @time_partitioning = {
+            type: @options[:time_partitioning_type].to_s.upcase,
+            field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
+            expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
+          }.reject { |_, v| v.nil? }
+        else
+          @time_partitioning
+        end
+      end
+
+      def require_partition_filter
+        return @require_partition_filter if instance_variable_defined?(:@require_partition_filter)
+
+        if @options[:require_partition_filter]
+          @require_partition_filter = @options[:require_partition_filter]
+        else
+          @require_partition_filter
+        end
+      end
+
+      def clustering
+        return @clustering if instance_variable_defined?(:@clustering)
+
+        if @options[:clustering_fields]
+          @clustering = {
+            fields: @options[:clustering_fields]
+          }
+        else
+          @clustering
+        end
+      end
+
+      def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+        try_count ||= 1
+        res = client.insert_all_table_data(project, dataset, table_id, body, {})
+      rescue Google::Apis::ClientError => e
+        if e.status_code == 404 && /Not Found: Table/i =~ e.message
+          if try_count == 1
+            # Table Not Found: Auto Create Table
+            create_table(project, dataset, table_id, schema)
+          elsif try_count > 10
+            raise "A new table was created but it is not found."
+          end
+
+          # Retry to insert several times because the created table is not visible from Streaming insert for a little while
+          # cf. https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts
+          try_count += 1
+          sleep 5
+          log.debug "Retry to insert rows", project_id: project, dataset: dataset, table: table_id
+          retry
+        end
+        raise
+      end
     end
   end
 end
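The new `time_partitioning`, `require_partition_filter` and `clustering` helpers above translate plugin options into the fields sent to the tables.insert API. A standalone sketch of that mapping (a simplified stand-in for illustration, not the gem's actual method):

```ruby
# Sketch: option -> API-field mapping equivalent to the helpers above.
# time_partitioning_expiration is configured in seconds and sent as milliseconds.
def build_table_options(options)
  definition = {}

  if options[:time_partitioning_type]
    definition[:time_partitioning] = {
      type: options[:time_partitioning_type].to_s.upcase,            # :day -> "DAY"
      field: options[:time_partitioning_field],
      expiration_ms: options[:time_partitioning_expiration] &&
                     options[:time_partitioning_expiration] * 1000,
    }.reject { |_, v| v.nil? }
  end

  definition[:require_partition_filter] = true if options[:require_partition_filter]
  definition[:clustering] = { fields: options[:clustering_fields] } if options[:clustering_fields]
  definition
end

p build_table_options(
  time_partitioning_type: :day,
  time_partitioning_field: "time",
  time_partitioning_expiration: 3600,
  clustering_fields: %w[time vhost],
)
# => {:time_partitioning=>{:type=>"DAY", :field=>"time", :expiration_ms=>3600000},
#     :clustering=>{:fields=>["time", "vhost"]}}
```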
data/lib/fluent/plugin/out_bigquery_base.rb
CHANGED
@@ -29,6 +29,9 @@ module Fluent
      config_param :private_key_path, :string, default: nil
      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
      config_param :json_key, default: nil, secret: true
+     # The geographic location of the job. Required except for US and EU.
+     # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
+     config_param :location, :string, default: nil
 
      # see as simple reference
      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
@@ -67,8 +70,12 @@ module Fluent
 
      ## Partitioning
      config_param :time_partitioning_type, :enum, list: [:day], default: nil
+     config_param :time_partitioning_field, :string, default: nil
      config_param :time_partitioning_expiration, :time, default: nil
 
+     ## Clustering
+     config_param :clustering_fields, :array, default: nil
+
      ## Formatter
      config_section :format do
        config_set_default :@type, 'json'
@@ -104,9 +111,6 @@ module Fluent
        if @schema
          @table_schema.load_schema(@schema)
        end
-       if @schema_path
-         @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
-       end
 
        formatter_config = conf.elements("format")[0]
        @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
@@ -119,6 +123,7 @@ module Fluent
        @tables_mutex = Mutex.new
        @fetched_schemas = {}
        @last_fetch_schema_time = Hash.new(0)
+       @read_schemas = {}
      end
 
      def multi_workers_ready?
@@ -130,6 +135,7 @@ module Fluent
          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
          email: @email,
          json_key: @json_key,
+         location: @location,
          source_format: @source_format,
          skip_invalid_rows: @skip_invalid_rows,
          ignore_unknown_values: @ignore_unknown_values,
@@ -138,7 +144,10 @@ module Fluent
          prevent_duplicate_load: @prevent_duplicate_load,
          auto_create_table: @auto_create_table,
          time_partitioning_type: @time_partitioning_type,
+         time_partitioning_field: @time_partitioning_field,
          time_partitioning_expiration: @time_partitioning_expiration,
+         require_partition_filter: @require_partition_filter,
+         clustering_fields: @clustering_fields,
          timeout_sec: @request_timeout_sec,
          open_timeout_sec: @request_open_timeout_sec,
        })
@@ -151,6 +160,8 @@ module Fluent
        schema =
          if @fetch_schema
            fetch_schema(meta)
+         elsif @schema_path
+           read_schema(meta)
          else
            @table_schema
          end
@@ -182,7 +193,7 @@ module Fluent
          table_schema.load_schema(schema)
          @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
        else
-         if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].
+         if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].nil?
            raise "failed to fetch schema from bigquery"
          else
            log.warn "#{table_id} uses previous schema"
@@ -199,9 +210,26 @@ module Fluent
        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
      end
 
+     def read_schema(metadata)
+       schema_path = read_schema_target_path(metadata)
+
+       unless @read_schemas[schema_path]
+         table_schema = Fluent::BigQuery::RecordSchema.new("record")
+         table_schema.load_schema(MultiJson.load(File.read(schema_path)))
+         @read_schemas[schema_path] = table_schema
+       end
+       @read_schemas[schema_path]
+     end
+
+     def read_schema_target_path(metadata)
+       extract_placeholders(@schema_path, metadata)
+     end
+
      def get_schema(project, dataset, metadata)
        if @fetch_schema
          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+       elsif @schema_path
+         @read_schemas[read_schema_target_path(metadata)] || read_schema(metadata)
        else
          @table_schema
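With this change `schema_path` can contain buffer placeholders (for example `${tag}.schema`), and each resolved path is read once and cached in `@read_schemas`. A simplified, standalone sketch of that resolve-then-cache behaviour (the placeholder substitution below is a stand-in for Fluentd's `extract_placeholders`; the class name is illustrative):

```ruby
require "json"

# Sketch: resolve a schema_path template per tag and cache the parsed schema,
# mirroring read_schema / read_schema_target_path above.
class SchemaPathCache
  def initialize(template)
    @template = template      # e.g. "${tag}.schema"
    @read_schemas = {}        # resolved path => parsed schema
  end

  def schema_for(tag)
    path = @template.gsub("${tag}", tag)  # stand-in for extract_placeholders
    @read_schemas[path] ||= JSON.parse(File.read(path))
  end
end

# cache = SchemaPathCache.new("${tag}.schema")
# cache.schema_for("foo")  # reads "foo.schema" on first use, cached afterwards
```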
data/lib/fluent/plugin/out_bigquery_insert.rb
CHANGED
@@ -29,6 +29,9 @@ module Fluent
      # If insert_id_field is not specified, true means to allow duplicate rows
      config_param :allow_retry_insert_errors, :bool, default: false
 
+     ## RequirePartitionFilter
+     config_param :require_partition_filter, :bool, default: false
+
      ## Buffer
      config_section :buffer do
        config_set_default :@type, "memory"
@@ -96,14 +99,8 @@ module Fluent
      end
 
      def insert(project, dataset, table_id, rows, schema, template_suffix)
-       writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
+       writer.insert_rows(project, dataset, table_id, rows, schema, template_suffix: template_suffix)
      rescue Fluent::BigQuery::Error => e
-       if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
-         # Table Not Found: Auto Create Table
-         writer.create_table(project, dataset, table_id, schema)
-         raise "table created. send rows next time."
-       end
-
        raise if e.retryable?
 
        if @secondary
data/test/plugin/test_out_bigquery_base.rb
CHANGED
@@ -147,33 +147,6 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
     assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
   end
 
-  def test_configure_auth_json_key_as_file_raise_permission_error
-    json_key_path = 'test/plugin/testdata/json_key.json'
-    json_key_path_dir = File.dirname(json_key_path)
-
-    begin
-      File.chmod(0000, json_key_path_dir)
-
-      driver = create_driver(%[
-        table foo
-        auth_method json_key
-        json_key #{json_key_path}
-        project yourproject_id
-        dataset yourdataset_id
-        schema [
-          {"name": "time", "type": "INTEGER"},
-          {"name": "status", "type": "INTEGER"},
-          {"name": "bytes", "type": "INTEGER"}
-        ]
-      ])
-      assert_raises(Errno::EACCES) do
-        driver.instance.writer.client
-      end
-    ensure
-      File.chmod(0755, json_key_path_dir)
-    end
-  end
-
   def test_configure_auth_json_key_as_string
     json_key = '{"private_key": "X", "client_email": "' + 'x' * 255 + '@developer.gserviceaccount.com"}'
     json_key_io = StringIO.new(json_key)
@@ -199,6 +172,8 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
   end
 
   def test_configure_auth_application_default
+    omit "This testcase depends on some environment variables." if ENV["CI"] == "true"
+
     driver = create_driver(%[
       table foo
       auth_method application_default
@@ -576,4 +551,24 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
     assert_equal :string, table_schema["argv"].type
     assert_equal :repeated, table_schema["argv"].mode
   end
+
+  def test_resolve_schema_path_with_placeholder
+    now = Time.now.to_i
+    driver = create_driver(<<-CONFIG)
+      table ${tag}_%Y%m%d
+      auth_method json_key
+      json_key jsonkey.josn
+      project yourproject_id
+      dataset yourdataset_id
+      schema_path ${tag}.schema
+
+      <buffer tag, time>
+        timekey 1d
+      </buffer>
+    CONFIG
+
+    metadata = Fluent::Plugin::Buffer::Metadata.new(now, "foo", {})
+
+    assert_equal "foo.schema", driver.instance.read_schema_target_path(metadata)
+  end
 end
data/test/plugin/test_out_bigquery_insert.rb
CHANGED
@@ -5,6 +5,8 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
     Fluent::Test.setup
   end
 
+  SCHEMA_PATH = File.join(File.dirname(__FILE__), "testdata", "apache.schema")
+
   CONFIG = %[
     table foo
     email foo@bar.example
@@ -121,7 +123,6 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
     driver = create_driver
 
     stub_writer do |writer|
-      mock.proxy(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [{json: hash_including(entry)}], template_suffix: nil)
       mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', {
         rows: [{json: hash_including(entry)}],
         skip_invalid_rows: false,
@@ -261,7 +262,7 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
 
     driver.instance_start
     tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
@@ -345,11 +346,27 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
     CONFIG
 
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
     stub_writer do |writer|
-
-
-
-
+      body = {
+        rows: [{json: Fluent::BigQuery::Helper.deep_symbolize_keys(message)}],
+        skip_invalid_rows: false,
+        ignore_unknown_values: false,
+      }
+      mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end.at_least(1)
+      mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
+
+      mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
+        table_reference: {
+          table_id: 'foo',
+        },
+        schema: {
+          fields: schema_fields,
+        },
+      }, {})
     end
 
     assert_raise(RuntimeError) do
@@ -401,14 +418,131 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
 
       time_partitioning_type day
+      time_partitioning_field time
      time_partitioning_expiration 1h
+
+      require_partition_filter true
     CONFIG
 
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
     stub_writer do |writer|
-
-
+      body = {
+        rows: [message],
+        skip_invalid_rows: false,
+        ignore_unknown_values: false,
+      }
+      mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end.at_least(1)
+      mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
+
+      mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
+        table_reference: {
+          table_id: 'foo',
+        },
+        schema: {
+          fields: schema_fields,
+        },
+        time_partitioning: {
+          type: 'DAY',
+          field: 'time',
+          expiration_ms: 3600000,
+        },
+        require_partition_filter: true,
+      }, {})
+    end
+
+    assert_raise(RuntimeError) do
+      driver.run do
+        driver.feed("tag", Fluent::EventTime.now, message[:json])
       end
-
+    end
+  end
+
+  def test_auto_create_clustered_table_by_bigquery_api
+    now = Time.now
+    message = {
+      json: {
+        time: now.to_i,
+        request: {
+          vhost: "bar",
+          path: "/path/to/baz",
+          method: "GET",
+          protocol: "HTTP/1.0",
+          agent: "libwww",
+          referer: "http://referer.example",
+          time: (now - 1).to_f,
+          bot_access: true,
+          loginsession: false,
+        },
+        remote: {
+          host: "remote.example",
+          ip: "192.168.1.1",
+          user: "nagachika",
+        },
+        response: {
+          status: 200,
+          bytes: 72,
+        },
+      }
+    }
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      time_format %s
+      time_field time
+
+      auto_create_table true
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+
+      time_partitioning_type day
+      time_partitioning_field time
+      time_partitioning_expiration 1h
+
+      clustering_fields [
+        "time",
+        "vhost"
+      ]
+    CONFIG
+
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
+    stub_writer do |writer|
+      body = {
+        rows: [message],
+        skip_invalid_rows: false,
+        ignore_unknown_values: false,
+      }
+      mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end.at_least(1)
+      mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
+
+      mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
+        table_reference: {
+          table_id: 'foo',
+        },
+        schema: {
+          fields: schema_fields,
+        },
+        time_partitioning: {
+          type: 'DAY',
+          field: 'time',
+          expiration_ms: 3600000,
+        },
+        clustering: {
+          fields: [
+            'time',
+            'vhost',
+          ],
+        },
+      }, {})
     end
 
     assert_raise(RuntimeError) do
data/test/plugin/test_out_bigquery_load.rb
CHANGED
@@ -39,10 +39,8 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       writer
     end
   end
-
-  def test_write
-    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
 
+  def test_write
     response_stub = stub!
 
     driver = create_driver
@@ -60,9 +58,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
             dataset_id: 'yourdataset_id',
             table_id: 'foo',
           },
-          schema: {
-            fields: schema_fields,
-          },
           write_disposition: "WRITE_APPEND",
           source_format: "NEWLINE_DELIMITED_JSON",
           ignore_unknown_values: false,
@@ -99,7 +94,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       schema_path #{SCHEMA_PATH}
       prevent_duplicate_load true
     CONFIG
-    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
 
     response_stub = stub!
     stub_writer do |writer|
@@ -116,9 +110,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
             dataset_id: 'yourdataset_id',
             table_id: 'foo',
           },
-          schema: {
-            fields: schema_fields,
-          },
           write_disposition: "WRITE_APPEND",
           source_format: "NEWLINE_DELIMITED_JSON",
           ignore_unknown_values: false,
@@ -138,11 +129,10 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
 
   def test_write_with_retryable_error
     driver = create_driver
-    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
 
     driver.instance_start
     tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
@@ -158,9 +148,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
             dataset_id: 'yourdataset_id',
             table_id: 'foo',
           },
-          schema: {
-            fields: schema_fields,
-          },
          write_disposition: "WRITE_APPEND",
          source_format: "NEWLINE_DELIMITED_JSON",
          ignore_unknown_values: false,
@@ -171,7 +158,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       stub!.job_reference.stub!.job_id { "dummy_job_id" }
     end
 
-    mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
+    mock(writer.client).get_job('yourproject_id', 'dummy_job_id', :location=>nil) do
       stub! do |s|
         s.id { 'dummy_job_id' }
         s.configuration.stub! do |_s|
@@ -225,11 +212,10 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
         utc
       </secondary>
     CONFIG
-    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
 
     driver.instance_start
     tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
@@ -245,9 +231,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
             dataset_id: 'yourdataset_id',
             table_id: 'foo',
           },
-          schema: {
-            fields: schema_fields,
-          },
          write_disposition: "WRITE_APPEND",
          source_format: "NEWLINE_DELIMITED_JSON",
          ignore_unknown_values: false,
@@ -258,7 +241,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       stub!.job_reference.stub!.job_id { "dummy_job_id" }
     end
 
-    mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
+    mock(writer.client).get_job('yourproject_id', 'dummy_job_id', :location=>nil) do
      stub! do |s|
        s.id { 'dummy_job_id' }
        s.configuration.stub! do |_s|
@@ -289,6 +272,61 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
     driver.instance_shutdown
   end
 
+  def test_write_with_auto_create_table
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      <buffer>
+        @type memory
+      </buffer>
+
+      <inject>
+        time_format %s
+        time_key time
+      </inject>
+
+      auto_create_table true
+      schema_path #{SCHEMA_PATH}
+    CONFIG
+
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
+    stub_writer do |writer|
+      mock(writer.client).get_table('yourproject_id', 'yourdataset_id', 'foo') do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end
+
+      mock(writer.client).insert_job('yourproject_id', {
+        configuration: {
+          load: {
+            destination_table: {
+              project_id: 'yourproject_id',
+              dataset_id: 'yourdataset_id',
+              table_id: 'foo',
+            },
+            write_disposition: "WRITE_APPEND",
+            source_format: "NEWLINE_DELIMITED_JSON",
+            ignore_unknown_values: false,
+            max_bad_records: 0,
+            schema: {
+              fields: schema_fields,
+            },
+          }
+        }
+      }, {upload_source: duck_type(:write, :sync, :rewind), content_type: "application/octet-stream"}) do
+        stub!.job_reference.stub!.job_id { "dummy_job_id" }
+      end
+    end
+
+    driver.run do
+      driver.feed("tag", Time.now.to_i, {"a" => "b"})
+    end
+  end
+
   private
 
   def create_response_stub(response)
data/test/plugin/test_record_schema.rb
CHANGED
@@ -27,6 +27,11 @@ class RecordSchemaTest < Test::Unit::TestCase
         "name" => "argv",
         "type" => "STRING",
         "mode" => "REPEATED"
+      },
+      {
+        "name" => "utilisation",
+        "type" => "NUMERIC",
+        "mode" => "NULLABLE"
       }
     ]
   end
@@ -58,6 +63,11 @@ class RecordSchemaTest < Test::Unit::TestCase
         "type" => "STRING",
         "mode" => "REPEATED"
       },
+      {
+        "name" => "utilisation",
+        "type" => "NUMERIC",
+        "mode" => "NULLABLE"
+      },
       {
         "name" => "new_column",
         "type" => "STRING",
@@ -93,6 +103,11 @@ class RecordSchemaTest < Test::Unit::TestCase
         "type" => "STRING",
         "mode" => "REPEATED"
       },
+      {
+        "name" => "utilisation",
+        "type" => "NUMERIC",
+        "mode" => "NULLABLE"
+      }
     ]
   end
 
@@ -142,12 +157,12 @@ class RecordSchemaTest < Test::Unit::TestCase
     time = Time.local(2016, 2, 7, 19, 0, 0).utc
 
     formatted = fields.format_one({
-      "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42]
+      "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42], "utilisation" => "0.837"
     })
     assert_equal(
       formatted,
       {
-        "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"]
+        "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"], "utilisation" => "0.837"
       }
     )
   end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 2.
+  version: 2.3.0
 platform: ruby
 authors:
 - Naoya Ito
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2022-02-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -139,8 +139,9 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".github/ISSUE_TEMPLATE.md"
+- ".github/workflows/linux.yml"
+- ".github/workflows/windows.yml"
 - ".gitignore"
-- ".travis.yml"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -179,12 +180,11 @@ required_ruby_version: !ruby/object:Gem::Requirement
     version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - "
+  - - ">="
   - !ruby/object:Gem::Version
-    version:
+    version: '0'
 requirements: []
-
-rubygems_version: 2.6.12
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Fluentd plugin to store data on Google BigQuery