fluent-plugin-bigquery 2.0.0.beta → 2.3.0
- checksums.yaml +5 -5
- data/.github/workflows/linux.yml +31 -0
- data/.github/workflows/windows.yml +27 -0
- data/README.md +44 -28
- data/lib/fluent/plugin/bigquery/errors.rb +6 -10
- data/lib/fluent/plugin/bigquery/schema.rb +11 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +85 -38
- data/lib/fluent/plugin/out_bigquery_base.rb +32 -4
- data/lib/fluent/plugin/out_bigquery_insert.rb +4 -7
- data/lib/fluent/plugin/out_bigquery_load.rb +1 -0
- data/test/plugin/test_out_bigquery_base.rb +22 -27
- data/test/plugin/test_out_bigquery_insert.rb +143 -9
- data/test/plugin/test_out_bigquery_load.rb +60 -22
- data/test/plugin/test_record_schema.rb +17 -2
- metadata +7 -7
- data/.travis.yml +0 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-SHA256:
-  metadata.gz:
-  data.tar.gz:
+SHA256:
+  metadata.gz: 4209a2b6eaaf0b6f8ba315b6f5de6690e28fb47890aeea777bdb31889e4785ab
+  data.tar.gz: b0983fb4fa16d72059b0e679ea4ee627d19e805779fa010888fa1723354896a5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a6fc6891eda12bbc1272af7af9c4e8d48e588bc7ef65153b3a7524e39468baebb8fdb925856d1850bbda12fed5d33865faa56542503f76fdf724a18937c7d56e
+  data.tar.gz: fff0599b6a838cb4ff233ba9585b558ff733eed8063c1cf36ee08aaacb9b3c2ca1bce4d13db2a51ecc72c398ba751a18b2856a6348f43738ee8ca366becdea61
data/.github/workflows/linux.yml
ADDED
@@ -0,0 +1,31 @@
+name: Testing on Ubuntu
+on:
+  - push
+  - pull_request
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby:
+          - 2.6
+          - 2.7
+          - 3.0
+          - 3.1
+        os:
+          - ubuntu-latest
+    name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v2
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: unit testing
+        env:
+          CI: true
+        run: |
+          ruby -v
+          gem install bundler rake
+          bundle install --jobs 4 --retry 3
+          bundle exec rake test
data/.github/workflows/windows.yml
ADDED
@@ -0,0 +1,27 @@
+name: Testing on Windows
+on:
+  - push
+  - pull_request
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        ruby: [ '2.6', '2.7', '3.0', '3.1' ]
+        os:
+          - windows-latest
+    name: Ruby ${{ matrix.ruby }} unit testing on ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v2
+      - uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: ${{ matrix.ruby }}
+      - name: unit testing
+        env:
+          CI: true
+        run: |
+          ruby -v
+          gem install bundler rake
+          bundle install --jobs 4 --retry 3
+          bundle exec rake test
data/README.md
CHANGED
@@ -1,6 +1,12 @@
 # fluent-plugin-bigquery
 
-
+## Notice
+
+We will transfer the fluent-plugin-bigquery repository to the [fluent-plugins-nursery](https://github.com/fluent-plugins-nursery) organization.
+This does not change the maintenance plan.
+The main purpose is to resolve the mismatch between the maintainers and the current organization.
+
+---
 
 [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery.
 
@@ -18,11 +24,13 @@
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 
-## Version
-v1.0.0 or later supports fluentd-0.14.0 or later.
-If you use fluentd-0.12.x, please use v0.4.x.
+## Support Version
 
-
+| plugin version | fluentd version | ruby version |
+| :------------- | :-------------- | :----------- |
+| v0.4.x | 0.12.x | 2.0 or later |
+| v1.x.x | 0.14.x or later | 2.2 or later |
+| v2.x.x | 0.14.x or later | 2.3 or later |
 
 ## With docker image
 If you use official alpine based fluentd docker image (https://github.com/fluent/fluentd-docker-image),
@@ -37,28 +45,31 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 
 #### common
 
-| name
-|
-| auth_method
-| email
-| private_key_path
-| private_key_passphrase
-| json_key
-|
-|
-|
-|
-|
-|
-|
-|
-|
-|
-|
-|
-|
-|
+| name | type | required? | placeholder? | default | description |
+| :--- | :--- | :-------- | :----------- | :------ | :---------- |
+| auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+| email | string | yes (private_key) | no | nil | GCP Service Account Email |
+| private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
+| private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
+| json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
+| location | string | no | no | nil | BigQuery Data Location. The geographic location of the job. Required except for US and EU. |
+| project | string | yes | yes | nil | |
+| dataset | string | yes | yes | nil | |
+| table | string | yes (either `tables`) | yes | nil | |
+| tables | array(string) | yes (either `table`) | yes | nil | can set multiple table names split by `,` |
+| auto_create_table | bool | no | no | false | If true, creates the table automatically |
+| ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+| schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted as JSON. |
+| schema_path | string | yes (either `fetch_schema`) | yes | nil | Schema Definition file path. It is formatted as JSON. |
+| fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch the table schema definition from the BigQuery table automatically. |
+| fetch_schema_table | string | no | yes | nil | If set, fetch the table schema definition from this table. If fetch_schema is false, this param is ignored. |
+| schema_cache_expire | integer | no | no | 600 | Value is in seconds. If the current time is past the expiration interval, re-fetch the table schema definition. |
+| request_timeout_sec | integer | no | no | nil | BigQuery API response timeout |
+| request_open_timeout_sec | integer | no | no | 60 | BigQuery API connection and request timeout. If you send big data to BigQuery, set a large value. |
+| time_partitioning_type | enum | no (either day) | no | nil | Type of BigQuery time partitioning feature. |
+| time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition. |
+| time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for BigQuery time partitioning. |
+| clustering_fields | array(string) | no | no | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
 
 #### bigquery_insert
 
@@ -69,6 +80,7 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. see. https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor |
 | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
 | allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
+| require_partition_filter | bool | no | no | false | If true, queries over this table require a partition filter that can be used for partition elimination to be specified. |
 
 #### bigquery_load
 
@@ -431,7 +443,7 @@ Use placeholder.
 
 ```apache
 <match dummy>
-  @type
+  @type bigquery_load
 
   ...
   table accesslog$%Y%m%d
@@ -444,6 +456,8 @@ Use placeholder.
 ```
 
 But, Dynamic table creating doesn't support date partitioned table yet.
+And streaming insert is not allowed to insert with the `$%Y%m%d` suffix.
+If you use a date partitioned table with streaming insert, please omit the `$%Y%m%d` suffix from `table`.
 
 ### Dynamic table creating
 
@@ -465,6 +479,8 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
 </match>
 ```
 
+Also, you can create a clustered table by using `clustering_fields`.
+
 ### Table schema
 
 There are three methods to describe the schema of the target table.
data/lib/fluent/plugin/bigquery/errors.rb
CHANGED
@@ -7,10 +7,9 @@ module Fluent
       RETRYABLE_STATUS_CODE = [500, 502, 503, 504]
 
       class << self
-
-
-
-
+        # @param e [Google::Apis::Error]
+        # @param message [String]
+        def wrap(e, message = nil)
           if retryable_error?(e)
             RetryableError.new(message, e)
           else
@@ -18,12 +17,9 @@ module Fluent
           end
         end
 
-
-
-
-
-          retryable_error_reason?(reason) ||
-            (e.is_a?(Google::Apis::ServerError) && RETRYABLE_STATUS_CODE.include?(e.status_code))
+        # @param e [Google::Apis::Error]
+        def retryable_error?(e)
+          e.is_a?(Google::Apis::ServerError) && RETRYABLE_STATUS_CODE.include?(e.status_code)
         end
 
         def retryable_error_reason?(reason)
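Editor's note: the refactored `wrap` classifies errors purely by class and HTTP status on this path, dropping the old reason-string checks. A minimal sketch of the resulting behavior, assuming the gem and google-api-client are loadable (the `status_code:` keyword matches how the tests later in this diff construct these errors):

```ruby
require "google/apis/bigquery_v2"
require "fluent/plugin/bigquery/errors"

# 503 is in RETRYABLE_STATUS_CODE, so wrap returns a RetryableError.
server_error = Google::Apis::ServerError.new("backendError", status_code: 503)
Fluent::BigQuery::Error.wrap(server_error).retryable?  # => true

# Client errors are never retryable on this path.
client_error = Google::Apis::ClientError.new("invalid", status_code: 400)
Fluent::BigQuery::Error.wrap(client_error).retryable?  # => false
```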
data/lib/fluent/plugin/bigquery/schema.rb
CHANGED
@@ -86,6 +86,16 @@ module Fluent
         end
       end
 
+      class NumericFieldSchema < FieldSchema
+        def type
+          :numeric
+        end
+
+        def format_one(value)
+          value.to_s
+        end
+      end
+
       class BooleanFieldSchema < FieldSchema
         def type
           :boolean
@@ -169,6 +179,7 @@ module Fluent
         string: StringFieldSchema,
         integer: IntegerFieldSchema,
         float: FloatFieldSchema,
+        numeric: NumericFieldSchema,
         boolean: BooleanFieldSchema,
         timestamp: TimestampFieldSchema,
         date: DateFieldSchema,
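Editor's note: `NumericFieldSchema` serializes values with `to_s`, which is how the test changes near the end of this diff expect NUMERIC to appear in formatted rows (`"utilisation" => "0.837"`). A small sketch, assuming `FieldSchema`'s constructor takes the field name (not shown in this diff):

```ruby
require "fluent/plugin/bigquery/schema"

field = Fluent::BigQuery::NumericFieldSchema.new("utilisation")  # hypothetical construction
field.type               # => :numeric
field.format_one(0.837)  # => "0.837"; NUMERIC travels as a string in the JSON row payload
```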
data/lib/fluent/plugin/bigquery/writer.rb
CHANGED
@@ -34,12 +34,9 @@ module Fluent
           }
         }
 
-        if
-
-
-            expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil
-          }.select { |_, value| !value.nil? }
-        end
+        definition.merge!(time_partitioning: time_partitioning) if time_partitioning
+        definition.merge!(require_partition_filter: require_partition_filter) if require_partition_filter
+        definition.merge!(clustering: clustering) if clustering
         client.insert_table(project, dataset, definition, {})
         log.debug "create table", project_id: project, dataset: dataset, table: table_id
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
@@ -50,10 +47,9 @@ module Fluent
           return
         end
 
-
-        log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message, reason: reason
+        log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
 
-        if
+        if create_table_retry_count < create_table_retry_limit
           sleep create_table_retry_wait
           create_table_retry_wait *= 2
           create_table_retry_count += 1
@@ -76,14 +72,19 @@ module Fluent
         nil
       end
 
-      def insert_rows(project, dataset, table_id, rows, template_suffix: nil)
+      def insert_rows(project, dataset, table_id, rows, schema, template_suffix: nil)
         body = {
           rows: rows,
           skip_invalid_rows: @options[:skip_invalid_rows],
           ignore_unknown_values: @options[:ignore_unknown_values],
         }
         body.merge!(template_suffix: template_suffix) if template_suffix
-
+
+        if @options[:auto_create_table]
+          res = insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+        else
+          res = client.insert_all_table_data(project, dataset, table_id, body, {})
+        end
         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
 
         if res.insert_errors && !res.insert_errors.empty?
@@ -100,8 +101,7 @@ module Fluent
           end
         end
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-
-        error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason }
+        error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message }
         wrapped = Fluent::BigQuery::Error.wrap(e)
         if wrapped.retryable?
           log.warn "tabledata.insertAll API", error_data
@@ -131,9 +131,6 @@ module Fluent
               dataset_id: dataset,
               table_id: table_id,
             },
-            schema: {
-              fields: fields.to_a,
-            },
             write_disposition: "WRITE_APPEND",
             source_format: source_format,
             ignore_unknown_values: @options[:ignore_unknown_values],
@@ -143,17 +140,19 @@ module Fluent
         }
 
         job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
-        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if @options[:time_partitioning_type]
         configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
-        # If target table is already exist, omit schema configuration.
-        # Because schema changing is easier.
         begin
-
-
+          # Check table existance
+          client.get_table(project, dataset, table_id)
+        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+          if e.status_code == 404 && /Not Found: Table/i =~ e.message
+            raise Fluent::BigQuery::UnRetryableError.new("Table is not found") unless @options[:auto_create_table]
+            raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
+            configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
+            configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
+            configuration[:configuration][:load].merge!(clustering: clustering) if clustering
+          end
         end
-        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError
-          raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
         end
 
         res = client.insert_job(
@@ -166,19 +165,7 @@ module Fluent
         )
         JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id)
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-
-        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
-
-        if @options[:auto_create_table] && e.status_code == 404 && /Not Found: Table/i =~ e.message
-          # Table Not Found: Auto Create Table
-          create_table(
-            project,
-            dataset,
-            table_id,
-            fields,
-          )
-          raise "table created. send rows next time."
-        end
+        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message
 
         if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
           return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id)
@@ -190,8 +177,9 @@ module Fluent
       def fetch_load_job(job_reference)
         project = job_reference.project_id
         job_id = job_reference.job_id
+        location = @options[:location]
 
-        res = client.get_job(project, job_id)
+        res = client.get_job(project, job_id, location: location)
         log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
 
         if res.status.state == "DONE"
@@ -227,9 +215,10 @@ module Fluent
           end
         end
 
+        # `stats` can be nil if we receive a warning like "Warning: Load job succeeded with data imported, however statistics may be lost due to internal error."
        stats = response.statistics.load
        duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0
-        log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats
+        log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats&.input_file_bytes, input_files: stats&.input_files, output_bytes: stats&.output_bytes, output_rows: stats&.output_rows, bad_records: stats&.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id
         @num_errors_per_chunk.delete(chunk_id_hex)
       end
 
@@ -315,6 +304,64 @@ module Fluent
           "NEWLINE_DELIMITED_JSON"
         end
       end
+
+      def time_partitioning
+        return @time_partitioning if instance_variable_defined?(:@time_partitioning)
+
+        if @options[:time_partitioning_type]
+          @time_partitioning = {
+            type: @options[:time_partitioning_type].to_s.upcase,
+            field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
+            expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
+          }.reject { |_, v| v.nil? }
+        else
+          @time_partitioning
+        end
+      end
+
+      def require_partition_filter
+        return @require_partition_filter if instance_variable_defined?(:@require_partition_filter)
+
+        if @options[:require_partition_filter]
+          @require_partition_filter = @options[:require_partition_filter]
+        else
+          @require_partition_filter
+        end
+      end
+
+      def clustering
+        return @clustering if instance_variable_defined?(:@clustering)
+
+        if @options[:clustering_fields]
+          @clustering = {
+            fields: @options[:clustering_fields]
+          }
+        else
+          @clustering
+        end
+      end
+
+      def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+        try_count ||= 1
+        res = client.insert_all_table_data(project, dataset, table_id, body, {})
+      rescue Google::Apis::ClientError => e
+        if e.status_code == 404 && /Not Found: Table/i =~ e.message
+          if try_count == 1
+            # Table Not Found: Auto Create Table
+            create_table(project, dataset, table_id, schema)
+          elsif try_count > 10
+            raise "A new table was created but it is not found."
+          end
+
+          # Retry to insert several times because the created table is not visible from Streaming insert for a little while
+          # cf. https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts
+          try_count += 1
+          sleep 5
+          log.debug "Retry to insert rows", project_id: project, dataset: dataset, table: table_id
+          retry
+        end
+        raise
+      end
     end
   end
 end
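Editor's note: to see what the new memoized helpers feed into `tables.insert`, here is a sketch of the hashes they build for an illustrative option set (plain Ruby mirroring the logic above; the option values are hypothetical):

```ruby
options = {
  time_partitioning_type: :day,
  time_partitioning_field: "time",
  time_partitioning_expiration: 3600,  # seconds; converted to milliseconds below
  clustering_fields: ["time", "vhost"],
}

time_partitioning = {
  type: options[:time_partitioning_type].to_s.upcase,
  field: options[:time_partitioning_field] && options[:time_partitioning_field].to_s,
  expiration_ms: options[:time_partitioning_expiration] && options[:time_partitioning_expiration] * 1000,
}.reject { |_, v| v.nil? }
# => {:type=>"DAY", :field=>"time", :expiration_ms=>3600000}

clustering = { fields: options[:clustering_fields] }
# => {:fields=>["time", "vhost"]}

# create_table merges these into the table definition; load jobs attach them
# only when the target table does not exist yet (see the get_table hunk above).
```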
data/lib/fluent/plugin/out_bigquery_base.rb
CHANGED
@@ -29,6 +29,9 @@ module Fluent
     config_param :private_key_path, :string, default: nil
     config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
     config_param :json_key, default: nil, secret: true
+    # The geographic location of the job. Required except for US and EU.
+    # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
+    config_param :location, :string, default: nil
 
     # see as simple reference
     # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
@@ -67,8 +70,12 @@ module Fluent
 
     ## Partitioning
     config_param :time_partitioning_type, :enum, list: [:day], default: nil
+    config_param :time_partitioning_field, :string, default: nil
     config_param :time_partitioning_expiration, :time, default: nil
 
+    ## Clustering
+    config_param :clustering_fields, :array, default: nil
+
     ## Formatter
     config_section :format do
       config_set_default :@type, 'json'
@@ -104,9 +111,6 @@ module Fluent
       if @schema
         @table_schema.load_schema(@schema)
       end
-      if @schema_path
-        @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
-      end
 
       formatter_config = conf.elements("format")[0]
       @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
@@ -119,6 +123,7 @@ module Fluent
       @tables_mutex = Mutex.new
       @fetched_schemas = {}
       @last_fetch_schema_time = Hash.new(0)
+      @read_schemas = {}
     end
 
     def multi_workers_ready?
@@ -130,6 +135,7 @@ module Fluent
         private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
         email: @email,
         json_key: @json_key,
+        location: @location,
         source_format: @source_format,
         skip_invalid_rows: @skip_invalid_rows,
         ignore_unknown_values: @ignore_unknown_values,
@@ -138,7 +144,10 @@ module Fluent
         prevent_duplicate_load: @prevent_duplicate_load,
         auto_create_table: @auto_create_table,
         time_partitioning_type: @time_partitioning_type,
+        time_partitioning_field: @time_partitioning_field,
         time_partitioning_expiration: @time_partitioning_expiration,
+        require_partition_filter: @require_partition_filter,
+        clustering_fields: @clustering_fields,
         timeout_sec: @request_timeout_sec,
         open_timeout_sec: @request_open_timeout_sec,
       })
@@ -151,6 +160,8 @@ module Fluent
       schema =
         if @fetch_schema
          fetch_schema(meta)
+        elsif @schema_path
+          read_schema(meta)
         else
           @table_schema
         end
@@ -182,7 +193,7 @@ module Fluent
         table_schema.load_schema(schema)
         @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
       else
-        if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].
+        if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].nil?
           raise "failed to fetch schema from bigquery"
         else
           log.warn "#{table_id} uses previous schema"
@@ -199,9 +210,26 @@ module Fluent
       extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
     end
 
+    def read_schema(metadata)
+      schema_path = read_schema_target_path(metadata)
+
+      unless @read_schemas[schema_path]
+        table_schema = Fluent::BigQuery::RecordSchema.new("record")
+        table_schema.load_schema(MultiJson.load(File.read(schema_path)))
+        @read_schemas[schema_path] = table_schema
+      end
+      @read_schemas[schema_path]
+    end
+
+    def read_schema_target_path(metadata)
+      extract_placeholders(@schema_path, metadata)
+    end
+
     def get_schema(project, dataset, metadata)
       if @fetch_schema
         @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+      elsif @schema_path
+        @read_schemas[read_schema_target_path(metadata)] || read_schema(metadata)
       else
         @table_schema
       end
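Editor's note: because `schema_path` now goes through `extract_placeholders`, one output section can serve many tables, each with its own schema file, and each parsed schema is cached per resolved path. A stand-alone sketch of that resolve-then-cache idea (the placeholder resolution here is a hypothetical simplification of Fluentd's `extract_placeholders`):

```ruby
require "multi_json"

read_schemas = {}  # resolved path => parsed schema, mirroring @read_schemas

# Hypothetical stand-in for extract_placeholders(@schema_path, metadata).
def resolve_schema_path(template, tag)
  template.gsub("${tag}", tag)
end

def read_schema(path, cache)
  # Parse each schema file once; later chunks for the same path reuse the cache.
  cache[path] ||= MultiJson.load(File.read(path))
end

path = resolve_schema_path("${tag}.schema", "foo")  # => "foo.schema"
# read_schema(path, read_schemas) parses foo.schema on first use only.
```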
data/lib/fluent/plugin/out_bigquery_insert.rb
CHANGED
@@ -29,6 +29,9 @@ module Fluent
     # If insert_id_field is not specified, true means to allow duplicate rows
     config_param :allow_retry_insert_errors, :bool, default: false
 
+    ## RequirePartitionFilter
+    config_param :require_partition_filter, :bool, default: false
+
     ## Buffer
     config_section :buffer do
       config_set_default :@type, "memory"
@@ -96,14 +99,8 @@ module Fluent
     end
 
     def insert(project, dataset, table_id, rows, schema, template_suffix)
-      writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
+      writer.insert_rows(project, dataset, table_id, rows, schema, template_suffix: template_suffix)
     rescue Fluent::BigQuery::Error => e
-      if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
-        # Table Not Found: Auto Create Table
-        writer.create_table(project, dataset, table_id, schema)
-        raise "table created. send rows next time."
-      end
-
       raise if e.retryable?
 
       if @secondary
data/test/plugin/test_out_bigquery_base.rb
CHANGED
@@ -147,33 +147,6 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
     assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
   end
 
-  def test_configure_auth_json_key_as_file_raise_permission_error
-    json_key_path = 'test/plugin/testdata/json_key.json'
-    json_key_path_dir = File.dirname(json_key_path)
-
-    begin
-      File.chmod(0000, json_key_path_dir)
-
-      driver = create_driver(%[
-        table foo
-        auth_method json_key
-        json_key #{json_key_path}
-        project yourproject_id
-        dataset yourdataset_id
-        schema [
-          {"name": "time", "type": "INTEGER"},
-          {"name": "status", "type": "INTEGER"},
-          {"name": "bytes", "type": "INTEGER"}
-        ]
-      ])
-      assert_raises(Errno::EACCES) do
-        driver.instance.writer.client
-      end
-    ensure
-      File.chmod(0755, json_key_path_dir)
-    end
-  end
-
   def test_configure_auth_json_key_as_string
     json_key = '{"private_key": "X", "client_email": "' + 'x' * 255 + '@developer.gserviceaccount.com"}'
     json_key_io = StringIO.new(json_key)
@@ -199,6 +172,8 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
   end
 
   def test_configure_auth_application_default
+    omit "This testcase depends on some environment variables." if ENV["CI"] == "true"
+
     driver = create_driver(%[
       table foo
       auth_method application_default
@@ -576,4 +551,24 @@ class BigQueryBaseOutputTest < Test::Unit::TestCase
     assert_equal :string, table_schema["argv"].type
     assert_equal :repeated, table_schema["argv"].mode
   end
+
+  def test_resolve_schema_path_with_placeholder
+    now = Time.now.to_i
+    driver = create_driver(<<-CONFIG)
+      table ${tag}_%Y%m%d
+      auth_method json_key
+      json_key jsonkey.josn
+      project yourproject_id
+      dataset yourdataset_id
+      schema_path ${tag}.schema
+
+      <buffer tag, time>
+        timekey 1d
+      </buffer>
+    CONFIG
+
+    metadata = Fluent::Plugin::Buffer::Metadata.new(now, "foo", {})
+
+    assert_equal "foo.schema", driver.instance.read_schema_target_path(metadata)
+  end
 end
data/test/plugin/test_out_bigquery_insert.rb
CHANGED
@@ -5,6 +5,8 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
     Fluent::Test.setup
   end
 
+  SCHEMA_PATH = File.join(File.dirname(__FILE__), "testdata", "apache.schema")
+
   CONFIG = %[
     table foo
     email foo@bar.example
@@ -121,7 +123,6 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
     driver = create_driver
 
     stub_writer do |writer|
-      mock.proxy(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [{json: hash_including(entry)}], template_suffix: nil)
       mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', {
         rows: [{json: hash_including(entry)}],
         skip_invalid_rows: false,
@@ -261,7 +262,7 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
 
     driver.instance_start
     tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
@@ -345,11 +346,27 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
     CONFIG
 
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
     stub_writer do |writer|
-
-
-
-
+      body = {
+        rows: [{json: Fluent::BigQuery::Helper.deep_symbolize_keys(message)}],
+        skip_invalid_rows: false,
+        ignore_unknown_values: false,
+      }
+      mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end.at_least(1)
+      mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
+
+      mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
+        table_reference: {
+          table_id: 'foo',
+        },
+        schema: {
+          fields: schema_fields,
+        },
+      }, {})
     end
 
     assert_raise(RuntimeError) do
@@ -401,14 +418,131 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
 
       time_partitioning_type day
+      time_partitioning_field time
      time_partitioning_expiration 1h
+
+      require_partition_filter true
     CONFIG
 
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
     stub_writer do |writer|
-
-
+      body = {
+        rows: [message],
+        skip_invalid_rows: false,
+        ignore_unknown_values: false,
+      }
+      mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end.at_least(1)
+      mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
+
+      mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
+        table_reference: {
+          table_id: 'foo',
+        },
+        schema: {
+          fields: schema_fields,
+        },
+        time_partitioning: {
+          type: 'DAY',
+          field: 'time',
+          expiration_ms: 3600000,
+        },
+        require_partition_filter: true,
+      }, {})
+    end
+
+    assert_raise(RuntimeError) do
+      driver.run do
+        driver.feed("tag", Fluent::EventTime.now, message[:json])
       end
-
+    end
+  end
+
+  def test_auto_create_clustered_table_by_bigquery_api
+    now = Time.now
+    message = {
+      json: {
+        time: now.to_i,
+        request: {
+          vhost: "bar",
+          path: "/path/to/baz",
+          method: "GET",
+          protocol: "HTTP/1.0",
+          agent: "libwww",
+          referer: "http://referer.example",
+          time: (now - 1).to_f,
+          bot_access: true,
+          loginsession: false,
+        },
+        remote: {
+          host: "remote.example",
+          ip: "192.168.1.1",
+          user: "nagachika",
+        },
+        response: {
+          status: 200,
+          bytes: 72,
+        },
+      }
+    }
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      time_format %s
+      time_field time
+
+      auto_create_table true
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+
+      time_partitioning_type day
+      time_partitioning_field time
+      time_partitioning_expiration 1h
+
+      clustering_fields [
+        "time",
+        "vhost"
+      ]
+    CONFIG
+
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
+    stub_writer do |writer|
+      body = {
+        rows: [message],
+        skip_invalid_rows: false,
+        ignore_unknown_values: false,
+      }
+      mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end.at_least(1)
+      mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
+
+      mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
+        table_reference: {
+          table_id: 'foo',
+        },
+        schema: {
+          fields: schema_fields,
+        },
+        time_partitioning: {
+          type: 'DAY',
+          field: 'time',
+          expiration_ms: 3600000,
+        },
+        clustering: {
+          fields: [
+            'time',
+            'vhost',
+          ],
+        },
+      }, {})
     end
 
     assert_raise(RuntimeError) do
data/test/plugin/test_out_bigquery_load.rb
CHANGED
@@ -39,10 +39,8 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       writer
     end
   end
-
-  def test_write
-    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
 
+  def test_write
     response_stub = stub!
 
     driver = create_driver
@@ -60,9 +58,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
           dataset_id: 'yourdataset_id',
           table_id: 'foo',
         },
-        schema: {
-          fields: schema_fields,
-        },
         write_disposition: "WRITE_APPEND",
         source_format: "NEWLINE_DELIMITED_JSON",
         ignore_unknown_values: false,
@@ -99,7 +94,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       schema_path #{SCHEMA_PATH}
       prevent_duplicate_load true
     CONFIG
-    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
 
     response_stub = stub!
     stub_writer do |writer|
@@ -116,9 +110,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
           dataset_id: 'yourdataset_id',
           table_id: 'foo',
         },
-        schema: {
-          fields: schema_fields,
-        },
         write_disposition: "WRITE_APPEND",
         source_format: "NEWLINE_DELIMITED_JSON",
         ignore_unknown_values: false,
@@ -138,11 +129,10 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
 
   def test_write_with_retryable_error
     driver = create_driver
-    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
 
     driver.instance_start
     tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
@@ -158,9 +148,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
           dataset_id: 'yourdataset_id',
           table_id: 'foo',
         },
-        schema: {
-          fields: schema_fields,
-        },
         write_disposition: "WRITE_APPEND",
         source_format: "NEWLINE_DELIMITED_JSON",
         ignore_unknown_values: false,
@@ -171,7 +158,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       stub!.job_reference.stub!.job_id { "dummy_job_id" }
     end
 
-    mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
+    mock(writer.client).get_job('yourproject_id', 'dummy_job_id', :location=>nil) do
       stub! do |s|
         s.id { 'dummy_job_id' }
         s.configuration.stub! do |_s|
@@ -225,11 +212,10 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
         utc
       </secondary>
     CONFIG
-    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
 
     driver.instance_start
     tag, time, record = "tag", Time.now.to_i, {"a" => "b"}
-    metadata =
+    metadata = Fluent::Plugin::Buffer::Metadata.new(tag, time, record)
     chunk = driver.instance.buffer.generate_chunk(metadata).tap do |c|
       c.append([driver.instance.format(tag, time, record)])
     end
@@ -245,9 +231,6 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
           dataset_id: 'yourdataset_id',
          table_id: 'foo',
         },
-        schema: {
-          fields: schema_fields,
-        },
         write_disposition: "WRITE_APPEND",
         source_format: "NEWLINE_DELIMITED_JSON",
         ignore_unknown_values: false,
@@ -258,7 +241,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
       stub!.job_reference.stub!.job_id { "dummy_job_id" }
     end
 
-    mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
+    mock(writer.client).get_job('yourproject_id', 'dummy_job_id', :location=>nil) do
      stub! do |s|
        s.id { 'dummy_job_id' }
        s.configuration.stub! do |_s|
@@ -289,6 +272,61 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
     driver.instance_shutdown
   end
 
+  def test_write_with_auto_create_table
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      <buffer>
+        @type memory
+      </buffer>
+
+      <inject>
+        time_format %s
+        time_key time
+      </inject>
+
+      auto_create_table true
+      schema_path #{SCHEMA_PATH}
+    CONFIG
+
+    schema_fields = Fluent::BigQuery::Helper.deep_symbolize_keys(MultiJson.load(File.read(SCHEMA_PATH)))
+
+    stub_writer do |writer|
+      mock(writer.client).get_table('yourproject_id', 'yourdataset_id', 'foo') do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end
+
+      mock(writer.client).insert_job('yourproject_id', {
+        configuration: {
+          load: {
+            destination_table: {
+              project_id: 'yourproject_id',
+              dataset_id: 'yourdataset_id',
+              table_id: 'foo',
+            },
+            write_disposition: "WRITE_APPEND",
+            source_format: "NEWLINE_DELIMITED_JSON",
+            ignore_unknown_values: false,
+            max_bad_records: 0,
+            schema: {
+              fields: schema_fields,
+            },
+          }
+        }
+      }, {upload_source: duck_type(:write, :sync, :rewind), content_type: "application/octet-stream"}) do
+        stub!.job_reference.stub!.job_id { "dummy_job_id" }
+      end
+    end
+
+    driver.run do
+      driver.feed("tag", Time.now.to_i, {"a" => "b"})
+    end
+  end
+
   private
 
   def create_response_stub(response)
data/test/plugin/test_record_schema.rb
CHANGED
@@ -27,6 +27,11 @@ class RecordSchemaTest < Test::Unit::TestCase
       "name" => "argv",
       "type" => "STRING",
       "mode" => "REPEATED"
+    },
+    {
+      "name" => "utilisation",
+      "type" => "NUMERIC",
+      "mode" => "NULLABLE"
     }
   ]
 end
@@ -58,6 +63,11 @@ class RecordSchemaTest < Test::Unit::TestCase
       "type" => "STRING",
       "mode" => "REPEATED"
     },
+    {
+      "name" => "utilisation",
+      "type" => "NUMERIC",
+      "mode" => "NULLABLE"
+    },
     {
       "name" => "new_column",
       "type" => "STRING",
@@ -93,6 +103,11 @@ class RecordSchemaTest < Test::Unit::TestCase
       "type" => "STRING",
       "mode" => "REPEATED"
     },
+    {
+      "name" => "utilisation",
+      "type" => "NUMERIC",
+      "mode" => "NULLABLE"
+    }
   ]
 end
 
@@ -142,12 +157,12 @@ class RecordSchemaTest < Test::Unit::TestCase
     time = Time.local(2016, 2, 7, 19, 0, 0).utc
 
     formatted = fields.format_one({
-      "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42]
+      "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42], "utilisation" => "0.837"
     })
     assert_equal(
       formatted,
       {
-        "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"]
+        "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"], "utilisation" => "0.837"
       }
     )
   end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 2.
+  version: 2.3.0
 platform: ruby
 authors:
 - Naoya Ito
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2022-02-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -139,8 +139,9 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".github/ISSUE_TEMPLATE.md"
+- ".github/workflows/linux.yml"
+- ".github/workflows/windows.yml"
 - ".gitignore"
-- ".travis.yml"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -179,12 +180,11 @@ required_ruby_version: !ruby/object:Gem::Requirement
     version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - "
+  - - ">="
   - !ruby/object:Gem::Version
-    version:
+    version: '0'
 requirements: []
-
-rubygems_version: 2.6.12
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Fluentd plugin to store data on Google BigQuery