fluent-plugin-bigquery 0.4.4 → 0.5.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/README.md +123 -127
- data/fluent-plugin-bigquery.gemspec +1 -4
- data/lib/fluent/plugin/bigquery/schema.rb +2 -4
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +2 -8
- data/lib/fluent/plugin/out_bigquery.rb +431 -440
- data/test/helper.rb +5 -1
- data/test/plugin/test_out_bigquery.rb +479 -708
- data/test/plugin/test_record_schema.rb +8 -24
- data/test/run_test.rb +9 -0
- metadata +8 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 440ecd282c8e8e724c5a1e0baa43aa88c0051f26
+  data.tar.gz: 4e9767f4cfb54a6091dc25553dbb0190387b0d35
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f57040773228cd0c73610251a6a67497750f5df9d232c9d61a1177056d709671b3e1e0970f6e6557422bc6885d16c10024bd3b79954c57a3e75846d419974d6a
+  data.tar.gz: 1507120a4806737440d8666b57ea220df61076415adea45516d5feab651d3207bdbbb236c86ef579555226cbcb3a480c63260b361732ecb31a7cb6320aff9e7a
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -28,59 +28,87 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 
 ### Options
 
-| name | type | required? | default
-| :------------------------------------- | :------------ | :----------- | :-------------------------
-| method | string | no | insert
+| name | type | required? | placeholder? | default | description |
+| :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+| method | string | no | no | insert | `insert` (Streaming Insert) or `load` (load job) |
+| auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+| email | string | yes (private_key) | no | nil | GCP Service Account Email |
+| private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
+| private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
+| json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
+| project | string | yes | yes | nil | |
+| dataset | string | yes | yes | nil | |
+| table | string | yes (either `tables`) | yes | nil | |
+| tables | array(string) | yes (either `table`) | yes | nil | can set multi table names splitted by `,` |
+| template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
+| auto_create_table | bool | no | no | false | If true, creates table automatically |
+| skip_invalid_rows | bool | no | no | false | Only `insert` method. |
+| max_bad_records | integer | no | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
+| ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+| schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. |
+| schema_path | string | yes (either `fetch_schema`) | no | nil | Schema Definition file path. It is formatted by JSON. |
+| fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. |
+| fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored |
+| schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
+| field_string | string | no | no | nil | see examples. |
+| field_integer | string | no | no | nil | see examples. |
+| field_float | string | no | no | nil | see examples. |
+| field_boolean | string | no | no | nil | see examples. |
+| field_timestamp | string | no | no | nil | see examples. |
+| replace_record_key | bool | no | no | false | see examples. |
+| replace_record_key_regexp{1-10} | string | no | no | nil | see examples. |
+| convert_hash_to_json | bool | no | no | false | If true, converts Hash value of record to JSON String. |
+| insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
+| allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
+| request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
+| request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
+| time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature (experimental feature on BigQuery). |
+| time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
+
+### Buffer section
+
+| name | type | required? | default | description |
+| :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
+| @type | string | no | memory (insert) or file (load) | |
+| chunk_limit_size | integer | no | 1MB (insert) or 1GB (load) | |
+| total_limit_size | integer | no | 1GB (insert) or 32GB (load) | |
+| chunk_records_limit | integer | no | 500 (insert) or nil (load) | |
+| flush_mode | enum | no | interval | default, lazy, interval, immediate |
+| flush_interval | float | no | 0.25 (insert) or nil (load) | |
+| flush_thread_interval | float | no | 0.05 (insert) or nil (load) | |
+| flush_thread_burst_interval | float | no | 0.05 (insert) or nil (load) | |
+
+And, other params (defined by base class) are available
+
+see. https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin/output.rb
+
+### Inject section
+
+It is replacement of previous version `time_field` and `time_format`.
+
+For example.
+
+```
+<inject>
+  time_key time_field_name
+  time_type string
+  time_format %Y-%m-%d %H:%M:%S
+</inject>
+```
 
 | name | type | required? | default | description |
 | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
+| hostname_key | string | no | nil | |
+| hostname | string | no | nil | |
+| tag_key | string | no | nil | |
+| time_key | string | no | nil | |
+| time_type | string | no | nil | |
+| time_format | string | no | nil | |
+| localtime | bool | no | true | |
+| utc | bool | no | false | |
+| timezone | string | no | nil | |
+
+see. https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin_helper/inject.rb
 
 ## Examples
 
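Taken together, the new Buffer and Inject sections replace the old top-level `buffer_*` and `time_field`/`time_format` parameters. The following is only a minimal sketch of how the pieces shown above might fit into one v0.14-style configuration; the project, dataset, table, and key path are placeholders, and the `<inject>` block simply reproduces the old `time_field time` / `time_format %s` behaviour:

```apache
<match dummy>
  @type bigquery

  method insert
  auth_method json_key
  json_key /path/to/key.json   # placeholder path

  project yourproject_id
  dataset yourdataset_id
  table tablename

  schema [
    {"name": "time", "type": "INTEGER"},
    {"name": "status", "type": "INTEGER"}
  ]

  # buffer section replaces the old top-level buffer_* options
  <buffer>
    flush_interval 0.25
    chunk_records_limit 500
  </buffer>

  # inject section replaces time_field/time_format
  <inject>
    time_key time
    time_type unixtime
  </inject>
</match>
```
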
@@ -103,9 +131,6 @@ Configure insert specifications with target table schema, with your credentials.
   dataset yourdataset_id
   table tablename
 
-  time_format %s
-  time_field time
-
   schema [
     {"name": "time", "type": "INTEGER"},
     {"name": "status", "type": "INTEGER"},
@@ -135,14 +160,15 @@ For high rate inserts over streaming inserts, you should specify flush intervals
   @type bigquery
 
   method insert # default
+
+  <buffer>
+    flush_interval 0.1  # flush as frequent as possible
+
+    buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
+
+    flush_thread_count 16
+  </buffer>
+
   auth_method private_key   # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
@@ -152,9 +178,6 @@ For high rate inserts over streaming inserts, you should specify flush intervals
   dataset yourdataset_id
   tables accesslog1,accesslog2,accesslog3
 
-  time_format %s
-  time_field time
-
   schema [
     {"name": "time", "type": "INTEGER"},
     {"name": "status", "type": "INTEGER"},
@@ -183,23 +206,23 @@ Important options for high rate events are:
   * 2 or more tables are available with ',' separator
   * `out_bigquery` uses these tables for Table Sharding inserts
   * these must have same schema
-* `
+* `buffer/chunk_limit_size`
   * max size of an insert or chunk (default 1000000 or 1MB)
   * the max size is limited to 1MB on BigQuery
-* `
+* `buffer/chunk_records_limit`
   * number of records over streaming inserts API call is limited as 500, per insert or chunk
   * `out_bigquery` flushes buffer with 500 records for 1 inserts API call
-* `
+* `buffer/queue_length_limit`
   * BigQuery streaming inserts needs very small buffer chunks
   * for high-rate events, `buffer_queue_limit` should be configured with big number
   * Max 1GB memory may be used under network problem in default configuration
-* `
-* `
+    * `chunk_limit_size (default 1MB)` x `queue_length_limit (default 1024)`
+* `buffer/flush_thread_count`
   * threads for insert api calls in parallel
   * specify this option for 100 or more records per seconds
   * 10 or more threads seems good for inserts over internet
   * less threads may be good for Google Compute Engine instances (with low latency for BigQuery)
-* `flush_interval`
+* `buffer/flush_interval`
   * interval between data flushes (default 0.25)
   * you can set subsecond values such as `0.15` on Fluentd v0.10.42 or later
 
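Translating the bullet points above into a buffer section, using the parameter names as quoted in this README; the numbers are only illustrative, not values taken from the diff:

```apache
<buffer>
  # streaming insert limits: 1MB per request, 500 records per request
  chunk_limit_size 1m
  chunk_records_limit 500

  # memory bound is roughly chunk_limit_size x queue_length_limit
  queue_length_limit 1024

  # more flush threads for high-rate inserts over the internet
  flush_thread_count 16
  flush_interval 0.25
</buffer>
```
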
@@ -212,19 +235,18 @@ section in the Google BigQuery document.
   @type bigquery
 
   method load
+
+  <buffer>
+    @type file
+    path bigquery.*.buffer
     flush_interval 1800
     flush_at_shutdown true
+    timekey_use_utc
+  </buffer>
 
   auth_method json_key
   json_key json_key_path.json
 
-  time_format %s
-  time_field time
-
   project yourproject_id
   dataset yourdataset_id
   auto_create_table true
@@ -235,8 +257,6 @@ section in the Google BigQuery document.
 
 I recommend to use file buffer and long flush interval.
 
-__CAUTION: `flush_interval` default is still `0.25` even if `method` is `load` on current version.__
-
 ### Authentication
 
 There are four methods supported to fetch access token for the service account.
@@ -304,8 +324,6 @@ Compute Engine instance, then you can configure fluentd like this.
   dataset yourdataset_id
   table tablename
 
-  time_format %s
-  time_field time
   ...
 </match>
 ```
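For reference, a compute_engine configuration after this change would drop `time_format`/`time_field` entirely; a minimal sketch with placeholder ids, where the `<inject>` block is only needed if the record should still carry a time column:

```apache
<match dummy>
  @type bigquery

  auth_method compute_engine   # no key material needed on GCE

  project yourproject_id
  dataset yourdataset_id
  table tablename

  <inject>
    time_key time
    time_type unixtime
  </inject>
  ...
</match>
```
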
@@ -325,11 +343,15 @@ In this authentication method, the credentials returned are determined by the environment.
 
 ### Table id formatting
 
+this plugin supports fluentd-0.14 style placeholder.
+
 #### strftime formatting
 `table` and `tables` options accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
 format to construct table ids.
 Table ids are formatted at runtime
-using the
+using the chunk key time.
+
+see. http://docs.fluentd.org/v0.14/articles/output-plugin-overview
 
 For example, with the configuration below,
 data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
@@ -344,6 +366,9 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
   dataset yourdataset_id
   table accesslog_%Y_%m
 
+  <buffer time>
+    timekey 1d
+  </buffer>
   ...
 </match>
 ```
@@ -351,12 +376,15 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
 #### record attribute formatting
 The format can be suffixed with attribute name.
 
+__CAUTION: format is different with previous version__
 
 ```apache
 <match dummy>
   ...
-  table accesslog_
+  table accesslog_${status_code}
+
+  <buffer status_code>
+  </buffer>
   ...
 </match>
 ```
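If a record value and the time are both wanted in the table id, the 0.14 placeholder style would presumably need both chunk keys; a hypothetical sketch, not shown in this diff:

```apache
<match dummy>
  ...
  # combine a record-attribute placeholder with strftime formatting
  table accesslog_${status_code}_%Y%m

  # chunk by time and by status_code so both placeholders resolve per chunk
  <buffer time,status_code>
    timekey 1d
  </buffer>
  ...
</match>
```
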
@@ -365,50 +393,27 @@ If attribute name is given, the time to be used for formatting is value of each record.
 The value for the time should be a UNIX time.
 
 #### time_slice_key formatting
-Or, the options can use `%{time_slice}` placeholder.
-`%{time_slice}` is replaced by formatted time slice key at runtime.
-
-```apache
-<match dummy>
-  @type bigquery
 
-  table accesslog%{time_slice}
-  ...
-</match>
-```
-
-#### record attribute value formatting
-Or, `${attr_name}` placeholder is available to use value of attribute as part of table id.
-`${attr_name}` is replaced by string value of the attribute specified by `attr_name`.
-
-__NOTE: This feature is available only if `method` is `insert`.__
-
-```apache
-<match dummy>
-  ...
-  table accesslog_%Y_%m_${subdomain}
-  ...
-</match>
-```
+Instead, Use strftime formatting.
 
-  - any type of attribute is allowed because stringified value will be used as replacement.
-  - acceptable characters are alphabets, digits and `_`. All other characters will be removed.
+strftime formatting of current version is based on chunk key.
+That is same with previous time_slice_key formatting.
 
 ### Date partitioned table support
 this plugin can insert (load) into date partitioned table.
 
-Use
+Use placeholder.
 
 ```apache
 <match dummy>
   @type bigquery
 
   ...
+  table accesslog$%Y%m%d
+
+  <buffer time>
+    timekey 1d
+  </buffer>
   ...
 </match>
 ```
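Putting the removed and added guidance side by side: a 0.4.x `%{time_slice}` table id would now be written with strftime resolved from the `time` chunk key. A sketch under that assumption:

```apache
# 0.4.x (removed): table accesslog%{time_slice}
# 0.5.0: strftime formatting resolved from the time chunk key
<match dummy>
  @type bigquery
  ...
  table accesslog_%Y%m%d

  <buffer time>
    timekey 1d
  </buffer>
  ...
</match>
```
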
@@ -452,9 +457,6 @@ you can also specify nested fields by prefixing their belonging record fields.
 
   ...
 
-  time_format %s
-  time_field time
-
   schema [
     {"name": "time", "type": "INTEGER"},
     {"name": "status", "type": "INTEGER"},
@@ -505,10 +507,7 @@ The second method is to specify a path to a BigQuery schema file instead of listing fields.
   @type bigquery
 
   ...
-
-  time_format %s
-  time_field time
-
+
   schema_path /path/to/httpd.schema
 </match>
 ```
@@ -521,10 +520,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema using BigQuery API.
   @type bigquery
 
   ...
-
-  time_format %s
-  time_field time
-
+
   fetch_schema true
   # fetch_schema_table other_table # if you want to fetch schema from other table
 </match>
data/fluent-plugin-bigquery.gemspec
CHANGED
@@ -27,8 +27,5 @@ Gem::Specification.new do |spec|
   spec.add_runtime_dependency "googleauth", ">= 0.5.0"
   spec.add_runtime_dependency "multi_json"
   spec.add_runtime_dependency "activesupport", ">= 3.2", "< 6"
-  spec.add_runtime_dependency "fluentd", "~> 0.
-  spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
-  spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
-  spec.add_runtime_dependency "fluent-plugin-buffer-lightening", ">= 0.0.2"
+  spec.add_runtime_dependency "fluentd", "~> 0.14.0"
 end
data/lib/fluent/plugin/bigquery/schema.rb
CHANGED
@@ -209,7 +209,7 @@ module Fluent
       }
     end
 
-    def load_schema(schema
+    def load_schema(schema)
       schema.each do |field|
         raise ConfigError, 'field must have type' unless field.key?('type')
 
@@ -220,13 +220,11 @@ module Fluent
         field_schema_class = FIELD_TYPES[type]
         raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
 
-        next if @fields.key?(name) and !allow_overwrite
-
         field_schema = field_schema_class.new(name, mode)
         @fields[name] = field_schema
         if type == :record
           raise ConfigError, "record field must have fields" unless field.key?('fields')
-          field_schema.load_schema(field['fields']
+          field_schema.load_schema(field['fields'])
         end
       end
     end