fluent-plugin-bigquery 0.4.4 → 0.5.0.beta1
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/README.md +123 -127
- data/fluent-plugin-bigquery.gemspec +1 -4
- data/lib/fluent/plugin/bigquery/schema.rb +2 -4
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +2 -8
- data/lib/fluent/plugin/out_bigquery.rb +431 -440
- data/test/helper.rb +5 -1
- data/test/plugin/test_out_bigquery.rb +479 -708
- data/test/plugin/test_record_schema.rb +8 -24
- data/test/run_test.rb +9 -0
- metadata +8 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 440ecd282c8e8e724c5a1e0baa43aa88c0051f26
+  data.tar.gz: 4e9767f4cfb54a6091dc25553dbb0190387b0d35
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f57040773228cd0c73610251a6a67497750f5df9d232c9d61a1177056d709671b3e1e0970f6e6557422bc6885d16c10024bd3b79954c57a3e75846d419974d6a
+  data.tar.gz: 1507120a4806737440d8666b57ea220df61076415adea45516d5feab651d3207bdbbb236c86ef579555226cbcb3a480c63260b361732ecb31a7cb6320aff9e7a
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -28,59 +28,87 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 
 ### Options
 
-| name | type | required? | default
-| :------------------------------------- | :------------ | :----------- | :-------------------------
-| method | string | no | insert
-…
+| name | type | required? | placeholder? | default | description |
+| :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+| method | string | no | no | insert | `insert` (Streaming Insert) or `load` (load job) |
+| auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+| email | string | yes (private_key) | no | nil | GCP Service Account Email |
+| private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
+| private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
+| json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
+| project | string | yes | yes | nil | |
+| dataset | string | yes | yes | nil | |
+| table | string | yes (either `tables`) | yes | nil | |
+| tables | array(string) | yes (either `table`) | yes | nil | can set multiple table names split by `,` |
+| template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
+| auto_create_table | bool | no | no | false | If true, creates table automatically |
+| skip_invalid_rows | bool | no | no | false | Only `insert` method. |
+| max_bad_records | integer | no | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
+| ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+| schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted as JSON. |
+| schema_path | string | yes (either `fetch_schema`) | no | nil | Schema Definition file path. It is formatted as JSON. |
+| fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from the BigQuery table automatically. |
+| fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table. If fetch_schema is false, this param is ignored. |
+| schema_cache_expire | integer | no | no | 600 | Value is in seconds. If current time is after the expiration interval, re-fetch table schema definition. |
+| field_string | string | no | no | nil | see examples. |
+| field_integer | string | no | no | nil | see examples. |
+| field_float | string | no | no | nil | see examples. |
+| field_boolean | string | no | no | nil | see examples. |
+| field_timestamp | string | no | no | nil | see examples. |
+| replace_record_key | bool | no | no | false | see examples. |
+| replace_record_key_regexp{1-10} | string | no | no | nil | see examples. |
+| convert_hash_to_json | bool | no | no | false | If true, converts Hash value of record to JSON String. |
+| insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
+| allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
+| request_timeout_sec | integer | no | no | nil | BigQuery API response timeout |
+| request_open_timeout_sec | integer | no | no | 60 | BigQuery API connection and request timeout. If you send big data to BigQuery, set a large value. |
+| time_partitioning_type | enum | no (either day) | no | nil | Type of BigQuery time partitioning feature (experimental feature on BigQuery). |
+| time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for BigQuery time partitioning (experimental feature on BigQuery). |
+
+### Buffer section
+
+| name | type | required? | default | description |
+| :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
+| @type | string | no | memory (insert) or file (load) | |
+| chunk_limit_size | integer | no | 1MB (insert) or 1GB (load) | |
+| total_limit_size | integer | no | 1GB (insert) or 32GB (load) | |
+| chunk_records_limit | integer | no | 500 (insert) or nil (load) | |
+| flush_mode | enum | no | interval | default, lazy, interval, immediate |
+| flush_interval | float | no | 0.25 (insert) or nil (load) | |
+| flush_thread_interval | float | no | 0.05 (insert) or nil (load) | |
+| flush_thread_burst_interval | float | no | 0.05 (insert) or nil (load) | |
+
+And other params (defined by the base class) are available.
+
+See https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin/output.rb
+
+### Inject section
+
+It is a replacement for `time_field` and `time_format` of previous versions.
+
+For example:
+
+```
+<inject>
+  time_key time_field_name
+  time_type string
+  time_format %Y-%m-%d %H:%M:%S
+</inject>
+```
 
 | name | type | required? | default | description |
 | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
-…
+| hostname_key | string | no | nil | |
+| hostname | string | no | nil | |
+| tag_key | string | no | nil | |
+| time_key | string | no | nil | |
+| time_type | string | no | nil | |
+| time_format | string | no | nil | |
+| localtime | bool | no | true | |
+| utc | bool | no | false | |
+| timezone | string | no | nil | |
+
+See https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin_helper/inject.rb
 
 ## Examples
 
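The new `placeholder?` column above marks options that are expanded with fluentd v0.14 placeholders resolved from the buffer chunk keys. A minimal sketch, assuming a tag-keyed dataset and a time-keyed table (the names and values here are illustrative, not taken from the diff):

```apache
<match dummy>
  @type bigquery

  # dataset/table accept placeholders because their "placeholder?" column is "yes"
  project yourproject_id
  dataset ${tag}            # resolved from the "tag" chunk key
  table   accesslog_%Y%m    # resolved from the "time" chunk key

  <buffer tag, time>
    timekey 1d
  </buffer>
  ...
</match>
```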
@@ -103,9 +131,6 @@ Configure insert specifications with target table schema, with your credentials.
   dataset yourdataset_id
   table tablename
 
-  time_format %s
-  time_field time
-
   schema [
     {"name": "time", "type": "INTEGER"},
     {"name": "status", "type": "INTEGER"},
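The `time_format %s` / `time_field time` pair removed here is covered by the `<inject>` section described above; a rough equivalent, assuming the record should still carry a `time` field holding a UNIX timestamp, would be:

```apache
<inject>
  time_key  time       # field name injected into each record
  time_type unixtime   # integer UNIX timestamp, like the old "%s" format
</inject>
```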
@@ -135,14 +160,15 @@ For high rate inserts over streaming inserts, you should specify flush intervals
   @type bigquery
 
   method insert # default
-  …
+
+  <buffer>
+    flush_interval 0.1  # flush as frequent as possible
+
+    buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
+
+    flush_thread_count 16
+  </buffer>
+
   auth_method private_key   # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
@@ -183,23 +206,23 @@ Important options for high rate events are:
 * 2 or more tables are available with ',' separator
   * `out_bigquery` uses these tables for Table Sharding inserts
   * these must have same schema
-* `…`
+* `buffer/chunk_limit_size`
   * max size of an insert or chunk (default 1000000 or 1MB)
   * the max size is limited to 1MB on BigQuery
-* `…`
+* `buffer/chunk_records_limit`
   * number of records over streaming inserts API call is limited as 500, per insert or chunk
   * `out_bigquery` flushes buffer with 500 records for 1 inserts API call
-* `…`
+* `buffer/queue_length_limit`
   * BigQuery streaming inserts needs very small buffer chunks
   * for high-rate events, `buffer_queue_limit` should be configured with big number
   * Max 1GB memory may be used under network problem in default configuration
-* `…`
-* `…`
+    * `chunk_limit_size (default 1MB)` x `queue_length_limit (default 1024)`
+* `buffer/flush_thread_count`
   * threads for insert api calls in parallel
   * specify this option for 100 or more records per seconds
   * 10 or more threads seems good for inserts over internet
   * less threads may be good for Google Compute Engine instances (with low latency for BigQuery)
-* `flush_interval`
+* `buffer/flush_interval`
   * interval between data flushes (default 0.25)
   * you can set subsecond values such as `0.15` on Fluentd v0.10.42 or later
 
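Putting those options together with the v0.14 parameter names from the Buffer section table, a high-rate `<buffer>` section might look like the sketch below (values are illustrative, not recommendations taken from this diff):

```apache
<buffer>
  chunk_limit_size 1m       # BigQuery streaming inserts cap one request at 1MB
  chunk_records_limit 500   # at most 500 rows per insert API call
  total_limit_size 10g      # room for many queued chunks during BigQuery trouble
  flush_interval 0.1
  flush_thread_count 16     # parallel insert API calls
</buffer>
```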
@@ -212,19 +235,18 @@ section in the Google BigQuery document.
   @type bigquery
 
   method load
-  …
+
+  <buffer>
+    @type file
+    path bigquery.*.buffer
     flush_interval 1800
     flush_at_shutdown true
-  …
+    timekey_use_utc
+  </buffer>
 
   auth_method json_key
   json_key json_key_path.json
 
-  time_format %s
-  time_field time
-
   project yourproject_id
   dataset yourdataset_id
   auto_create_table true
@@ -235,8 +257,6 @@ section in the Google BigQuery document.
 
 I recommend to use file buffer and long flush interval.
 
-__CAUTION: `flush_interval` default is still `0.25` even if `method` is `load` on current version.__
-
 ### Authentication
 
 There are four methods supported to fetch access token for the service account.
@@ -304,8 +324,6 @@ Compute Engine instance, then you can configure fluentd like this.
   dataset yourdataset_id
   table tablename
 
-  time_format %s
-  time_field time
   ...
 </match>
 ```
@@ -325,11 +343,15 @@ In this authentication method, the credentials returned are determined by the en
 
 ### Table id formatting
 
+This plugin supports fluentd-0.14 style placeholders.
+
 #### strftime formatting
 `table` and `tables` options accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
 format to construct table ids.
 Table ids are formatted at runtime
-using the …
+using the chunk key time.
+
+See http://docs.fluentd.org/v0.14/articles/output-plugin-overview
 
 For example, with the configuration below,
 data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
@@ -344,6 +366,9 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
   dataset yourdataset_id
   table accesslog_%Y_%m
 
+  <buffer time>
+    timekey 1d
+  </buffer>
   ...
 </match>
 ```
@@ -351,12 +376,15 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
 #### record attribute formatting
 The format can be suffixed with attribute name.
 
-…
+__CAUTION: the format is different from previous versions__
 
 ```apache
 <match dummy>
   ...
-  table accesslog_…
+  table accesslog_${status_code}
+
+  <buffer status_code>
+  </buffer>
   ...
 </match>
 ```
@@ -365,50 +393,27 @@ If attribute name is given, the time to be used for formatting is value of each
 The value for the time should be a UNIX time.
 
 #### time_slice_key formatting
-Or, the options can use `%{time_slice}` placeholder.
-`%{time_slice}` is replaced by formatted time slice key at runtime.
-
-```apache
-<match dummy>
-  @type bigquery
-
-  table accesslog%{time_slice}
-  ...
-</match>
-```
-
-#### record attribute value formatting
-Or, `${attr_name}` placeholder is available to use value of attribute as part of table id.
-`${attr_name}` is replaced by string value of the attribute specified by `attr_name`.
-
-__NOTE: This feature is available only if `method` is `insert`.__
-
-```apache
-<match dummy>
-  ...
-  table accesslog_%Y_%m_${subdomain}
-  ...
-</match>
-```
+Instead, use strftime formatting.
 
-- any type of attribute is allowed because stringified value will be used as replacement.
-- acceptable characters are alphabets, digits and `_`. All other characters will be removed.
+strftime formatting in the current version is based on the chunk key.
+That is the same as the previous time_slice_key formatting.
 
 ### Date partitioned table support
 this plugin can insert (load) into date partitioned table.
 
-Use
+Use a placeholder.
 
 ```apache
 <match dummy>
   @type bigquery
 
   ...
+  table accesslog$%Y%m%d
+
+  <buffer time>
+    timekey 1d
+  </buffer>
   ...
 </match>
 ```
@@ -452,9 +457,6 @@ you can also specify nested fields by prefixing their belonging record fields.
 
   ...
 
-  time_format %s
-  time_field time
-
   schema [
     {"name": "time", "type": "INTEGER"},
     {"name": "status", "type": "INTEGER"},
@@ -505,10 +507,7 @@ The second method is to specify a path to a BigQuery schema file instead of list
   @type bigquery
 
   ...
 
-  time_format %s
-  time_field time
-
   schema_path /path/to/httpd.schema
 </match>
 ```
@@ -521,10 +520,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
   @type bigquery
 
   ...
 
-  time_format %s
-  time_field time
-
   fetch_schema true
   # fetch_schema_table other_table # if you want to fetch schema from other table
 </match>
data/fluent-plugin-bigquery.gemspec
CHANGED
@@ -27,8 +27,5 @@ Gem::Specification.new do |spec|
   spec.add_runtime_dependency "googleauth", ">= 0.5.0"
   spec.add_runtime_dependency "multi_json"
   spec.add_runtime_dependency "activesupport", ">= 3.2", "< 6"
-  spec.add_runtime_dependency "fluentd", "~> 0.…
-  spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
-  spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
-  spec.add_runtime_dependency "fluent-plugin-buffer-lightening", ">= 0.0.2"
+  spec.add_runtime_dependency "fluentd", "~> 0.14.0"
 end
data/lib/fluent/plugin/bigquery/schema.rb
CHANGED
@@ -209,7 +209,7 @@ module Fluent
       }
     end
 
-    def load_schema(schema…
+    def load_schema(schema)
       schema.each do |field|
         raise ConfigError, 'field must have type' unless field.key?('type')
 
@@ -220,13 +220,11 @@ module Fluent
         field_schema_class = FIELD_TYPES[type]
         raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
 
-        next if @fields.key?(name) and !allow_overwrite
-
         field_schema = field_schema_class.new(name, mode)
         @fields[name] = field_schema
         if type == :record
           raise ConfigError, "record field must have fields" unless field.key?('fields')
-          field_schema.load_schema(field['fields']…
+          field_schema.load_schema(field['fields'])
         end
       end
     end