fluent-plugin-bigquery 0.4.4 → 0.5.0.beta1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: fbc5b978030f084d076e9123adcbf28fda708023
- data.tar.gz: 7c99e2fa8af4ba7f211f5890ab9247732217b421
+ metadata.gz: 440ecd282c8e8e724c5a1e0baa43aa88c0051f26
+ data.tar.gz: 4e9767f4cfb54a6091dc25553dbb0190387b0d35
  SHA512:
- metadata.gz: d6e90f8a800bf58514a6d919188cc900a2b1cfa1a6f6138fc1087280e9fd899a4766f6971cdbd53bb00064ff2371a62ea444af4ed833906771109a1003be9a9d
- data.tar.gz: 34ccca10960087ab60d0221e84223b8c725c1a7ba7b216a4e1b830db640e9bf821534384d02c8fbdaac4b766d36096f3a52eabcdc744f4c2b08f3a9fb33a04eb
+ metadata.gz: f57040773228cd0c73610251a6a67497750f5df9d232c9d61a1177056d709671b3e1e0970f6e6557422bc6885d16c10024bd3b79954c57a3e75846d419974d6a
+ data.tar.gz: 1507120a4806737440d8666b57ea220df61076415adea45516d5feab651d3207bdbbb236c86ef579555226cbcb3a480c63260b361732ecb31a7cb6320aff9e7a
@@ -1,7 +1,6 @@
  language: ruby

  rvm:
- - 2.0
  - 2.1
  - 2.2
  - 2.3.3
data/README.md CHANGED
@@ -28,59 +28,87 @@ Because embbeded gem dependency sometimes restricts ruby environment.

  ### Options

- | name | type | required? | default | description |
- | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
- | method | string | no | insert | `insert` (Streaming Insert) or `load` (load job) |
- | buffer_type | string | no | lightening (insert) or file (load) | |
- | buffer_chunk_limit | integer | no | 1MB (insert) or 1GB (load) | |
- | buffer_queue_limit | integer | no | 1024 (insert) or 32 (load) | |
- | buffer_chunk_records_limit | integer | no | 500 | |
- | flush_interval | float | no | 0.25 (*insert) or default of time sliced output (load) | |
- | try_flush_interval | float | no | 0.05 (*insert) or default of time sliced output (load) | |
- | auth_method | enum | yes | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
- | email | string | yes (private_key) | nil | GCP Service Account Email |
- | private_key_path | string | yes (private_key) | nil | GCP Private Key file path |
- | private_key_passphrase | string | yes (private_key) | nil | GCP Private Key Passphrase |
- | json_key | string | yes (json_key) | nil | GCP JSON Key file path or JSON Key string |
- | project | string | yes | nil | |
- | table | string | yes (either `tables`) | nil | |
- | tables | string | yes (either `table`) | nil | can set multi table names splitted by `,` |
- | template_suffix | string | no | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
- | auto_create_table | bool | no | false | If true, creates table automatically |
- | skip_invalid_rows | bool | no | false | Only `insert` method. |
- | max_bad_records | integer | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
- | ignore_unknown_values | bool | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
- | schema | array | yes (either `fetch_schema` or `schema_path`) | nil | Schema Definition. It is formatted by JSON. |
- | schema_path | string | yes (either `fetch_schema`) | nil | Schema Definition file path. It is formatted by JSON. |
- | fetch_schema | bool | yes (either `schema_path`) | false | If true, fetch table schema definition from Bigquery table automatically. |
- | fetch_schema_table | string | no | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored |
- | schema_cache_expire | integer | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
- | field_string (deprecated) | string | no | nil | see examples. |
- | field_integer (deprecated) | string | no | nil | see examples. |
- | field_float (deprecated) | string | no | nil | see examples. |
- | field_boolean (deprecated) | string | no | nil | see examples. |
- | field_timestamp (deprecated) | string | no | nil | see examples. |
- | time_field | string | no | nil | If this param is set, plugin set formatted time string to this field. |
- | time_format | string | no | nil | ex. `%s`, `%Y/%m%d %H:%M:%S` |
- | replace_record_key | bool | no | false | see examples. |
- | replace_record_key_regexp{1-10} | string | no | nil | see examples. |
- | convert_hash_to_json (deprecated) | bool | no | false | If true, converts Hash value of record to JSON String. |
- | insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
- | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
- | allow_retry_insert_errors | bool | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
- | request_timeout_sec | integer | no | nil | Bigquery API response timeout |
- | request_open_timeout_sec | integer | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
- | time_partitioning_type | enum | no (either day) | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
- | time_partitioning_expiration | time | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
-
- ### Standard Options
+ | name | type | required? | placeholder? | default | description |
+ | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
+ | method | string | no | no | insert | `insert` (Streaming Insert) or `load` (load job) |
+ | auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+ | email | string | yes (private_key) | no | nil | GCP Service Account Email |
+ | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
+ | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
+ | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
+ | project | string | yes | yes | nil | |
+ | dataset | string | yes | yes | nil | |
+ | table | string | yes (either `tables`) | yes | nil | |
+ | tables | array(string) | yes (either `table`) | yes | nil | can set multiple table names split by `,` |
+ | template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
+ | auto_create_table | bool | no | no | false | If true, creates table automatically |
+ | skip_invalid_rows | bool | no | no | false | Only `insert` method. |
+ | max_bad_records | integer | no | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
+ | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+ | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema definition. It is formatted as JSON. |
+ | schema_path | string | yes (either `fetch_schema`) | no | nil | Schema definition file path. It is formatted as JSON. |
+ | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from the BigQuery table automatically. |
+ | fetch_schema_table | string | no | yes | nil | If set, fetch table schema definition from this table. If fetch_schema is false, this param is ignored. |
+ | schema_cache_expire | integer | no | no | 600 | Value is in seconds. If current time is after expiration interval, re-fetch table schema definition. |
+ | field_string | string | no | no | nil | see examples. |
+ | field_integer | string | no | no | nil | see examples. |
+ | field_float | string | no | no | nil | see examples. |
+ | field_boolean | string | no | no | nil | see examples. |
+ | field_timestamp | string | no | no | nil | see examples. |
+ | replace_record_key | bool | no | no | false | see examples. |
+ | replace_record_key_regexp{1-10} | string | no | no | nil | see examples. |
+ | convert_hash_to_json | bool | no | no | false | If true, converts Hash value of record to JSON String. |
+ | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
+ | allow_retry_insert_errors | bool | no | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
+ | request_timeout_sec | integer | no | no | nil | BigQuery API response timeout |
+ | request_open_timeout_sec | integer | no | no | 60 | BigQuery API connection and request timeout. If you send big data to BigQuery, set a large value. |
+ | time_partitioning_type | enum | no (either day) | no | nil | Type of BigQuery time partitioning feature (experimental feature on BigQuery). |
+ | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for BigQuery time partitioning. (experimental feature on BigQuery) |
+
+ ### Buffer section
+
+ | name | type | required? | default | description |
+ | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
+ | @type | string | no | memory (insert) or file (load) | |
+ | chunk_limit_size | integer | no | 1MB (insert) or 1GB (load) | |
+ | total_limit_size | integer | no | 1GB (insert) or 32GB (load) | |
+ | chunk_records_limit | integer | no | 500 (insert) or nil (load) | |
+ | flush_mode | enum | no | interval | default, lazy, interval, immediate |
+ | flush_interval | float | no | 0.25 (insert) or nil (load) | |
+ | flush_thread_interval | float | no | 0.05 (insert) or nil (load) | |
+ | flush_thread_burst_interval | float | no | 0.05 (insert) or nil (load) | |
+
+ Other parameters (defined by the base output plugin class) are also available.
+
+ See https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin/output.rb
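
For illustration, here is a minimal sketch (not taken from the README) of how these buffer parameters might be combined for the `insert` method; the values are arbitrary examples rather than recommendations:

```apache
<match dummy>
  @type bigquery
  method insert

  # ... authentication and schema options go here ...

  <buffer>
    @type memory
    chunk_limit_size 1m        # streaming inserts are limited to 1MB per request
    chunk_records_limit 500    # and to 500 rows per insert call
    flush_interval 0.25
    flush_thread_count 4
  </buffer>
</match>
```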
+
+ ### Inject section
+
+ It is the replacement for the previous version's `time_field` and `time_format` options.
+
+ For example:
+
+ ```
+ <inject>
+ time_key time_field_name
+ time_type string
+ time_format %Y-%m-%d %H:%M:%S
+ </inject>
+ ```

  | name | type | required? | default | description |
  | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
- | localtime | bool | no | nil | Use localtime |
- | utc | bool | no | nil | Use utc |
-
- And see http://docs.fluentd.org/articles/output-plugin-overview#time-sliced-output-parameters
+ | hostname_key | string | no | nil | |
+ | hostname | string | no | nil | |
+ | tag_key | string | no | nil | |
+ | time_key | string | no | nil | |
+ | time_type | string | no | nil | |
+ | time_format | string | no | nil | |
+ | localtime | bool | no | true | |
+ | utc | bool | no | false | |
+ | timezone | string | no | nil | |
+
+ See https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin_helper/inject.rb
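
As a further sketch (assumed, not part of the README), the remaining inject parameters can add the Fluentd hostname and tag to each record alongside the time field:

```
<inject>
  hostname_key fluentd_host   # record key that receives the server hostname
  tag_key fluentd_tag         # record key that receives the Fluentd tag
  time_key time
  time_type unixtime
</inject>
```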

  ## Examples

@@ -103,9 +131,6 @@ Configure insert specifications with target table schema, with your credentials.
  dataset yourdataset_id
  table tablename

- time_format %s
- time_field time
-
  schema [
  {"name": "time", "type": "INTEGER"},
  {"name": "status", "type": "INTEGER"},
@@ -135,14 +160,15 @@ For high rate inserts over streaming inserts, you should specify flush intervals
  @type bigquery

  method insert # default
-
- flush_interval 1 # flush as frequent as possible
-
- buffer_chunk_records_limit 300 # default rate limit for users is 100
- buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
-
- num_threads 16
-
+
+ <buffer>
+ flush_interval 0.1 # flush as frequent as possible
+
+ buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
+
+ flush_thread_count 16
+ </buffer>
+
  auth_method private_key # default
  email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
  private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
@@ -152,9 +178,6 @@ For high rate inserts over streaming inserts, you should specify flush intervals
  dataset yourdataset_id
  tables accesslog1,accesslog2,accesslog3

- time_format %s
- time_field time
-
  schema [
  {"name": "time", "type": "INTEGER"},
  {"name": "status", "type": "INTEGER"},
@@ -183,23 +206,23 @@ Important options for high rate events are:
  * 2 or more tables are available with ',' separator
  * `out_bigquery` uses these tables for Table Sharding inserts
  * these must have same schema
- * `buffer_chunk_limit`
+ * `buffer/chunk_limit_size`
  * max size of an insert or chunk (default 1000000 or 1MB)
  * the max size is limited to 1MB on BigQuery
- * `buffer_chunk_records_limit`
+ * `buffer/chunk_records_limit`
  * number of records over streaming inserts API call is limited as 500, per insert or chunk
  * `out_bigquery` flushes buffer with 500 records for 1 inserts API call
- * `buffer_queue_limit`
+ * `buffer/queue_length_limit`
  * BigQuery streaming inserts needs very small buffer chunks
  * for high-rate events, `buffer_queue_limit` should be configured with big number
  * Max 1GB memory may be used under network problem in default configuration
- * `buffer_chunk_limit (default 1MB)` x `buffer_queue_limit (default 1024)`
- * `num_threads`
+ * `chunk_limit_size (default 1MB)` x `queue_length_limit (default 1024)`
+ * `buffer/flush_thread_count`
  * threads for insert api calls in parallel
  * specify this option for 100 or more records per seconds
  * 10 or more threads seems good for inserts over internet
  * less threads may be good for Google Compute Engine instances (with low latency for BigQuery)
- * `flush_interval`
+ * `buffer/flush_interval`
  * interval between data flushes (default 0.25)
  * you can set subsecond values such as `0.15` on Fluentd v0.10.42 or later

@@ -212,19 +235,18 @@ section in the Google BigQuery document.
  @type bigquery

  method load
- buffer_type file
- buffer_path bigquery.*.buffer
+
+ <buffer>
+ @type file
+ path bigquery.*.buffer
  flush_interval 1800
  flush_at_shutdown true
- try_flush_interval 1
- utc
+ timekey_use_utc
+ </buffer>

  auth_method json_key
  json_key json_key_path.json

- time_format %s
- time_field time
-
  project yourproject_id
  dataset yourdataset_id
  auto_create_table true
@@ -235,8 +257,6 @@ section in the Google BigQuery document.

  I recommend to use file buffer and long flush interval.

- __CAUTION: `flush_interval` default is still `0.25` even if `method` is `load` on current version.__
-
  ### Authentication

  There are four methods supported to fetch access token for the service account.
@@ -304,8 +324,6 @@ Compute Engine instance, then you can configure fluentd like this.
  dataset yourdataset_id
  table tablename

- time_format %s
- time_field time
  ...
  </match>
  ```
@@ -325,11 +343,15 @@ In this authentication method, the credentials returned are determined by the environment.

  ### Table id formatting

+ This plugin supports fluentd-0.14 style placeholders.
+
  #### strftime formatting
  `table` and `tables` options accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
  format to construct table ids.
  Table ids are formatted at runtime
- using the local time of the fluentd server.
+ using the chunk key time.
+
+ See http://docs.fluentd.org/v0.14/articles/output-plugin-overview

  For example, with the configuration below,
  data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
@@ -344,6 +366,9 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
  dataset yourdataset_id
  table accesslog_%Y_%m

+ <buffer time>
+ timekey 1d
+ </buffer>
  ...
  </match>
  ```
@@ -351,12 +376,15 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
  #### record attribute formatting
  The format can be suffixed with attribute name.

- __NOTE: This feature is available only if `method` is `insert`. Because it makes performance impact. Use `%{time_slice}` instead of it.__
+ __CAUTION: the format is different from the previous version__

  ```apache
  <match dummy>
  ...
- table accesslog_%Y_%m@timestamp
+ table accesslog_${status_code}
+
+ <buffer status_code>
+ </buffer>
  ...
  </match>
  ```
@@ -365,50 +393,27 @@ If attribute name is given, the time to be used for formatting is value of each
  The value for the time should be a UNIX time.

  #### time_slice_key formatting
- Or, the options can use `%{time_slice}` placeholder.
- `%{time_slice}` is replaced by formatted time slice key at runtime.
-
- ```apache
- <match dummy>
- @type bigquery

- ...
- table accesslog%{time_slice}
- ...
- </match>
- ```
-
- #### record attribute value formatting
- Or, `${attr_name}` placeholder is available to use value of attribute as part of table id.
- `${attr_name}` is replaced by string value of the attribute specified by `attr_name`.
-
- __NOTE: This feature is available only if `method` is `insert`.__
-
- ```apache
- <match dummy>
- ...
- table accesslog_%Y_%m_${subdomain}
- ...
- </match>
- ```
+ Instead, use strftime formatting.

- For example value of `subdomain` attribute is `"bq.fluent"`, table id will be like "accesslog_2016_03_bqfluent".
-
- - any type of attribute is allowed because stringified value will be used as replacement.
- - acceptable characters are alphabets, digits and `_`. All other characters will be removed.
+ The strftime formatting in the current version is based on the chunk key.
+ That is the same as the previous time_slice_key formatting.

  ### Date partitioned table support
  this plugin can insert (load) into date partitioned table.

- Use `%{time_slice}`.
+ Use a placeholder.

  ```apache
  <match dummy>
  @type bigquery

  ...
- time_slice_format %Y%m%d
- table accesslog$%{time_slice}
+ table accesslog$%Y%m%d
+
+ <buffer time>
+ timekey 1d
+ </buffer>
  ...
  </match>
  ```
@@ -452,9 +457,6 @@ you can also specify nested fields by prefixing their belonging record fields.

  ...

- time_format %s
- time_field time
-
  schema [
  {"name": "time", "type": "INTEGER"},
  {"name": "status", "type": "INTEGER"},
@@ -505,10 +507,7 @@ The second method is to specify a path to a BigQuery schema file instead of list
  @type bigquery

  ...
-
- time_format %s
- time_field time
-
+
  schema_path /path/to/httpd.schema
  </match>
  ```
@@ -521,10 +520,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema using
  @type bigquery

  ...
-
- time_format %s
- time_field time
-
+
  fetch_schema true
  # fetch_schema_table other_table # if you want to fetch schema from other table
  </match>
@@ -27,8 +27,5 @@ Gem::Specification.new do |spec|
  spec.add_runtime_dependency "googleauth", ">= 0.5.0"
  spec.add_runtime_dependency "multi_json"
  spec.add_runtime_dependency "activesupport", ">= 3.2", "< 6"
- spec.add_runtime_dependency "fluentd", "~> 0.12.0"
- spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
- spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
- spec.add_runtime_dependency "fluent-plugin-buffer-lightening", ">= 0.0.2"
+ spec.add_runtime_dependency "fluentd", "~> 0.14.0"
  end
@@ -209,7 +209,7 @@ module Fluent
  }
  end

- def load_schema(schema, allow_overwrite=true)
+ def load_schema(schema)
  schema.each do |field|
  raise ConfigError, 'field must have type' unless field.key?('type')

@@ -220,13 +220,11 @@ module Fluent
  field_schema_class = FIELD_TYPES[type]
  raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class

- next if @fields.key?(name) and !allow_overwrite
-
  field_schema = field_schema_class.new(name, mode)
  @fields[name] = field_schema
  if type == :record
  raise ConfigError, "record field must have fields" unless field.key?('fields')
- field_schema.load_schema(field['fields'], allow_overwrite)
+ field_schema.load_schema(field['fields'])
  end
  end
  end
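
To illustrate the effect of this change, here is a standalone Ruby sketch (simplified, with hypothetical class names, not the plugin's actual code): without the `allow_overwrite` guard, `load_schema` now registers every field it is given, overwriting any previously loaded field of the same name, and recurses into RECORD fields the same way.

```ruby
require 'json'

# Minimal stand-in for the plugin's RecordSchema class, for illustration only.
class SketchRecordSchema
  FIELD_TYPES = %w[string integer float boolean timestamp record].freeze

  def initialize(name = 'root')
    @name = name
    @fields = {}
  end

  def load_schema(schema)
    schema.each do |field|
      raise ArgumentError, 'field must have type' unless field.key?('type')

      name = field['name']
      type = field['type'].downcase
      raise ArgumentError, "Invalid field type: #{field['type']}" unless FIELD_TYPES.include?(type)

      # No allow_overwrite check any more: a later definition simply wins.
      if type == 'record'
        raise ArgumentError, 'record field must have fields' unless field.key?('fields')
        nested = SketchRecordSchema.new(name)
        nested.load_schema(field['fields'])   # recurse into nested RECORD fields
        @fields[name] = nested
      else
        @fields[name] = type
      end
    end
  end

  def to_h
    @fields.transform_values { |v| v.is_a?(SketchRecordSchema) ? v.to_h : v }
  end
end

schema = JSON.parse('[{"name":"time","type":"INTEGER"},' \
                    '{"name":"request","type":"RECORD","fields":[{"name":"path","type":"STRING"}]}]')
root = SketchRecordSchema.new
root.load_schema(schema)
p root.to_h  # => {"time"=>"integer", "request"=>{"path"=>"string"}}
```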
@@ -1,5 +1,5 @@
  module Fluent
  module BigQueryPlugin
- VERSION = "0.4.4".freeze
+ VERSION = "0.5.0.beta1".freeze
  end
  end