fluent-plugin-bigquery 0.4.4 → 0.5.0.beta1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fbc5b978030f084d076e9123adcbf28fda708023
4
- data.tar.gz: 7c99e2fa8af4ba7f211f5890ab9247732217b421
3
+ metadata.gz: 440ecd282c8e8e724c5a1e0baa43aa88c0051f26
4
+ data.tar.gz: 4e9767f4cfb54a6091dc25553dbb0190387b0d35
5
5
  SHA512:
6
- metadata.gz: d6e90f8a800bf58514a6d919188cc900a2b1cfa1a6f6138fc1087280e9fd899a4766f6971cdbd53bb00064ff2371a62ea444af4ed833906771109a1003be9a9d
7
- data.tar.gz: 34ccca10960087ab60d0221e84223b8c725c1a7ba7b216a4e1b830db640e9bf821534384d02c8fbdaac4b766d36096f3a52eabcdc744f4c2b08f3a9fb33a04eb
6
+ metadata.gz: f57040773228cd0c73610251a6a67497750f5df9d232c9d61a1177056d709671b3e1e0970f6e6557422bc6885d16c10024bd3b79954c57a3e75846d419974d6a
7
+ data.tar.gz: 1507120a4806737440d8666b57ea220df61076415adea45516d5feab651d3207bdbbb236c86ef579555226cbcb3a480c63260b361732ecb31a7cb6320aff9e7a
@@ -1,7 +1,6 @@
1
1
  language: ruby
2
2
 
3
3
  rvm:
4
- - 2.0
5
4
  - 2.1
6
5
  - 2.2
7
6
  - 2.3.3
data/README.md CHANGED
@@ -28,59 +28,87 @@ Because embedded gem dependency sometimes restricts ruby environment.
28
28
 
29
29
  ### Options
30
30
 
31
- | name | type | required? | default | description |
32
- | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
33
- | method | string | no | insert | `insert` (Streaming Insert) or `load` (load job) |
34
- | buffer_type | string | no | lightening (insert) or file (load) | |
35
- | buffer_chunk_limit | integer | no | 1MB (insert) or 1GB (load) | |
36
- | buffer_queue_limit | integer | no | 1024 (insert) or 32 (load) | |
37
- | buffer_chunk_records_limit | integer | no | 500 | |
38
- | flush_interval | float | no | 0.25 (*insert) or default of time sliced output (load) | |
39
- | try_flush_interval | float | no | 0.05 (*insert) or default of time sliced output (load) | |
40
- | auth_method | enum | yes | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
41
- | email | string | yes (private_key) | nil | GCP Service Account Email |
42
- | private_key_path | string | yes (private_key) | nil | GCP Private Key file path |
43
- | private_key_passphrase | string | yes (private_key) | nil | GCP Private Key Passphrase |
44
- | json_key | string | yes (json_key) | nil | GCP JSON Key file path or JSON Key string |
45
- | project | string | yes | nil | |
46
- | table | string | yes (either `tables`) | nil | |
47
- | tables | string | yes (either `table`) | nil | can set multi table names splitted by `,` |
48
- | template_suffix | string | no | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
49
- | auto_create_table | bool | no | false | If true, creates table automatically |
50
- | skip_invalid_rows | bool | no | false | Only `insert` method. |
51
- | max_bad_records | integer | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
52
- | ignore_unknown_values | bool | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
53
- | schema | array | yes (either `fetch_schema` or `schema_path`) | nil | Schema Definition. It is formatted by JSON. |
54
- | schema_path | string | yes (either `fetch_schema`) | nil | Schema Definition file path. It is formatted by JSON. |
55
- | fetch_schema | bool | yes (either `schema_path`) | false | If true, fetch table schema definition from Bigquery table automatically. |
56
- | fetch_schema_table | string | no | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored |
57
- | schema_cache_expire | integer | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
58
- | field_string (deprecated) | string | no | nil | see examples. |
59
- | field_integer (deprecated) | string | no | nil | see examples. |
60
- | field_float (deprecated) | string | no | nil | see examples. |
61
- | field_boolean (deprecated) | string | no | nil | see examples. |
62
- | field_timestamp (deprecated) | string | no | nil | see examples. |
63
- | time_field | string | no | nil | If this param is set, plugin set formatted time string to this field. |
64
- | time_format | string | no | nil | ex. `%s`, `%Y/%m%d %H:%M:%S` |
65
- | replace_record_key | bool | no | false | see examples. |
66
- | replace_record_key_regexp{1-10} | string | no | nil | see examples. |
67
- | convert_hash_to_json (deprecated) | bool | no | false | If true, converts Hash value of record to JSON String. |
68
- | insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
69
- | add_insert_timestamp | string | no | no | nil | Adds a timestamp column just before sending the rows to BigQuery, so that buffering time is not taken into account. Gives a field in BigQuery which represents the insert time of the row. |
70
- | allow_retry_insert_errors | bool | no | false | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
71
- | request_timeout_sec | integer | no | nil | Bigquery API response timeout |
72
- | request_open_timeout_sec | integer | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
73
- | time_partitioning_type | enum | no (either day) | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
74
- | time_partitioning_expiration | time | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
75
-
76
- ### Standard Options
31
+ | name | type | required? | placeholder? | default | description |
32
+ | :------------------------------------- | :------------ | :----------- | :---------- | :------------------------- | :----------------------- |
33
+ | method | string | no | no | insert | `insert` (Streaming Insert) or `load` (load job) |
34
+ | auth_method | enum | yes | no | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
35
+ | email | string | yes (private_key) | no | nil | GCP Service Account Email |
36
+ | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
37
+ | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
38
+ | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
39
+ | project | string | yes | yes | nil | |
40
+ | dataset | string | yes | yes | nil | |
41
+ | table | string | yes (either `tables`) | yes | nil | |
42
+ | tables                                  | array(string) | yes (either `table`)    | yes         | nil                        | can set multiple table names split by `,` |
43
+ | template_suffix | string | no | yes | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
44
+ | auto_create_table | bool | no | no | false | If true, creates table automatically |
45
+ | skip_invalid_rows | bool | no | no | false | Only `insert` method. |
46
+ | max_bad_records | integer | no | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
47
+ | ignore_unknown_values | bool | no | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
48
+ | schema | array | yes (either `fetch_schema` or `schema_path`) | no | nil | Schema Definition. It is formatted by JSON. |
49
+ | schema_path | string | yes (either `fetch_schema`) | no | nil | Schema Definition file path. It is formatted by JSON. |
50
+ | fetch_schema | bool | yes (either `schema_path`) | no | false | If true, fetch table schema definition from Bigquery table automatically. |
51
+ | fetch_schema_table                      | string        | no           | yes         | nil                        | If set, fetch table schema definition from this table. If `fetch_schema` is false, this param is ignored. |
52
+ | schema_cache_expire                     | integer       | no           | no          | 600                        | Value is in seconds. If current time is after expiration interval, re-fetch table schema definition. |
53
+ | field_string | string | no | no | nil | see examples. |
54
+ | field_integer | string | no | no | nil | see examples. |
55
+ | field_float | string | no | no | nil | see examples. |
56
+ | field_boolean | string | no | no | nil | see examples. |
57
+ | field_timestamp | string | no | no | nil | see examples. |
58
+ | replace_record_key | bool | no | no | false | see examples. |
59
+ | replace_record_key_regexp{1-10} | string | no | no | nil | see examples. |
60
+ | convert_hash_to_json | bool | no | no | false | If true, converts Hash value of record to JSON String. |
61
+ | insert_id_field | string | no | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
62
+ | allow_retry_insert_errors               | bool          | no           | no          | false                      | Retry to insert rows when an insertErrors occurs. There is a possibility that rows are inserted in duplicate. |
63
+ | request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
64
+ | request_open_timeout_sec                | integer       | no           | no          | 60                         | Bigquery API connection and request timeout. If you send big data to Bigquery, set a larger value. |
65
+ | time_partitioning_type                  | enum          | no (either day) | no       | nil                        | Type of bigquery time partitioning feature (experimental feature on BigQuery). |
66
+ | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
67
+
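For illustration, here is a minimal sketch of how the placeholder-capable options above (`project`, `dataset`, `table`, `tables`, `template_suffix`, `fetch_schema_table`) can embed fluentd-0.14 placeholders. The project/dataset names, key file path and schema path are assumptions for the example:

```apache
<match dummy>
  @type bigquery

  method insert
  auth_method json_key
  json_key /path/to/keyfile.json     # illustrative path

  project yourproject_id
  dataset yourdataset_id
  table   access_${tag}_%Y%m%d       # ${tag} and %Y%m%d are fluentd-0.14 placeholders

  schema_path /path/to/access.schema # illustrative path

  <buffer tag, time>                 # chunk keys must cover the placeholders used above
    timekey 1d
  </buffer>
</match>
```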
68
+ ### Buffer section
69
+
70
+ | name | type | required? | default | description |
71
+ | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
72
+ | @type | string | no | memory (insert) or file (load) | |
73
+ | chunk_limit_size | integer | no | 1MB (insert) or 1GB (load) | |
74
+ | total_limit_size | integer | no | 1GB (insert) or 32GB (load) | |
75
+ | chunk_records_limit | integer | no | 500 (insert) or nil (load) | |
76
+ | flush_mode | enum | no | interval | default, lazy, interval, immediate |
77
+ | flush_interval | float | no | 0.25 (insert) or nil (load) | |
78
+ | flush_thread_interval | float | no | 0.05 (insert) or nil (load) | |
79
+ | flush_thread_burst_interval | float | no | 0.05 (insert) or nil (load) | |
80
+
81
+ Other parameters (defined by the base class) are also available.
82
+
83
+ See https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin/output.rb
84
+
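Written out, a buffer section for the `insert` method that simply restates the defaults from the table above looks like the following sketch (these are the documented defaults, not tuning advice):

```apache
<buffer>
  @type memory
  chunk_limit_size 1m
  total_limit_size 1g
  chunk_records_limit 500
  flush_mode interval
  flush_interval 0.25
  flush_thread_interval 0.05
  flush_thread_burst_interval 0.05
</buffer>
```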
85
+ ### Inject section
86
+
87
+ It replaces the `time_field` and `time_format` options of previous versions.
88
+
89
+ For example:
90
+
91
+ ```
92
+ <inject>
93
+ time_key time_field_name
94
+ time_type string
95
+ time_format %Y-%m-%d %H:%M:%S
96
+ </inject>
97
+ ```
77
98
 
78
99
  | name | type | required? | default | description |
79
100
  | :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
80
- | localtime | bool | no | nil | Use localtime |
81
- | utc | bool | no | nil | Use utc |
82
-
83
- And see http://docs.fluentd.org/articles/output-plugin-overview#time-sliced-output-parameters
101
+ | hostname_key | string | no | nil | |
102
+ | hostname | string | no | nil | |
103
+ | tag_key | string | no | nil | |
104
+ | time_key | string | no | nil | |
105
+ | time_type | string | no | nil | |
106
+ | time_format | string | no | nil | |
107
+ | localtime | bool | no | true | |
108
+ | utc | bool | no | false | |
109
+ | timezone | string | no | nil | |
110
+
111
+ See https://github.com/fluent/fluentd/blob/master/lib/fluent/plugin_helper/inject.rb
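As a migration sketch, a v0.12 configuration that used `time_field time` with `time_format %s` can be expressed with the inject helper roughly as follows; the `hostname_key` and `tag_key` lines just demonstrate the other options from the table, and all field names are assumptions:

```apache
<inject>
  hostname_key host
  tag_key tag
  time_key time
  time_type unixtime  # integer epoch seconds, roughly the old `time_format %s` behaviour
</inject>
```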
84
112
 
85
113
  ## Examples
86
114
 
@@ -103,9 +131,6 @@ Configure insert specifications with target table schema, with your credentials.
103
131
  dataset yourdataset_id
104
132
  table tablename
105
133
 
106
- time_format %s
107
- time_field time
108
-
109
134
  schema [
110
135
  {"name": "time", "type": "INTEGER"},
111
136
  {"name": "status", "type": "INTEGER"},
@@ -135,14 +160,15 @@ For high rate inserts over streaming inserts, you should specify flush intervals
135
160
  @type bigquery
136
161
 
137
162
  method insert # default
138
-
139
- flush_interval 1 # flush as frequent as possible
140
-
141
- buffer_chunk_records_limit 300 # default rate limit for users is 100
142
- buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
143
-
144
- num_threads 16
145
-
163
+
164
+ <buffer>
165
+ flush_interval 0.1 # flush as frequent as possible
166
+
167
+ total_limit_size 10g # 1MB * 10240 -> 10GB!
168
+
169
+ flush_thread_count 16
170
+ </buffer>
171
+
146
172
  auth_method private_key # default
147
173
  email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
148
174
  private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
@@ -152,9 +178,6 @@ For high rate inserts over streaming inserts, you should specify flush intervals
152
178
  dataset yourdataset_id
153
179
  tables accesslog1,accesslog2,accesslog3
154
180
 
155
- time_format %s
156
- time_field time
157
-
158
181
  schema [
159
182
  {"name": "time", "type": "INTEGER"},
160
183
  {"name": "status", "type": "INTEGER"},
@@ -183,23 +206,23 @@ Important options for high rate events are:
183
206
  * 2 or more tables are available with ',' separator
184
207
  * `out_bigquery` uses these tables for Table Sharding inserts
185
208
  * these must have same schema
186
- * `buffer_chunk_limit`
209
+ * `buffer/chunk_limit_size`
187
210
  * max size of an insert or chunk (default 1000000 or 1MB)
188
211
  * the max size is limited to 1MB on BigQuery
189
- * `buffer_chunk_records_limit`
212
+ * `buffer/chunk_records_limit`
190
213
  * number of records over streaming inserts API call is limited as 500, per insert or chunk
191
214
  * `out_bigquery` flushes buffer with 500 records for 1 inserts API call
192
- * `buffer_queue_limit`
215
+ * `buffer/queue_length_limit`
193
216
  * BigQuery streaming inserts needs very small buffer chunks
194
217
  * for high-rate events, `buffer_queue_limit` should be configured with big number
195
218
  * Max 1GB memory may be used under network problem in default configuration
196
- * `buffer_chunk_limit (default 1MB)` x `buffer_queue_limit (default 1024)`
197
- * `num_threads`
219
+ * `chunk_limit_size (default 1MB)` x `queue_length_limit (default 1024)`
220
+ * `buffer/flush_thread_count`
198
221
  * threads for insert api calls in parallel
199
222
  * specify this option for 100 or more records per seconds
200
223
  * 10 or more threads seems good for inserts over internet
201
224
  * less threads may be good for Google Compute Engine instances (with low latency for BigQuery)
202
- * `flush_interval`
225
+ * `buffer/flush_interval`
203
226
  * interval between data flushes (default 0.25)
204
227
  * you can set subsecond values such as `0.15` on Fluentd v0.10.42 or later
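Putting those options together, a high-rate streaming-insert buffer could be sketched like this (the numbers are illustrative, not tuned recommendations):

```apache
<buffer>
  chunk_limit_size 1m       # stay under the 1MB per-request limit on BigQuery
  chunk_records_limit 500   # streaming insert row limit per request
  total_limit_size 10g      # upper bound for data buffered on this node
  flush_interval 0.25
  flush_thread_count 16     # parallel insert API calls
</buffer>
```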
205
228
 
@@ -212,19 +235,18 @@ section in the Google BigQuery document.
212
235
  @type bigquery
213
236
 
214
237
  method load
215
- buffer_type file
216
- buffer_path bigquery.*.buffer
238
+
239
+ <buffer>
240
+ @type file
241
+ path bigquery.*.buffer
217
242
  flush_interval 1800
218
243
  flush_at_shutdown true
219
- try_flush_interval 1
220
- utc
244
+ timekey_use_utc
245
+ </buffer>
221
246
 
222
247
  auth_method json_key
223
248
  json_key json_key_path.json
224
249
 
225
- time_format %s
226
- time_field time
227
-
228
250
  project yourproject_id
229
251
  dataset yourdataset_id
230
252
  auto_create_table true
@@ -235,8 +257,6 @@ section in the Google BigQuery document.
235
257
 
236
258
  I recommend to use file buffer and long flush interval.
237
259
 
238
- __CAUTION: `flush_interval` default is still `0.25` even if `method` is `load` on current version.__
239
-
240
260
  ### Authentication
241
261
 
242
262
  There are four methods supported to fetch access token for the service account.
@@ -304,8 +324,6 @@ Compute Engine instance, then you can configure fluentd like this.
304
324
  dataset yourdataset_id
305
325
  table tablename
306
326
 
307
- time_format %s
308
- time_field time
309
327
  ...
310
328
  </match>
311
329
  ```
@@ -325,11 +343,15 @@ In this authentication method, the credentials returned are determined by the en
325
343
 
326
344
  ### Table id formatting
327
345
 
346
+ This plugin supports fluentd-0.14 style placeholders.
347
+
328
348
  #### strftime formatting
329
349
  `table` and `tables` options accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
330
350
  format to construct table ids.
331
351
  Table ids are formatted at runtime
332
- using the local time of the fluentd server.
352
+ using the chunk key time.
353
+
354
+ See http://docs.fluentd.org/v0.14/articles/output-plugin-overview
333
355
 
334
356
  For example, with the configuration below,
335
357
  data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
@@ -344,6 +366,9 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
344
366
  dataset yourdataset_id
345
367
  table accesslog_%Y_%m
346
368
 
369
+ <buffer time>
370
+ timekey 1d
371
+ </buffer>
347
372
  ...
348
373
  </match>
349
374
  ```
@@ -351,12 +376,15 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
351
376
  #### record attribute formatting
352
377
  The format can be suffixed with attribute name.
353
378
 
354
- __NOTE: This feature is available only if `method` is `insert`. Because it makes performance impact. Use `%{time_slice}` instead of it.__
379
+ __CAUTION: the format is different from previous versions__
355
380
 
356
381
  ```apache
357
382
  <match dummy>
358
383
  ...
359
- table accesslog_%Y_%m@timestamp
384
+ table accesslog_${status_code}
385
+
386
+ <buffer status_code>
387
+ </buffer>
360
388
  ...
361
389
  </match>
362
390
  ```
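Record attribute placeholders can also be combined with strftime placeholders by listing both chunk keys in the buffer section, as in this sketch (the `status_code` field is carried over from the example above):

```apache
<match dummy>
  ...
  table accesslog_${status_code}_%Y%m

  <buffer status_code, time>
    timekey 1d
  </buffer>
  ...
</match>
```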
@@ -365,50 +393,27 @@ If attribute name is given, the time to be used for formatting is value of each
365
393
  The value for the time should be a UNIX time.
366
394
 
367
395
  #### time_slice_key formatting
368
- Or, the options can use `%{time_slice}` placeholder.
369
- `%{time_slice}` is replaced by formatted time slice key at runtime.
370
-
371
- ```apache
372
- <match dummy>
373
- @type bigquery
374
396
 
375
- ...
376
- table accesslog%{time_slice}
377
- ...
378
- </match>
379
- ```
380
-
381
- #### record attribute value formatting
382
- Or, `${attr_name}` placeholder is available to use value of attribute as part of table id.
383
- `${attr_name}` is replaced by string value of the attribute specified by `attr_name`.
384
-
385
- __NOTE: This feature is available only if `method` is `insert`.__
386
-
387
- ```apache
388
- <match dummy>
389
- ...
390
- table accesslog_%Y_%m_${subdomain}
391
- ...
392
- </match>
393
- ```
397
+ Use strftime formatting instead.
394
398
 
395
- For example value of `subdomain` attribute is `"bq.fluent"`, table id will be like "accesslog_2016_03_bqfluent".
396
-
397
- - any type of attribute is allowed because stringified value will be used as replacement.
398
- - acceptable characters are alphabets, digits and `_`. All other characters will be removed.
399
+ strftime formatting in the current version is based on the chunk key.
400
+ That is the same as the previous time_slice_key formatting.
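For instance, an old `time_slice_format %Y%m%d` plus `table accesslog%{time_slice}` pair can be migrated to the chunk-key based form sketched below:

```apache
<match dummy>
  @type bigquery
  ...
  table accesslog%Y%m%d

  <buffer time>
    timekey 1d
  </buffer>
  ...
</match>
```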
399
401
 
400
402
  ### Date partitioned table support
401
403
  this plugin can insert (load) into date partitioned table.
402
404
 
403
- Use `%{time_slice}`.
405
+ Use placeholders.
404
406
 
405
407
  ```apache
406
408
  <match dummy>
407
409
  @type bigquery
408
410
 
409
411
  ...
410
- time_slice_format %Y%m%d
411
- table accesslog$%{time_slice}
412
+ table accesslog$%Y%m%d
413
+
414
+ <buffer time>
415
+ timekey 1d
416
+ </buffer>
412
417
  ...
413
418
  </match>
414
419
  ```
@@ -452,9 +457,6 @@ you can also specify nested fields by prefixing their belonging record fields.
452
457
 
453
458
  ...
454
459
 
455
- time_format %s
456
- time_field time
457
-
458
460
  schema [
459
461
  {"name": "time", "type": "INTEGER"},
460
462
  {"name": "status", "type": "INTEGER"},
@@ -505,10 +507,7 @@ The second method is to specify a path to a BigQuery schema file instead of list
505
507
  @type bigquery
506
508
 
507
509
  ...
508
-
509
- time_format %s
510
- time_field time
511
-
510
+
512
511
  schema_path /path/to/httpd.schema
513
512
  </match>
514
513
  ```
@@ -521,10 +520,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
521
520
  @type bigquery
522
521
 
523
522
  ...
524
-
525
- time_format %s
526
- time_field time
527
-
523
+
528
524
  fetch_schema true
529
525
  # fetch_schema_table other_table # if you want to fetch schema from other table
530
526
  </match>
@@ -27,8 +27,5 @@ Gem::Specification.new do |spec|
27
27
  spec.add_runtime_dependency "googleauth", ">= 0.5.0"
28
28
  spec.add_runtime_dependency "multi_json"
29
29
  spec.add_runtime_dependency "activesupport", ">= 3.2", "< 6"
30
- spec.add_runtime_dependency "fluentd", "~> 0.12.0"
31
- spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
32
- spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
33
- spec.add_runtime_dependency "fluent-plugin-buffer-lightening", ">= 0.0.2"
30
+ spec.add_runtime_dependency "fluentd", "~> 0.14.0"
34
31
  end
@@ -209,7 +209,7 @@ module Fluent
209
209
  }
210
210
  end
211
211
 
212
- def load_schema(schema, allow_overwrite=true)
212
+ def load_schema(schema)
213
213
  schema.each do |field|
214
214
  raise ConfigError, 'field must have type' unless field.key?('type')
215
215
 
@@ -220,13 +220,11 @@ module Fluent
220
220
  field_schema_class = FIELD_TYPES[type]
221
221
  raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
222
222
 
223
- next if @fields.key?(name) and !allow_overwrite
224
-
225
223
  field_schema = field_schema_class.new(name, mode)
226
224
  @fields[name] = field_schema
227
225
  if type == :record
228
226
  raise ConfigError, "record field must have fields" unless field.key?('fields')
229
- field_schema.load_schema(field['fields'], allow_overwrite)
227
+ field_schema.load_schema(field['fields'])
230
228
  end
231
229
  end
232
230
  end
@@ -1,5 +1,5 @@
1
1
  module Fluent
2
2
  module BigQueryPlugin
3
- VERSION = "0.4.4".freeze
3
+ VERSION = "0.5.0.beta1".freeze
4
4
  end
5
5
  end