fluent-plugin-bigquery 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +41 -39
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +21 -6
- data/lib/fluent/plugin/out_bigquery.rb +7 -2
- data/test/plugin/test_out_bigquery.rb +63 -1
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0f23c20f0c7feb488dc105d5b4ada3dcc27a4a3
+  data.tar.gz: 41497e9b9fa0e5b518f63dc4c664ae67b17f763f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a62ad2c42da6fabfc422457e06a40095957abc1b740cf3fa99337d2f03802b92dd79e314f60ebef1911ed4ba09ff972bf5adceccafc701cbd4d0d7e9411387e3
+  data.tar.gz: 6d399567ec70f674f9ce8725d16561f5362f38c64a5267ec4e519e72f40431b65852dfb03a2486a52bf1594b97615b94063e0cf101177be1da37544be823edf2
data/README.md CHANGED

@@ -10,7 +10,7 @@
 * load data
   * for data loading as batch jobs, for big amount of data
   * https://developers.google.com/bigquery/loading-data-into-bigquery
-
+
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 

@@ -57,6 +57,8 @@ OAuth flow for installed applications.
 | insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
 | request_timeout_sec | integer | no | nil | Bigquery API response timeout |
 | request_open_timeout_sec | integer | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
+| time_partitioning_type | enum | no (either day) | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
+| time_partitioning_expiration | time | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
 
 ### Standard Options
 
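A note on the two new rows above: they are the only new user-facing options in this release, and the new test case later in this diff exercises them with the values below. They are shown here as a Ruby heredoc, the way the gem's test suite embeds Fluentd config; `partitioning_conf` is just a local name for this sketch.

```ruby
# Illustrative values only; the option names come from the table above, the values
# from the new test case added in this release.
partitioning_conf = <<-CONFIG
  auto_create_table true
  time_partitioning_type day          # enum: "day" is the only accepted value
  time_partitioning_expiration 1h     # Fluentd time value (1h = 3600 s); sent to BigQuery as milliseconds
CONFIG
puts partitioning_conf
```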
@@ -76,21 +78,21 @@ Configure insert specifications with target table schema, with your credentials.
 ```apache
 <match dummy>
   @type bigquery
-
+
   method insert # default
-
+
   auth_method private_key # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
   # private_key_passphrase notasecret # default
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime

@@ -103,28 +105,28 @@ For high rate inserts over streaming inserts, you should specify flush intervals
 ```apache
 <match dummy>
   @type bigquery
-
+
   method insert # default
-
+
   flush_interval 1 # flush as frequent as possible
-
+
   buffer_chunk_records_limit 300 # default rate limit for users is 100
   buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
-
+
   num_threads 16
-
+
   auth_method private_key # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
   # private_key_passphrase notasecret # default
-
+
   project yourproject_id
   dataset yourdataset_id
   tables accesslog1,accesslog2,accesslog3
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -214,10 +216,10 @@ download its JSON key and deploy the key with fluentd.
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method json_key
   json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename

@@ -231,10 +233,10 @@ You need to only include `private_key` and `client_email` key from JSON key file
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method json_key
   json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
@@ -252,16 +254,16 @@ Compute Engine instance, then you can configure fluentd like this.
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method compute_engine
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -296,13 +298,13 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   project yourproject_id
   dataset yourdataset_id
   table accesslog_%Y_%m
-
+
   ...
 </match>
 ```
@@ -384,12 +386,12 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   auto_create_table true
   table accesslog_%Y_%m
-
+
   ...
 </match>
 ```
@@ -408,12 +410,12 @@ you can also specify nested fields by prefixing their belonging record fields.
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   field_integer time,response.status,response.bytes
   field_string request.vhost,request.path,request.method,request.protocol,request.agent,request.referer,remote.host,remote.ip,remote.user
   field_float request.time
@@ -447,12 +449,12 @@ The second method is to specify a path to a BigQuery schema file instead of list
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   schema_path /path/to/httpd.schema
   field_integer time
 </match>
@@ -464,12 +466,12 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   fetch_schema true
   # fetch_schema_table other_table # if you want to fetch schema from other table
   field_integer time
@@ -489,9 +491,9 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   insert_id_field uuid
   field_string uuid
 </match>
data/lib/fluent/plugin/bigquery/writer.rb CHANGED

@@ -60,20 +60,28 @@ module Fluent
         @client = client
       end
 
-      def create_table(project, dataset, table_id, record_schema)
+      def create_table(project, dataset, table_id, record_schema, time_partitioning_type: nil, time_partitioning_expiration: nil)
         create_table_retry_limit = 3
         create_table_retry_wait = 1
         create_table_retry_count = 0
 
         begin
-
+          definition = {
             table_reference: {
               table_id: table_id,
             },
             schema: {
               fields: record_schema.to_a,
             }
-          }
+          }
+
+          if time_partitioning_type
+            definition[:time_partitioning] = {
+              type: time_partitioning_type.to_s.upcase,
+              expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
+            }.compact
+          end
+          client.insert_table(project, dataset, definition, {})
           log.debug "create table", project_id: project, dataset: dataset, table: table_id
           @client = nil
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
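To make the new branch above easier to read: the partitioning keywords become a `time_partitioning` entry in the table definition, with the type upcased for the API, the expiration converted from seconds to milliseconds, and `compact` dropping the key when no expiration is configured. A minimal plain-Ruby sketch of just that mapping (not the gem's exact code):

```ruby
# Sketch of the mapping performed in create_table above (plain Ruby >= 2.4 for Hash#compact).
def time_partitioning_definition(time_partitioning_type, time_partitioning_expiration)
  return nil unless time_partitioning_type
  {
    type: time_partitioning_type.to_s.upcase,   # :day -> "DAY"
    expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
  }.compact                                     # omit expiration_ms entirely when unset
end

time_partitioning_definition(:day, 3600)  # => {:type=>"DAY", :expiration_ms=>3600000}
time_partitioning_definition(:day, nil)   # => {:type=>"DAY"}
```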
@@ -124,7 +132,7 @@ module Fluent
           options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
         })
         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
-        log.warn "insert errors", insert_errors: res.insert_errors if res.insert_errors && !res.insert_errors.empty?
+        log.warn "insert errors", insert_errors: res.insert_errors.to_s if res.insert_errors && !res.insert_errors.empty?
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         @client = nil
 
@@ -138,7 +146,7 @@ module Fluent
         end
       end
 
-      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60)
+      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60, auto_create_table: nil, time_partitioning_type: nil, time_partitioning_expiration: nil)
         configuration = {
           configuration: {
             load: {
@@ -157,6 +165,7 @@ module Fluent
             }
           }
         }
+        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if time_partitioning_type
         configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
         # If target table is already exist, omit schema configuration.
@@ -188,7 +197,13 @@ module Fluent
         reason = e.respond_to?(:reason) ? e.reason : nil
         log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
 
-
+        if auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+          # Table Not Found: Auto Create Table
+          create_table(project, dataset, table_id, fields, time_partitioning_type: time_partitioning_type, time_partitioning_expiration: time_partitioning_expiration)
+          raise "table created. send rows next time."
+        end
+
+        return wait_load_job(project, dataset, job_id, table_id) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
 
         if RETRYABLE_ERROR_REASON.include?(reason) || e.is_a?(Google::Apis::ServerError)
           raise RetryableError.new(nil, e)
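Taken together with the `create_disposition: "CREATE_NEVER"` line added earlier, the branch above gives load jobs the same auto-create behavior the insert path already had: the load job is not allowed to create a partitioned table itself, so a missing destination comes back as a 404, the table is created with the configured partitioning, and the method raises so Fluentd retries the chunk once the table exists. A self-contained sketch of that create-then-retry pattern (names like `load_rows` and `TableNotFound` are hypothetical, not the gem's API):

```ruby
# Hypothetical, self-contained illustration of the rescue flow above.
TableNotFound = Class.new(StandardError)  # stands in for the 404 returned by the API client

def load_rows(rows, table_exists:, auto_create_table:)
  raise TableNotFound, "Not Found: Table" unless table_exists
  "loaded #{rows.size} rows"
rescue TableNotFound
  raise unless auto_create_table
  # in the plugin, create_table(..., time_partitioning_type:, time_partitioning_expiration:) runs here
  raise "table created. send rows next time."   # Fluentd keeps the chunk and retries later
end

load_rows([{ n: 1 }], table_exists: true, auto_create_table: true)   # => "loaded 1 rows"

begin
  load_rows([{ n: 1 }], table_exists: false, auto_create_table: true)
rescue RuntimeError => e
  e.message   # => "table created. send rows next time."
end
```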
data/lib/fluent/plugin/out_bigquery.rb CHANGED

@@ -143,6 +143,10 @@ module Fluent
       config_param :request_timeout_sec, :time, default: nil
       config_param :request_open_timeout_sec, :time, default: 60
 
+      ## Partitioning
+      config_param :time_partitioning_type, :enum, list: [:day], default: nil
+      config_param :time_partitioning_expiration, :time, default: nil
+
       ### Table types
       # https://developers.google.com/bigquery/docs/tables
       #
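These two `config_param` declarations back the README rows shown earlier: `:enum` with `list: [:day]` accepts only `day`, and the `:time` type hands the writer a number of seconds. A plain-Ruby restatement of the values the new test case (further down) ends up asserting; the actual parsing is done by Fluentd's config types, not by this sketch:

```ruby
# Illustrative restatement of "time_partitioning_type day" / "time_partitioning_expiration 1h".
time_partitioning_type       = :day   # the only value permitted by list: [:day]
time_partitioning_expiration = 3600   # "1h" parsed into seconds by the :time type

# writer.rb (above) converts seconds into the milliseconds BigQuery expects:
time_partitioning_expiration * 1000   # => 3600000
```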
@@ -413,7 +417,7 @@ module Fluent
       rescue Fluent::BigQuery::Writer::Error => e
         if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
           # Table Not Found: Auto Create Table
-          writer.create_table(@project, @dataset, table_id, @fields)
+          writer.create_table(@project, @dataset, table_id, @fields, time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration)
           raise "table created. send rows next time."
         end
 
@@ -459,7 +463,8 @@ module Fluent
         create_upload_source(chunk) do |upload_source|
           res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
             ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
-            timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+            timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec, auto_create_table: @auto_create_table,
+            time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration
           })
         end
       rescue Fluent::BigQuery::Writer::Error => e
data/test/plugin/test_out_bigquery.rb CHANGED

@@ -1428,7 +1428,69 @@ class BigQueryOutputTest < Test::Unit::TestCase
       skip_invalid_rows: false,
       ignore_unknown_values: false,
     )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
-    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: nil, time_partitioning_expiration: nil)
+
+    chunk = Fluent::MemoryBufferChunk.new("my.tag")
+    chunk << message.to_msgpack
+
+    driver.instance.start
+
+    assert_raise(RuntimeError) {
+      driver.instance.write(chunk)
+    }
+    driver.instance.shutdown
+  end
+
+  def test_auto_create_partitioned_table_by_bigquery_api
+    now = Time.now
+    message = {
+      "json" => {
+        "time" => now.to_i,
+        "request" => {
+          "vhost" => "bar",
+          "path" => "/path/to/baz",
+          "method" => "GET",
+          "protocol" => "HTTP/1.0",
+          "agent" => "libwww",
+          "referer" => "http://referer.example",
+          "time" => (now - 1).to_f,
+          "bot_access" => true,
+          "loginsession" => false,
+        },
+        "remote" => {
+          "host" => "remote.example",
+          "ip" => "192.168.1.1",
+          "user" => "nagachika",
+        },
+        "response" => {
+          "status" => 200,
+          "bytes" => 72,
+        },
+      }
+    }.deep_symbolize_keys
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      time_format %s
+      time_field time
+
+      auto_create_table true
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+
+      time_partitioning_type day
+      time_partitioning_expiration 1h
+    CONFIG
+    writer = stub_writer(driver)
+    mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], hash_including(
+      skip_invalid_rows: false,
+      ignore_unknown_values: false,
+    )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: :day, time_partitioning_expiration: 3600)
 
     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     chunk << message.to_msgpack
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Naoya Ito
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-
+date: 2016-10-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake