fluent-plugin-bigquery 0.3.0 → 0.3.1
- checksums.yaml +4 -4
- data/README.md +41 -39
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +21 -6
- data/lib/fluent/plugin/out_bigquery.rb +7 -2
- data/test/plugin/test_out_bigquery.rb +63 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0f23c20f0c7feb488dc105d5b4ada3dcc27a4a3
+  data.tar.gz: 41497e9b9fa0e5b518f63dc4c664ae67b17f763f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a62ad2c42da6fabfc422457e06a40095957abc1b740cf3fa99337d2f03802b92dd79e314f60ebef1911ed4ba09ff972bf5adceccafc701cbd4d0d7e9411387e3
+  data.tar.gz: 6d399567ec70f674f9ce8725d16561f5362f38c64a5267ec4e519e72f40431b65852dfb03a2486a52bf1594b97615b94063e0cf101177be1da37544be823edf2
data/README.md
CHANGED
@@ -10,7 +10,7 @@
 * load data
   * for data loading as batch jobs, for big amount of data
   * https://developers.google.com/bigquery/loading-data-into-bigquery
-
+
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 
@@ -57,6 +57,8 @@ OAuth flow for installed applications.
 | insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
 | request_timeout_sec | integer | no | nil | Bigquery API response timeout |
 | request_open_timeout_sec | integer | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
+| time_partitioning_type | enum | no (either day) | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
+| time_partitioning_expiration | time | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
 
 ### Standard Options
 
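For reference, a small Ruby sketch of how these two new options end up in the table definition sent to BigQuery, mirroring the `create_table` change in `writer.rb` later in this diff. The table name and parsed values are illustrative only; `time_partitioning_expiration` reaches the writer in seconds (e.g. `1h` becomes `3600`):

```ruby
# Illustrative values: "time_partitioning_type day" / "time_partitioning_expiration 1h"
time_partitioning_type       = :day
time_partitioning_expiration = 3600 # seconds

definition = {
  table_reference: { table_id: "accesslog_2016_10" }, # hypothetical table name
  schema: { fields: [] },                             # record schema omitted here
}

# Mirrors the writer.rb hunk below: the type is upcased and the expiration is
# converted from seconds to milliseconds; a nil expiration is dropped by #compact.
if time_partitioning_type
  definition[:time_partitioning] = {
    type: time_partitioning_type.to_s.upcase,                                               # => "DAY"
    expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil # => 3_600_000
  }.compact
end
```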
@@ -76,21 +78,21 @@ Configure insert specifications with target table schema, with your credentials.
 ```apache
 <match dummy>
   @type bigquery
-
+
   method insert # default
-
+
   auth_method private_key # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
   # private_key_passphrase notasecret # default
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -103,28 +105,28 @@ For high rate inserts over streaming inserts, you should specify flush intervals
 ```apache
 <match dummy>
   @type bigquery
-
+
   method insert # default
-
+
   flush_interval 1 # flush as frequent as possible
-
+
   buffer_chunk_records_limit 300 # default rate limit for users is 100
   buffer_queue_limit 10240       # 1MB * 10240 -> 10GB!
-
+
   num_threads 16
-
+
   auth_method private_key # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
   # private_key_passphrase notasecret # default
-
+
   project yourproject_id
   dataset yourdataset_id
   tables accesslog1,accesslog2,accesslog3
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -214,10 +216,10 @@ download its JSON key and deploy the key with fluentd.
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method json_key
   json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
@@ -231,10 +233,10 @@ You need to only include `private_key` and `client_email` key from JSON key file
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method json_key
   json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
@@ -252,16 +254,16 @@ Compute Engine instance, then you can configure fluentd like this.
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method compute_engine
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -296,13 +298,13 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   project yourproject_id
   dataset yourdataset_id
   table accesslog_%Y_%m
-
+
   ...
 </match>
 ```
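The `%Y` and `%m` placeholders in the table name are plain strftime directives, so a record's timestamp decides which shard it lands in. A quick sketch (the plugin's own expansion code is not part of this diff):

```ruby
# A record timestamped 2014-08-15 targets the accesslog_2014_08 shard.
Time.utc(2014, 8, 15).strftime("accesslog_%Y_%m") # => "accesslog_2014_08"
```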
@@ -384,12 +386,12 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   auto_create_table true
   table accesslog_%Y_%m
-
+
   ...
 </match>
 ```
@@ -408,12 +410,12 @@ you can also specify nested fields by prefixing their belonging record fields.
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   field_integer time,response.status,response.bytes
   field_string request.vhost,request.path,request.method,request.protocol,request.agent,request.referer,remote.host,remote.ip,remote.user
   field_float request.time
@@ -447,12 +449,12 @@ The second method is to specify a path to a BigQuery schema file instead of list
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   schema_path /path/to/httpd.schema
   field_integer time
 </match>
@@ -464,12 +466,12 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   fetch_schema true
   # fetch_schema_table other_table # if you want to fetch schema from other table
   field_integer time
@@ -489,9 +491,9 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   insert_id_field uuid
   field_string uuid
 </match>
data/lib/fluent/plugin/bigquery/writer.rb
CHANGED
@@ -60,20 +60,28 @@ module Fluent
         @client = client
       end
 
-      def create_table(project, dataset, table_id, record_schema)
+      def create_table(project, dataset, table_id, record_schema, time_partitioning_type: nil, time_partitioning_expiration: nil)
         create_table_retry_limit = 3
         create_table_retry_wait = 1
         create_table_retry_count = 0
 
         begin
-
+          definition = {
            table_reference: {
              table_id: table_id,
            },
            schema: {
              fields: record_schema.to_a,
            }
-          }
+          }
+
+          if time_partitioning_type
+            definition[:time_partitioning] = {
+              type: time_partitioning_type.to_s.upcase,
+              expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
+            }.compact
+          end
+          client.insert_table(project, dataset, definition, {})
           log.debug "create table", project_id: project, dataset: dataset, table: table_id
           @client = nil
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
@@ -124,7 +132,7 @@ module Fluent
           options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
         })
         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
-        log.warn "insert errors", insert_errors: res.insert_errors if res.insert_errors && !res.insert_errors.empty?
+        log.warn "insert errors", insert_errors: res.insert_errors.to_s if res.insert_errors && !res.insert_errors.empty?
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         @client = nil
 
@@ -138,7 +146,7 @@ module Fluent
         end
       end
 
-      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60)
+      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60, auto_create_table: nil, time_partitioning_type: nil, time_partitioning_expiration: nil)
         configuration = {
           configuration: {
             load: {
@@ -157,6 +165,7 @@ module Fluent
             }
           }
         }
+        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if time_partitioning_type
         configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
         # If target table is already exist, omit schema configuration.
@@ -188,7 +197,13 @@ module Fluent
         reason = e.respond_to?(:reason) ? e.reason : nil
         log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
 
-
+        if auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+          # Table Not Found: Auto Create Table
+          create_table(project, dataset, table_id, fields, time_partitioning_type: time_partitioning_type, time_partitioning_expiration: time_partitioning_expiration)
+          raise "table created. send rows next time."
+        end
+
+        return wait_load_job(project, dataset, job_id, table_id) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
 
         if RETRYABLE_ERROR_REASON.include?(reason) || e.is_a?(Google::Apis::ServerError)
           raise RetryableError.new(nil, e)
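Read together with the `CREATE_NEVER` disposition added a few hunks above, load jobs no longer create partitioned tables implicitly: when the target table is missing and `auto_create_table` is set, the table is created through the tables API (with partitioning applied) and the chunk is left to be retried. A simplified sketch of that flow, where `submit_load_job!` is a hypothetical stand-in for the real job submission and `create_table` is the writer method changed earlier in this diff:

```ruby
require "google/apis/bigquery_v2"

# Simplified reading of the rescue path added in this hunk (sketch, not gem code).
def load_with_auto_create(project, dataset, table_id, fields, auto_create_table:,
                          time_partitioning_type: nil, time_partitioning_expiration: nil)
  submit_load_job!(project, dataset, table_id) # hypothetical; raises Google::Apis::ClientError when the table is missing
rescue Google::Apis::ClientError => e
  raise unless auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
  # Table Not Found: create it (including time partitioning when configured),
  # then raise so the buffered chunk is written again on the next retry.
  create_table(project, dataset, table_id, fields,
               time_partitioning_type: time_partitioning_type,
               time_partitioning_expiration: time_partitioning_expiration)
  raise "table created. send rows next time."
end
```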
data/lib/fluent/plugin/out_bigquery.rb
CHANGED
@@ -143,6 +143,10 @@ module Fluent
     config_param :request_timeout_sec, :time, default: nil
     config_param :request_open_timeout_sec, :time, default: 60
 
+    ## Partitioning
+    config_param :time_partitioning_type, :enum, list: [:day], default: nil
+    config_param :time_partitioning_expiration, :time, default: nil
+
     ### Table types
     # https://developers.google.com/bigquery/docs/tables
     #
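A note on the parameter types, assuming standard Fluentd `config_param` semantics: `:enum` with `list: [:day]` accepts only `day`, and `:time` values are parsed into seconds, which matches the expectation in the test added later in this diff. A sketch of the resulting writer call for the test's configuration (`writer` and `fields` are stand-ins here):

```ruby
# With "time_partitioning_type day" and "time_partitioning_expiration 1h" configured,
# the new test below expects the writer to receive a symbol and a value in seconds:
writer.create_table('yourproject_id', 'yourdataset_id', 'foo', fields,
                    time_partitioning_type: :day,
                    time_partitioning_expiration: 3600)
```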
@@ -413,7 +417,7 @@ module Fluent
     rescue Fluent::BigQuery::Writer::Error => e
       if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
         # Table Not Found: Auto Create Table
-        writer.create_table(@project, @dataset, table_id, @fields)
+        writer.create_table(@project, @dataset, table_id, @fields, time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration)
         raise "table created. send rows next time."
       end
 
@@ -459,7 +463,8 @@ module Fluent
       create_upload_source(chunk) do |upload_source|
         res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
           ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
-          timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+          timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec, auto_create_table: @auto_create_table,
+          time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration
         })
       end
     rescue Fluent::BigQuery::Writer::Error => e
data/test/plugin/test_out_bigquery.rb
CHANGED
@@ -1428,7 +1428,69 @@ class BigQueryOutputTest < Test::Unit::TestCase
       skip_invalid_rows: false,
       ignore_unknown_values: false,
     )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
-    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: nil, time_partitioning_expiration: nil)
+
+    chunk = Fluent::MemoryBufferChunk.new("my.tag")
+    chunk << message.to_msgpack
+
+    driver.instance.start
+
+    assert_raise(RuntimeError) {
+      driver.instance.write(chunk)
+    }
+    driver.instance.shutdown
+  end
+
+  def test_auto_create_partitioned_table_by_bigquery_api
+    now = Time.now
+    message = {
+      "json" => {
+        "time" => now.to_i,
+        "request" => {
+          "vhost" => "bar",
+          "path" => "/path/to/baz",
+          "method" => "GET",
+          "protocol" => "HTTP/1.0",
+          "agent" => "libwww",
+          "referer" => "http://referer.example",
+          "time" => (now - 1).to_f,
+          "bot_access" => true,
+          "loginsession" => false,
+        },
+        "remote" => {
+          "host" => "remote.example",
+          "ip" => "192.168.1.1",
+          "user" => "nagachika",
+        },
+        "response" => {
+          "status" => 200,
+          "bytes" => 72,
+        },
+      }
+    }.deep_symbolize_keys
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      time_format %s
+      time_field time
+
+      auto_create_table true
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+
+      time_partitioning_type day
+      time_partitioning_expiration 1h
+    CONFIG
+    writer = stub_writer(driver)
+    mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], hash_including(
+      skip_invalid_rows: false,
+      ignore_unknown_values: false,
+    )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: :day, time_partitioning_expiration: 3600)
 
     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     chunk << message.to_msgpack
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Naoya Ito
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-
+date: 2016-10-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake