fluent-plugin-bigquery 0.3.0 → 0.3.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 77b839947c8f721f341499a0be5c9f21552833ad
-   data.tar.gz: 3efa172577cb54e19290f0df245ecbb677994869
+   metadata.gz: f0f23c20f0c7feb488dc105d5b4ada3dcc27a4a3
+   data.tar.gz: 41497e9b9fa0e5b518f63dc4c664ae67b17f763f
  SHA512:
-   metadata.gz: f0dbcd7b1cd8f4462657006f9a51338e0327f0b61a172ae0d39860a4c88bae4af0ba8f299cc9657a8ac3eb44acf53f5bc1161d023a06b5ab6d133d4ab72aeba2
-   data.tar.gz: aabd97536deeeb1a6b3f55e0c71f28ddbcfbf99ced84c7d299e881409dab907a34a943c8fa69df0a3373d6c6e1eb5a83fcde157ea6c601b90d84ad7c93b816b4
+   metadata.gz: a62ad2c42da6fabfc422457e06a40095957abc1b740cf3fa99337d2f03802b92dd79e314f60ebef1911ed4ba09ff972bf5adceccafc701cbd4d0d7e9411387e3
+   data.tar.gz: 6d399567ec70f674f9ce8725d16561f5362f38c64a5267ec4e519e72f40431b65852dfb03a2486a52bf1594b97615b94063e0cf101177be1da37544be823edf2
data/README.md CHANGED
@@ -10,7 +10,7 @@
  * load data
    * for data loading as batch jobs, for big amount of data
    * https://developers.google.com/bigquery/loading-data-into-bigquery
-
+
  Current version of this plugin supports Google API with Service Account Authentication, but does not support
  OAuth flow for installed applications.
 
@@ -57,6 +57,8 @@ OAuth flow for installed applications.
  | insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
  | request_timeout_sec | integer | no | nil | Bigquery API response timeout |
  | request_open_timeout_sec | integer | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
+ | time_partitioning_type | enum | no (either day) | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
+ | time_partitioning_expiration | time | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
 
  ### Standard Options
 
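A minimal sketch of how the two new keys might sit in an `insert` configuration; the key path, table name, schema path, and the `1h` expiration are illustrative placeholders drawn from the README and test examples in this diff, not prescribed values:

```apache
<match dummy>
  @type bigquery

  method insert
  auth_method json_key
  json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json

  project yourproject_id
  dataset yourdataset_id
  table tablename

  auto_create_table true
  schema_path /path/to/httpd.schema

  # experimental BigQuery time partitioning; day is the only accepted type
  time_partitioning_type day
  time_partitioning_expiration 1h
</match>
```

`time_partitioning_expiration` takes a fluentd time value (`1h` parses to 3600 seconds); as the writer change below shows, the plugin multiplies it by 1000 before sending `expiration_ms`.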
@@ -76,21 +78,21 @@ Configure insert specifications with target table schema, with your credentials.
  ```apache
  <match dummy>
    @type bigquery
-
+
    method insert # default
-
+
    auth_method private_key # default
    email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
    private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
    # private_key_passphrase notasecret # default
-
+
    project yourproject_id
    dataset yourdataset_id
    table tablename
-
+
    time_format %s
    time_field time
-
+
    field_integer time,status,bytes
    field_string rhost,vhost,path,method,protocol,agent,referer
    field_float requesttime
@@ -103,28 +105,28 @@ For high rate inserts over streaming inserts, you should specify flush intervals
  ```apache
  <match dummy>
    @type bigquery
-
+
    method insert # default
-
+
    flush_interval 1 # flush as frequent as possible
-
+
    buffer_chunk_records_limit 300 # default rate limit for users is 100
    buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
-
+
    num_threads 16
-
+
    auth_method private_key # default
    email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
    private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
    # private_key_passphrase notasecret # default
-
+
    project yourproject_id
    dataset yourdataset_id
    tables accesslog1,accesslog2,accesslog3
-
+
    time_format %s
    time_field time
-
+
    field_integer time,status,bytes
    field_string rhost,vhost,path,method,protocol,agent,referer
    field_float requesttime
@@ -214,10 +216,10 @@ download its JSON key and deploy the key with fluentd.
  ```apache
  <match dummy>
    @type bigquery
-
+
    auth_method json_key
    json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json
-
+
    project yourproject_id
    dataset yourdataset_id
    table tablename
@@ -231,10 +233,10 @@ You need to only include `private_key` and `client_email` key from JSON key file
  ```apache
  <match dummy>
    @type bigquery
-
+
    auth_method json_key
    json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
-
+
    project yourproject_id
    dataset yourdataset_id
    table tablename
@@ -252,16 +254,16 @@ Compute Engine instance, then you can configure fluentd like this.
  ```apache
  <match dummy>
    @type bigquery
-
+
    auth_method compute_engine
-
+
    project yourproject_id
    dataset yourdataset_id
    table tablename
-
+
    time_format %s
    time_field time
-
+
    field_integer time,status,bytes
    field_string rhost,vhost,path,method,protocol,agent,referer
    field_float requesttime
@@ -296,13 +298,13 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
  ```apache
  <match dummy>
    @type bigquery
-
+
    ...
-
+
    project yourproject_id
    dataset yourdataset_id
    table accesslog_%Y_%m
-
+
    ...
  </match>
  ```
@@ -384,12 +386,12 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
  ```apache
  <match dummy>
    @type bigquery
-
+
    ...
-
+
    auto_create_table true
    table accesslog_%Y_%m
-
+
    ...
  </match>
  ```
@@ -408,12 +410,12 @@ you can also specify nested fields by prefixing their belonging record fields.
  ```apache
  <match dummy>
    @type bigquery
-
+
    ...
-
+
    time_format %s
    time_field time
-
+
    field_integer time,response.status,response.bytes
    field_string request.vhost,request.path,request.method,request.protocol,request.agent,request.referer,remote.host,remote.ip,remote.user
    field_float request.time
@@ -447,12 +449,12 @@ The second method is to specify a path to a BigQuery schema file instead of list
  ```apache
  <match dummy>
    @type bigquery
-
+
    ...
-
+
    time_format %s
    time_field time
-
+
    schema_path /path/to/httpd.schema
    field_integer time
  </match>
@@ -464,12 +466,12 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
  ```apache
  <match dummy>
    @type bigquery
-
+
    ...
-
+
    time_format %s
    time_field time
-
+
    fetch_schema true
    # fetch_schema_table other_table # if you want to fetch schema from other table
    field_integer time
@@ -489,9 +491,9 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
  ```apache
  <match dummy>
    @type bigquery
-
+
    ...
-
+
    insert_id_field uuid
    field_string uuid
  </match>
data/lib/fluent/plugin/bigquery/version.rb CHANGED
@@ -1,6 +1,6 @@
  module Fluent
    module BigQueryPlugin
-     VERSION = "0.3.0"
+     VERSION = "0.3.1"
    end
  end
 
data/lib/fluent/plugin/bigquery/writer.rb CHANGED
@@ -60,20 +60,28 @@ module Fluent
        @client = client
      end
 
-     def create_table(project, dataset, table_id, record_schema)
+     def create_table(project, dataset, table_id, record_schema, time_partitioning_type: nil, time_partitioning_expiration: nil)
        create_table_retry_limit = 3
        create_table_retry_wait = 1
        create_table_retry_count = 0
 
        begin
-         client.insert_table(project, dataset, {
+         definition = {
            table_reference: {
              table_id: table_id,
            },
            schema: {
              fields: record_schema.to_a,
            }
-         }, {})
+         }
+
+         if time_partitioning_type
+           definition[:time_partitioning] = {
+             type: time_partitioning_type.to_s.upcase,
+             expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
+           }.compact
+         end
+         client.insert_table(project, dataset, definition, {})
          log.debug "create table", project_id: project, dataset: dataset, table: table_id
          @client = nil
        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
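For readers skimming the hunk above, here is a self-contained Ruby sketch (not part of the gem) of how the new keyword arguments become the `time_partitioning` block passed to `insert_table`, using the `day` / `1h` values from the test added later in this diff; the table id is a placeholder:

```ruby
# Sketch only: reproduces the time_partitioning block built by create_table above.
time_partitioning_type       = :day  # config_param :time_partitioning_type, :enum, list: [:day]
time_partitioning_expiration = 3600  # "1h" parsed by the :time config type, in seconds

definition = {
  table_reference: { table_id: "tablename" },  # placeholder table id
  schema: { fields: [] },                      # schema fields omitted in this sketch
}

if time_partitioning_type
  definition[:time_partitioning] = {
    type: time_partitioning_type.to_s.upcase,  # :day -> "DAY"
    expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
  }.compact                                    # drops expiration_ms when it is nil
end

p definition[:time_partitioning]  # type "DAY", expiration_ms 3600000 (1 hour)
```

The `.compact` keeps the request clean when no expiration is configured, since `expiration_ms` would otherwise be sent as `nil`.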
@@ -124,7 +132,7 @@ module Fluent
          options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
        })
        log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
-       log.warn "insert errors", insert_errors: res.insert_errors if res.insert_errors && !res.insert_errors.empty?
+       log.warn "insert errors", insert_errors: res.insert_errors.to_s if res.insert_errors && !res.insert_errors.empty?
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        @client = nil
 
@@ -138,7 +146,7 @@ module Fluent
        end
      end
 
-     def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60)
+     def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60, auto_create_table: nil, time_partitioning_type: nil, time_partitioning_expiration: nil)
        configuration = {
          configuration: {
            load: {
@@ -157,6 +165,7 @@ module Fluent
            }
          }
        }
+       configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if time_partitioning_type
        configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
        # If target table is already exist, omit schema configuration.
@@ -188,7 +197,13 @@ module Fluent
        reason = e.respond_to?(:reason) ? e.reason : nil
        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
 
-       return wait_load_job(project, dataset, job_id, table_id, retryable: false) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+       if auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+         # Table Not Found: Auto Create Table
+         create_table(project, dataset, table_id, fields, time_partitioning_type: time_partitioning_type, time_partitioning_expiration: time_partitioning_expiration)
+         raise "table created. send rows next time."
+       end
+
+       return wait_load_job(project, dataset, job_id, table_id) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
 
        if RETRYABLE_ERROR_REASON.include?(reason) || e.is_a?(Google::Apis::ServerError)
          raise RetryableError.new(nil, e)
data/lib/fluent/plugin/out_bigquery.rb CHANGED
@@ -143,6 +143,10 @@ module Fluent
      config_param :request_timeout_sec, :time, default: nil
      config_param :request_open_timeout_sec, :time, default: 60
 
+     ## Partitioning
+     config_param :time_partitioning_type, :enum, list: [:day], default: nil
+     config_param :time_partitioning_expiration, :time, default: nil
+
      ### Table types
      # https://developers.google.com/bigquery/docs/tables
      #
@@ -413,7 +417,7 @@ module Fluent
      rescue Fluent::BigQuery::Writer::Error => e
        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
          # Table Not Found: Auto Create Table
-         writer.create_table(@project, @dataset, table_id, @fields)
+         writer.create_table(@project, @dataset, table_id, @fields, time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration)
          raise "table created. send rows next time."
        end
 
@@ -459,7 +463,8 @@ module Fluent
      create_upload_source(chunk) do |upload_source|
        res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
          ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
-         timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+         timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec, auto_create_table: @auto_create_table,
+         time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration
        })
      end
    rescue Fluent::BigQuery::Writer::Error => e
data/test/plugin/test_out_bigquery.rb CHANGED
@@ -1428,7 +1428,69 @@ class BigQueryOutputTest < Test::Unit::TestCase
      skip_invalid_rows: false,
      ignore_unknown_values: false,
    )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
-   mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))
+   mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: nil, time_partitioning_expiration: nil)
+
+   chunk = Fluent::MemoryBufferChunk.new("my.tag")
+   chunk << message.to_msgpack
+
+   driver.instance.start
+
+   assert_raise(RuntimeError) {
+     driver.instance.write(chunk)
+   }
+   driver.instance.shutdown
+ end
+
+ def test_auto_create_partitioned_table_by_bigquery_api
+   now = Time.now
+   message = {
+     "json" => {
+       "time" => now.to_i,
+       "request" => {
+         "vhost" => "bar",
+         "path" => "/path/to/baz",
+         "method" => "GET",
+         "protocol" => "HTTP/1.0",
+         "agent" => "libwww",
+         "referer" => "http://referer.example",
+         "time" => (now - 1).to_f,
+         "bot_access" => true,
+         "loginsession" => false,
+       },
+       "remote" => {
+         "host" => "remote.example",
+         "ip" => "192.168.1.1",
+         "user" => "nagachika",
+       },
+       "response" => {
+         "status" => 200,
+         "bytes" => 72,
+       },
+     }
+   }.deep_symbolize_keys
+
+   driver = create_driver(<<-CONFIG)
+     table foo
+     email foo@bar.example
+     private_key_path /path/to/key
+     project yourproject_id
+     dataset yourdataset_id
+
+     time_format %s
+     time_field time
+
+     auto_create_table true
+     schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+
+     time_partitioning_type day
+     time_partitioning_expiration 1h
+   CONFIG
+   writer = stub_writer(driver)
+   mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], hash_including(
+     skip_invalid_rows: false,
+     ignore_unknown_values: false,
+   )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
+   mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: :day, time_partitioning_expiration: 3600)
 
    chunk = Fluent::MemoryBufferChunk.new("my.tag")
    chunk << message.to_msgpack
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: fluent-plugin-bigquery
  version: !ruby/object:Gem::Version
-   version: 0.3.0
+   version: 0.3.1
  platform: ruby
  authors:
  - Naoya Ito
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-10-07 00:00:00.000000000 Z
+ date: 2016-10-28 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rake