fluent-plugin-bigquery 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 77b839947c8f721f341499a0be5c9f21552833ad
-  data.tar.gz: 3efa172577cb54e19290f0df245ecbb677994869
+  metadata.gz: f0f23c20f0c7feb488dc105d5b4ada3dcc27a4a3
+  data.tar.gz: 41497e9b9fa0e5b518f63dc4c664ae67b17f763f
 SHA512:
-  metadata.gz: f0dbcd7b1cd8f4462657006f9a51338e0327f0b61a172ae0d39860a4c88bae4af0ba8f299cc9657a8ac3eb44acf53f5bc1161d023a06b5ab6d133d4ab72aeba2
-  data.tar.gz: aabd97536deeeb1a6b3f55e0c71f28ddbcfbf99ced84c7d299e881409dab907a34a943c8fa69df0a3373d6c6e1eb5a83fcde157ea6c601b90d84ad7c93b816b4
+  metadata.gz: a62ad2c42da6fabfc422457e06a40095957abc1b740cf3fa99337d2f03802b92dd79e314f60ebef1911ed4ba09ff972bf5adceccafc701cbd4d0d7e9411387e3
+  data.tar.gz: 6d399567ec70f674f9ce8725d16561f5362f38c64a5267ec4e519e72f40431b65852dfb03a2486a52bf1594b97615b94063e0cf101177be1da37544be823edf2
data/README.md CHANGED
@@ -10,7 +10,7 @@
 * load data
   * for data loading as batch jobs, for big amount of data
   * https://developers.google.com/bigquery/loading-data-into-bigquery
-
+
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 
@@ -57,6 +57,8 @@ OAuth flow for installed applications.
 | insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
 | request_timeout_sec | integer | no | nil | Bigquery API response timeout |
 | request_open_timeout_sec | integer | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
+| time_partitioning_type | enum | no (either day) | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
+| time_partitioning_expiration | time | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
 
 ### Standard Options
 
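The two new options enable BigQuery's time-partitioned tables (marked experimental on BigQuery at the time of this release). A minimal sketch of how they slot into a configuration, assuming the surrounding auth and schema options from the examples later in this README and using illustrative values:

```apache
<match dummy>
  @type bigquery

  # ... auth_method, project, dataset, table and schema options as in the examples below ...

  auto_create_table true

  time_partitioning_type day        # day is the only accepted type
  time_partitioning_expiration 1h   # optional; partitions are kept indefinitely if omitted
</match>
```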
@@ -76,21 +78,21 @@ Configure insert specifications with target table schema, with your credentials.
 ```apache
 <match dummy>
   @type bigquery
-
+
   method insert # default
-
+
   auth_method private_key # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
   # private_key_passphrase notasecret # default
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -103,28 +105,28 @@ For high rate inserts over streaming inserts, you should specify flush intervals
 ```apache
 <match dummy>
   @type bigquery
-
+
   method insert # default
-
+
   flush_interval 1 # flush as frequent as possible
-
+
   buffer_chunk_records_limit 300 # default rate limit for users is 100
   buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
-
+
   num_threads 16
-
+
   auth_method private_key # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
   # private_key_passphrase notasecret # default
-
+
   project yourproject_id
   dataset yourdataset_id
   tables accesslog1,accesslog2,accesslog3
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -214,10 +216,10 @@ download its JSON key and deploy the key with fluentd.
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method json_key
   json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
@@ -231,10 +233,10 @@ You need to only include `private_key` and `client_email` key from JSON key file
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method json_key
   json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
@@ -252,16 +254,16 @@ Compute Engine instance, then you can configure fluentd like this.
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method compute_engine
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -296,13 +298,13 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   project yourproject_id
   dataset yourdataset_id
   table accesslog_%Y_%m
-
+
   ...
 </match>
 ```
@@ -384,12 +386,12 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   auto_create_table true
   table accesslog_%Y_%m
-
+
   ...
 </match>
 ```
@@ -408,12 +410,12 @@ you can also specify nested fields by prefixing their belonging record fields.
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   field_integer time,response.status,response.bytes
   field_string request.vhost,request.path,request.method,request.protocol,request.agent,request.referer,remote.host,remote.ip,remote.user
   field_float request.time
@@ -447,12 +449,12 @@ The second method is to specify a path to a BigQuery schema file instead of list
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   schema_path /path/to/httpd.schema
   field_integer time
 </match>
@@ -464,12 +466,12 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   fetch_schema true
   # fetch_schema_table other_table # if you want to fetch schema from other table
   field_integer time
@@ -489,9 +491,9 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   insert_id_field uuid
   field_string uuid
 </match>
@@ -1,6 +1,6 @@
 module Fluent
   module BigQueryPlugin
-    VERSION = "0.3.0"
+    VERSION = "0.3.1"
   end
 end
 
@@ -60,20 +60,28 @@ module Fluent
         @client = client
       end
 
-      def create_table(project, dataset, table_id, record_schema)
+      def create_table(project, dataset, table_id, record_schema, time_partitioning_type: nil, time_partitioning_expiration: nil)
         create_table_retry_limit = 3
         create_table_retry_wait = 1
         create_table_retry_count = 0
 
         begin
-          client.insert_table(project, dataset, {
+          definition = {
             table_reference: {
               table_id: table_id,
             },
             schema: {
               fields: record_schema.to_a,
             }
-          }, {})
+          }
+
+          if time_partitioning_type
+            definition[:time_partitioning] = {
+              type: time_partitioning_type.to_s.upcase,
+              expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
+            }.compact
+          end
+          client.insert_table(project, dataset, definition, {})
           log.debug "create table", project_id: project, dataset: dataset, table: table_id
           @client = nil
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
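For illustration, with `time_partitioning_type: :day` and `time_partitioning_expiration: 3600` (one hour, as in the new test at the end of this diff), the `definition` hash built by the new `create_table` code would look roughly like this; the table id and `record_schema` are placeholders:

```ruby
# Illustrative only; "foo" and record_schema stand in for real values.
definition = {
  table_reference: { table_id: "foo" },
  schema: { fields: record_schema.to_a },
  time_partitioning: {
    type: "DAY",              # time_partitioning_type.to_s.upcase
    expiration_ms: 3_600_000  # 3600 seconds * 1000; dropped by .compact when nil
  }
}
```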
@@ -124,7 +132,7 @@ module Fluent
           options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
         })
         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
-        log.warn "insert errors", insert_errors: res.insert_errors if res.insert_errors && !res.insert_errors.empty?
+        log.warn "insert errors", insert_errors: res.insert_errors.to_s if res.insert_errors && !res.insert_errors.empty?
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         @client = nil
 
@@ -138,7 +146,7 @@ module Fluent
         end
       end
 
-      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60)
+      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60, auto_create_table: nil, time_partitioning_type: nil, time_partitioning_expiration: nil)
         configuration = {
           configuration: {
             load: {
@@ -157,6 +165,7 @@ module Fluent
             }
           }
         }
+        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if time_partitioning_type
        configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
         # If target table is already exist, omit schema configuration.
@@ -188,7 +197,13 @@ module Fluent
         reason = e.respond_to?(:reason) ? e.reason : nil
         log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
 
-        return wait_load_job(project, dataset, job_id, table_id, retryable: false) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+        if auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+          # Table Not Found: Auto Create Table
+          create_table(project, dataset, table_id, fields, time_partitioning_type: time_partitioning_type, time_partitioning_expiration: time_partitioning_expiration)
+          raise "table created. send rows next time."
+        end
+
+        return wait_load_job(project, dataset, job_id, table_id) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
 
         if RETRYABLE_ERROR_REASON.include?(reason) || e.is_a?(Google::Apis::ServerError)
           raise RetryableError.new(nil, e)
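Together with the `CREATE_NEVER` disposition added above, the load path no longer lets the load job create the destination table itself (presumably because a job-created table would carry no partitioning spec); a missing table is instead expected to surface as a `Not Found: Table` error, which this new branch turns into an explicit `create_table` call carrying the partitioning settings, followed by a plain RuntimeError so that Fluentd's buffer retries the chunk once the table exists. The new test near the end of this diff asserts the same pattern on the streaming-insert path; a sketch of what a caller observes:

```ruby
# Sketch of the retry behavior exercised by the new test: the first write
# against a missing table creates the (partitioned) table and raises, and
# Fluentd retries the buffered chunk later, after the table exists.
begin
  driver.instance.write(chunk)
rescue RuntimeError => e
  e.message  # => "table created. send rows next time."
end
```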
@@ -143,6 +143,10 @@ module Fluent
     config_param :request_timeout_sec, :time, default: nil
     config_param :request_open_timeout_sec, :time, default: 60
 
+    ## Partitioning
+    config_param :time_partitioning_type, :enum, list: [:day], default: nil
+    config_param :time_partitioning_expiration, :time, default: nil
+
     ### Table types
     # https://developers.google.com/bigquery/docs/tables
     #
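Given the configuration used by the new test below (`time_partitioning_type day`, `time_partitioning_expiration 1h`), these parameters end up holding the following parsed values (a sketch; the millisecond conversion happens later, in the writer):

```ruby
@time_partitioning_type        # => :day  (:enum, restricted to [:day])
@time_partitioning_expiration  # => 3600  (:time, "1h" parsed into seconds)
```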
@@ -413,7 +417,7 @@ module Fluent
     rescue Fluent::BigQuery::Writer::Error => e
       if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
         # Table Not Found: Auto Create Table
-        writer.create_table(@project, @dataset, table_id, @fields)
+        writer.create_table(@project, @dataset, table_id, @fields, time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration)
         raise "table created. send rows next time."
       end
 
@@ -459,7 +463,8 @@ module Fluent
       create_upload_source(chunk) do |upload_source|
         res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
           ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
-          timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+          timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec, auto_create_table: @auto_create_table,
+          time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration
         })
       end
     rescue Fluent::BigQuery::Writer::Error => e
@@ -1428,7 +1428,69 @@ class BigQueryOutputTest < Test::Unit::TestCase
       skip_invalid_rows: false,
       ignore_unknown_values: false,
     )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
-    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: nil, time_partitioning_expiration: nil)
+
+    chunk = Fluent::MemoryBufferChunk.new("my.tag")
+    chunk << message.to_msgpack
+
+    driver.instance.start
+
+    assert_raise(RuntimeError) {
+      driver.instance.write(chunk)
+    }
+    driver.instance.shutdown
+  end
+
+  def test_auto_create_partitioned_table_by_bigquery_api
+    now = Time.now
+    message = {
+      "json" => {
+        "time" => now.to_i,
+        "request" => {
+          "vhost" => "bar",
+          "path" => "/path/to/baz",
+          "method" => "GET",
+          "protocol" => "HTTP/1.0",
+          "agent" => "libwww",
+          "referer" => "http://referer.example",
+          "time" => (now - 1).to_f,
+          "bot_access" => true,
+          "loginsession" => false,
+        },
+        "remote" => {
+          "host" => "remote.example",
+          "ip" => "192.168.1.1",
+          "user" => "nagachika",
+        },
+        "response" => {
+          "status" => 200,
+          "bytes" => 72,
+        },
+      }
+    }.deep_symbolize_keys
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      time_format %s
+      time_field time
+
+      auto_create_table true
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+
+      time_partitioning_type day
+      time_partitioning_expiration 1h
+    CONFIG
+    writer = stub_writer(driver)
+    mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], hash_including(
+      skip_invalid_rows: false,
+      ignore_unknown_values: false,
+    )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: :day, time_partitioning_expiration: 3600)
 
     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     chunk << message.to_msgpack
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Naoya Ito
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-07 00:00:00.000000000 Z
+date: 2016-10-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake