fluent-plugin-bigquery 2.1.0 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b463f412345eb71d1b263bf56e0cd51ebe1c2dacffaa223293edb8d4e5776e73
4
- data.tar.gz: f5f7766b2d0f4498239389ef38eb29ef9d20dbe9b118890e8d651b23330d33ca
3
+ metadata.gz: 36b950bf0783d3ce350d7c7514f5b7946b10fe4b867aec015c9331656e86eb48
4
+ data.tar.gz: b4b8e92f41008043b09822b20698a7e29ca8daf9ba69c2a5c38c696553e86d71
5
5
  SHA512:
6
- metadata.gz: 8d3851b83d9cbc7c802836dc5f5709d2f92009f980a3a6d3566730eea55fdaf697540c0370220441ed1d88687c27eb8677506e9897693469ef4fcb347d1e7825
7
- data.tar.gz: 39223f99503c53a812549b4ff8de2a94c3b7db670e6dd9819840d86d561fe68c922f82c18f0201abe8625b2dbf79d0741413d21c56d9d0855b1889b68946a2f8
6
+ metadata.gz: 01d3d39d9247134ca9059b990d0d6a52f308b27711d8cd989de30dfeb4e91a1673f1047d4e9269d24447169d9ec4bbac1d0d9b9f7d93b08b7be5d6c170593f1f
7
+ data.tar.gz: f226de7925fb048ba5533bf9b7c626f43e4b63eeb92c119d700737d1ae44611fb6fe6294e1ed5f989456de2ee3e1f98334c2d4cd1d89c49b52ef945a3674c8ce
data/README.md CHANGED
@@ -44,6 +44,7 @@ Because embbeded gem dependency sometimes restricts ruby environment.
44
44
  | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
45
45
  | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
46
46
  | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
47
+ | location | string | no | no | nil | BigQuery Data Location. The geographic location of the job. Required except for US and EU. |
47
48
  | project | string | yes | yes | nil | |
48
49
  | dataset | string | yes | yes | nil | |
49
50
  | table | string | yes (either `tables`) | yes | nil | |
@@ -57,10 +58,10 @@ Because embbeded gem dependency sometimes restricts ruby environment.
57
58
  | schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
58
59
  | request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
59
60
  | request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
60
- | time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
61
- | time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition(experimental feature on BigQuery). |
62
- | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
63
- | time_partitioning_require_partition_filter | bool | no | no | false | If true, queries over this table require a partition filter that can be used for partition elimination to be specified. (experimental feature on BigQuery) |
61
+ | time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature. |
62
+ | time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition. |
63
+ | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. |
64
+ | clustering_fields | array(string) | no | no | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
64
65
 
65
66
  #### bigquery_insert
66
67
 
@@ -433,7 +434,7 @@ Use placeholder.
433
434
 
434
435
  ```apache
435
436
  <match dummy>
436
- @type bigquery_insert
437
+ @type bigquery_load
437
438
 
438
439
  ...
439
440
  table accesslog$%Y%m%d
@@ -446,6 +447,8 @@ Use placeholder.
446
447
  ```
447
448
 
448
449
  But, Dynamic table creating doesn't support date partitioned table yet.
450
 + Also, streaming inserts are not allowed with the `$%Y%m%d` suffix.
451
 + If you use a date partitioned table with streaming inserts, please omit the `$%Y%m%d` suffix from `table`.
449
452
 
450
453
  ### Dynamic table creating
451
454
 
@@ -467,6 +470,8 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
467
470
  </match>
468
471
  ```
469
472
 
473
 + Also, you can create a clustered table by using `clustering_fields`.
474
+
470
475
  ### Table schema
471
476
 
472
477
  There are three methods to describe the schema of the target table.
@@ -86,6 +86,16 @@ module Fluent
86
86
  end
87
87
  end
88
88
 
89
+ class NumericFieldSchema < FieldSchema
90
+ def type
91
+ :numeric
92
+ end
93
+
94
+ def format_one(value)
95
+ value.to_s
96
+ end
97
+ end
98
+
89
99
  class BooleanFieldSchema < FieldSchema
90
100
  def type
91
101
  :boolean
@@ -169,6 +179,7 @@ module Fluent
169
179
  string: StringFieldSchema,
170
180
  integer: IntegerFieldSchema,
171
181
  float: FloatFieldSchema,
182
+ numeric: NumericFieldSchema,
172
183
  boolean: BooleanFieldSchema,
173
184
  timestamp: TimestampFieldSchema,
174
185
  date: DateFieldSchema,
@@ -1,5 +1,5 @@
1
1
  module Fluent
2
2
  module BigQueryPlugin
3
- VERSION = "2.1.0".freeze
3
+ VERSION = "2.2.0".freeze
4
4
  end
5
5
  end
@@ -35,6 +35,7 @@ module Fluent
35
35
  }
36
36
 
37
37
  definition.merge!(time_partitioning: time_partitioning) if time_partitioning
38
+ definition.merge!(clustering: clustering) if clustering
38
39
  client.insert_table(project, dataset, definition, {})
39
40
  log.debug "create table", project_id: project, dataset: dataset, table: table_id
40
41
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
@@ -149,6 +150,7 @@ module Fluent
149
150
  raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
150
151
  configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
151
152
  configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
153
+ configuration[:configuration][:load].merge!(clustering: clustering) if clustering
152
154
  end
153
155
  end
154
156
 
@@ -174,8 +176,9 @@ module Fluent
174
176
  def fetch_load_job(job_reference)
175
177
  project = job_reference.project_id
176
178
  job_id = job_reference.job_id
179
+ location = @options[:location]
177
180
 
178
- res = client.get_job(project, job_id)
181
+ res = client.get_job(project, job_id, location: location)
179
182
  log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
180
183
 
181
184
  if res.status.state == "DONE"
@@ -309,13 +312,24 @@ module Fluent
309
312
  type: @options[:time_partitioning_type].to_s.upcase,
310
313
  field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
311
314
  expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
312
- require_partition_filter: @options[:time_partitioning_require_partition_filter],
313
315
  }.reject { |_, v| v.nil? }
314
316
  else
315
317
  @time_partitioning
316
318
  end
317
319
  end
318
320
 
321
+ def clustering
322
+ return @clustering if instance_variable_defined?(:@clustering)
323
+
324
+ if @options[:clustering_fields]
325
+ @clustering = {
326
+ fields: @options[:clustering_fields]
327
+ }
328
+ else
329
+ @clustering
330
+ end
331
+ end
332
+
319
333
  def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
320
334
  try_count ||= 1
321
335
  res = client.insert_all_table_data(project, dataset, table_id, body, {})
@@ -29,6 +29,9 @@ module Fluent
29
29
  config_param :private_key_path, :string, default: nil
30
30
  config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
31
31
  config_param :json_key, default: nil, secret: true
32
+ # The geographic location of the job. Required except for US and EU.
33
+ # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
34
+ config_param :location, :string, default: nil
32
35
 
33
36
  # see as simple reference
34
37
  # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
@@ -69,7 +72,9 @@ module Fluent
69
72
  config_param :time_partitioning_type, :enum, list: [:day], default: nil
70
73
  config_param :time_partitioning_field, :string, default: nil
71
74
  config_param :time_partitioning_expiration, :time, default: nil
72
- config_param :time_partitioning_require_partition_filter, :bool, default: false
75
+
76
+ ## Clustering
77
+ config_param :clustering_fields, :array, default: nil
73
78
 
74
79
  ## Formatter
75
80
  config_section :format do
@@ -132,6 +137,7 @@ module Fluent
132
137
  private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
133
138
  email: @email,
134
139
  json_key: @json_key,
140
+ location: @location,
135
141
  source_format: @source_format,
136
142
  skip_invalid_rows: @skip_invalid_rows,
137
143
  ignore_unknown_values: @ignore_unknown_values,
@@ -142,7 +148,7 @@ module Fluent
142
148
  time_partitioning_type: @time_partitioning_type,
143
149
  time_partitioning_field: @time_partitioning_field,
144
150
  time_partitioning_expiration: @time_partitioning_expiration,
145
- time_partitioning_require_partition_filter: @time_partitioning_require_partition_filter,
151
+ clustering_fields: @clustering_fields,
146
152
  timeout_sec: @request_timeout_sec,
147
153
  open_timeout_sec: @request_open_timeout_sec,
148
154
  })
@@ -400,6 +400,85 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
400
400
  }
401
401
  }
402
402
 
403
+ driver = create_driver(<<-CONFIG)
404
+ table foo
405
+ email foo@bar.example
406
+ private_key_path /path/to/key
407
+ project yourproject_id
408
+ dataset yourdataset_id
409
+
410
+ time_format %s
411
+ time_field time
412
+
413
+ auto_create_table true
414
+ schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
415
+
416
+ time_partitioning_type day
417
+ time_partitioning_field time
418
+ time_partitioning_expiration 1h
419
+ CONFIG
420
+
421
+ stub_writer do |writer|
422
+ body = {
423
+ rows: [message],
424
+ skip_invalid_rows: false,
425
+ ignore_unknown_values: false,
426
+ }
427
+ mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
428
+ raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
429
+ end.at_least(1)
430
+ mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
431
+
432
+ mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
433
+ table_reference: {
434
+ table_id: 'foo',
435
+ },
436
+ schema: {
437
+ fields: driver.instance.instance_variable_get(:@table_schema).to_a,
438
+ },
439
+ time_partitioning: {
440
+ type: 'DAY',
441
+ field: 'time',
442
+ expiration_ms: 3600000,
443
+ },
444
+ }, {})
445
+ end
446
+
447
+ assert_raise(RuntimeError) do
448
+ driver.run do
449
+ driver.feed("tag", Fluent::EventTime.now, message[:json])
450
+ end
451
+ end
452
+ end
453
+
454
+ def test_auto_create_clustered_table_by_bigquery_api
455
+ now = Time.now
456
+ message = {
457
+ json: {
458
+ time: now.to_i,
459
+ request: {
460
+ vhost: "bar",
461
+ path: "/path/to/baz",
462
+ method: "GET",
463
+ protocol: "HTTP/1.0",
464
+ agent: "libwww",
465
+ referer: "http://referer.example",
466
+ time: (now - 1).to_f,
467
+ bot_access: true,
468
+ loginsession: false,
469
+ },
470
+ remote: {
471
+ host: "remote.example",
472
+ ip: "192.168.1.1",
473
+ user: "nagachika",
474
+ },
475
+ response: {
476
+ status: 200,
477
+ bytes: 72,
478
+ },
479
+ }
480
+ }
481
+
403
482
  driver = create_driver(<<-CONFIG)
404
483
  table foo
405
484
  email foo@bar.example
@@ -417,6 +496,11 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
417
496
  time_partitioning_field time
418
497
  time_partitioning_expiration 1h
419
498
  time_partitioning_require_partition_filter true
499
+
500
+ clustering_fields [
501
+ "time",
502
+ "vhost"
503
+ ]
420
504
  CONFIG
421
505
 
422
506
  stub_writer do |writer|
@@ -441,7 +525,12 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
441
525
  type: 'DAY',
442
526
  field: 'time',
443
527
  expiration_ms: 3600000,
444
- require_partition_filter: true
528
+ },
529
+ clustering: {
530
+ fields: [
531
+ 'time',
532
+ 'vhost',
533
+ ],
445
534
  },
446
535
  }, {})
447
536
  end
@@ -158,7 +158,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
158
158
  stub!.job_reference.stub!.job_id { "dummy_job_id" }
159
159
  end
160
160
 
161
- mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
161
+ mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
162
162
  stub! do |s|
163
163
  s.id { 'dummy_job_id' }
164
164
  s.configuration.stub! do |_s|
@@ -241,7 +241,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
241
241
  stub!.job_reference.stub!.job_id { "dummy_job_id" }
242
242
  end
243
243
 
244
- mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
244
+ mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
245
245
  stub! do |s|
246
246
  s.id { 'dummy_job_id' }
247
247
  s.configuration.stub! do |_s|
@@ -27,6 +27,11 @@ class RecordSchemaTest < Test::Unit::TestCase
27
27
  "name" => "argv",
28
28
  "type" => "STRING",
29
29
  "mode" => "REPEATED"
30
+ },
31
+ {
32
+ "name" => "utilisation",
33
+ "type" => "NUMERIC",
34
+ "mode" => "NULLABLE"
30
35
  }
31
36
  ]
32
37
  end
@@ -58,6 +63,11 @@ class RecordSchemaTest < Test::Unit::TestCase
58
63
  "type" => "STRING",
59
64
  "mode" => "REPEATED"
60
65
  },
66
+ {
67
+ "name" => "utilisation",
68
+ "type" => "NUMERIC",
69
+ "mode" => "NULLABLE"
70
+ },
61
71
  {
62
72
  "name" => "new_column",
63
73
  "type" => "STRING",
@@ -93,6 +103,11 @@ class RecordSchemaTest < Test::Unit::TestCase
93
103
  "type" => "STRING",
94
104
  "mode" => "REPEATED"
95
105
  },
106
+ {
107
+ "name" => "utilisation",
108
+ "type" => "NUMERIC",
109
+ "mode" => "NULLABLE"
110
+ }
96
111
  ]
97
112
  end
98
113
 
@@ -142,12 +157,12 @@ class RecordSchemaTest < Test::Unit::TestCase
142
157
  time = Time.local(2016, 2, 7, 19, 0, 0).utc
143
158
 
144
159
  formatted = fields.format_one({
145
- "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42]
160
+ "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42], "utilisation" => "0.837"
146
161
  })
147
162
  assert_equal(
148
163
  formatted,
149
164
  {
150
- "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"]
165
+ "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"], "utilisation" => "0.837"
151
166
  }
152
167
  )
153
168
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naoya Ito
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-11-05 00:00:00.000000000 Z
12
+ date: 2019-08-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -183,8 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
183
183
  - !ruby/object:Gem::Version
184
184
  version: '0'
185
185
  requirements: []
186
- rubyforge_project:
187
- rubygems_version: 2.7.7
186
+ rubygems_version: 3.0.3
188
187
  signing_key:
189
188
  specification_version: 4
190
189
  summary: Fluentd plugin to store data on Google BigQuery