fluent-plugin-bigquery 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: b463f412345eb71d1b263bf56e0cd51ebe1c2dacffaa223293edb8d4e5776e73
- data.tar.gz: f5f7766b2d0f4498239389ef38eb29ef9d20dbe9b118890e8d651b23330d33ca
+ metadata.gz: 36b950bf0783d3ce350d7c7514f5b7946b10fe4b867aec015c9331656e86eb48
+ data.tar.gz: b4b8e92f41008043b09822b20698a7e29ca8daf9ba69c2a5c38c696553e86d71
  SHA512:
- metadata.gz: 8d3851b83d9cbc7c802836dc5f5709d2f92009f980a3a6d3566730eea55fdaf697540c0370220441ed1d88687c27eb8677506e9897693469ef4fcb347d1e7825
- data.tar.gz: 39223f99503c53a812549b4ff8de2a94c3b7db670e6dd9819840d86d561fe68c922f82c18f0201abe8625b2dbf79d0741413d21c56d9d0855b1889b68946a2f8
+ metadata.gz: 01d3d39d9247134ca9059b990d0d6a52f308b27711d8cd989de30dfeb4e91a1673f1047d4e9269d24447169d9ec4bbac1d0d9b9f7d93b08b7be5d6c170593f1f
+ data.tar.gz: f226de7925fb048ba5533bf9b7c626f43e4b63eeb92c119d700737d1ae44611fb6fe6294e1ed5f989456de2ee3e1f98334c2d4cd1d89c49b52ef945a3674c8ce
data/README.md CHANGED
@@ -44,6 +44,7 @@ Because embbeded gem dependency sometimes restricts ruby environment.
  | private_key_path | string | yes (private_key) | no | nil | GCP Private Key file path |
  | private_key_passphrase | string | yes (private_key) | no | nil | GCP Private Key Passphrase |
  | json_key | string | yes (json_key) | no | nil | GCP JSON Key file path or JSON Key string |
+ | location | string | no | no | nil | BigQuery Data Location. The geographic location of the job. Required except for US and EU. |
  | project | string | yes | yes | nil | |
  | dataset | string | yes | yes | nil | |
  | table | string | yes (either `tables`) | yes | nil | |
@@ -57,10 +58,10 @@ Because embbeded gem dependency sometimes restricts ruby environment.
  | schema_cache_expire | integer | no | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
  | request_timeout_sec | integer | no | no | nil | Bigquery API response timeout |
  | request_open_timeout_sec | integer | no | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
- | time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
- | time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition(experimental feature on BigQuery). |
- | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
- | time_partitioning_require_partition_filter | bool | no | no | false | If true, queries over this table require a partition filter that can be used for partition elimination to be specified. (experimental feature on BigQuery) |
+ | time_partitioning_type | enum | no (either day) | no | nil | Type of bigquery time partitioning feature. |
+ | time_partitioning_field | string | no | no | nil | Field used to determine how to create a time-based partition. |
+ | time_partitioning_expiration | time | no | no | nil | Expiration milliseconds for bigquery time partitioning. |
+ | clustering_fields | array(string) | no | no | nil | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |

  #### bigquery_insert

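Taken together, the two new options drop into an existing output configuration. A minimal sketch of a 2.2.0 config using them (project, dataset, and key path are placeholders; `fetch_schema` stands in for whichever schema option you already use):

```apache
<match dummy>
  @type bigquery_load

  auth_method json_key
  json_key /path/to/key.json

  project yourproject_id
  dataset yourdataset_id
  table accesslog
  fetch_schema true

  # New in 2.2.0: geographic location of the job, required except for US and EU
  location asia-northeast1

  # New in 2.2.0: cluster the time-partitioned table by these columns, in sort order
  time_partitioning_type day
  time_partitioning_field time
  clustering_fields ["time", "vhost"]
</match>
```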
@@ -433,7 +434,7 @@ Use placeholder.

  ```apache
  <match dummy>
-   @type bigquery_insert
+   @type bigquery_load

    ...
    table accesslog$%Y%m%d
@@ -446,6 +447,8 @@ Use placeholder.
  ```

  But, Dynamic table creating doesn't support date partitioned table yet.
+ And streaming insert is not allowed to insert with `$%Y%m%d` suffix.
+ If you use date partitioned table with streaming insert, Please omit `$%Y%m%d` suffix from `table`.

  ### Dynamic table creating

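So for a date partitioned table the two output types now differ: `bigquery_load` can address a partition via the `$%Y%m%d` suffix, while `bigquery_insert` must name the bare table and let the partitioning column route each row. A sketch of the streaming variant (elisions as in the examples above):

```apache
<match dummy>
  @type bigquery_insert

  ...
  table accesslog          # no $%Y%m%d suffix with streaming insert
  time_partitioning_type day
  time_partitioning_field time
</match>
```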
@@ -467,6 +470,8 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
  </match>
  ```

+ Also, you can create clustered table by using `clustering_fields`.
+
  ### Table schema

  There are three methods to describe the schema of the target table.
@@ -86,6 +86,16 @@ module Fluent
      end
    end

+   class NumericFieldSchema < FieldSchema
+     def type
+       :numeric
+     end
+
+     def format_one(value)
+       value.to_s
+     end
+   end
+
    class BooleanFieldSchema < FieldSchema
      def type
        :boolean
@@ -169,6 +179,7 @@ module Fluent
        string: StringFieldSchema,
        integer: IntegerFieldSchema,
        float: FloatFieldSchema,
+       numeric: NumericFieldSchema,
        boolean: BooleanFieldSchema,
        timestamp: TimestampFieldSchema,
        date: DateFieldSchema,
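BigQuery's NUMERIC type is a fixed-precision decimal, so the new schema class emits values as strings rather than floats to avoid precision loss in transit. A minimal sketch of the behaviour (the namespace and the `FieldSchema.new(name)` constructor are assumed from the gem's layout; the field name is made up):

```ruby
numeric = Fluent::BigQuery::NumericFieldSchema.new("utilisation")

numeric.type              #=> :numeric
numeric.format_one(0.837) #=> "0.837" (value.to_s, as in the diff above)
```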
@@ -1,5 +1,5 @@
  module Fluent
    module BigQueryPlugin
-     VERSION = "2.1.0".freeze
+     VERSION = "2.2.0".freeze
    end
  end
@@ -35,6 +35,7 @@ module Fluent
        }

        definition.merge!(time_partitioning: time_partitioning) if time_partitioning
+       definition.merge!(clustering: clustering) if clustering
        client.insert_table(project, dataset, definition, {})
        log.debug "create table", project_id: project, dataset: dataset, table: table_id
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
@@ -149,6 +150,7 @@ module Fluent
        raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
        configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
        configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
+       configuration[:configuration][:load].merge!(clustering: clustering) if clustering
      end
    end

@@ -174,8 +176,9 @@ module Fluent
      def fetch_load_job(job_reference)
        project = job_reference.project_id
        job_id = job_reference.job_id
+       location = @options[:location]

-       res = client.get_job(project, job_id)
+       res = client.get_job(project, job_id, location: location)
        log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)

        if res.status.state == "DONE"
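This is the change the new `location` option exists for: a load job that runs outside the US and EU multi-regions can only be polled if `jobs.get` is told where it ran, otherwise the lookup fails with notFound even though the job exists. A sketch of the call the writer now makes (`client` is the `Google::Apis::BigqueryV2::BigqueryService`; the IDs are placeholders):

```ruby
res = client.get_job("yourproject_id", "dummy_job_id", location: "asia-northeast1")
res.status.state #=> "PENDING", "RUNNING", or "DONE"
```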
@@ -309,13 +312,24 @@ module Fluent
          type: @options[:time_partitioning_type].to_s.upcase,
          field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
          expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
-         require_partition_filter: @options[:time_partitioning_require_partition_filter],
        }.reject { |_, v| v.nil? }
      else
        @time_partitioning
      end
    end

+   def clustering
+     return @clustering if instance_variable_defined?(:@clustering)
+
+     if @options[:clustering_fields]
+       @clustering = {
+         fields: @options[:clustering_fields]
+       }
+     else
+       @clustering
+     end
+   end
+
    def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
      try_count ||= 1
      res = client.insert_all_table_data(project, dataset, table_id, body, {})
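With both hooks in place, configuring `clustering_fields ["time", "vhost"]` yields a table definition shaped like the one the new test below asserts against. A sketch of the hash handed to `insert_table` (schema fields elided):

```ruby
definition = {
  table_reference: { table_id: "foo" },
  schema: { fields: [] },  # actual schema fields elided
  time_partitioning: { type: "DAY", field: "time", expiration_ms: 3600000 },
  clustering: { fields: ["time", "vhost"] },
}
```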
@@ -29,6 +29,9 @@ module Fluent
    config_param :private_key_path, :string, default: nil
    config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
    config_param :json_key, default: nil, secret: true
+   # The geographic location of the job. Required except for US and EU.
+   # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
+   config_param :location, :string, default: nil

    # see as simple reference
    # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
@@ -69,7 +72,9 @@ module Fluent
    config_param :time_partitioning_type, :enum, list: [:day], default: nil
    config_param :time_partitioning_field, :string, default: nil
    config_param :time_partitioning_expiration, :time, default: nil
-   config_param :time_partitioning_require_partition_filter, :bool, default: false
+
+   ## Clustering
+   config_param :clustering_fields, :array, default: nil

    ## Formatter
    config_section :format do
@@ -132,6 +137,7 @@ module Fluent
      private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
      email: @email,
      json_key: @json_key,
+     location: @location,
      source_format: @source_format,
      skip_invalid_rows: @skip_invalid_rows,
      ignore_unknown_values: @ignore_unknown_values,
@@ -142,7 +148,7 @@ module Fluent
      time_partitioning_type: @time_partitioning_type,
      time_partitioning_field: @time_partitioning_field,
      time_partitioning_expiration: @time_partitioning_expiration,
-     time_partitioning_require_partition_filter: @time_partitioning_require_partition_filter,
+     clustering_fields: @clustering_fields,
      timeout_sec: @request_timeout_sec,
      open_timeout_sec: @request_open_timeout_sec,
    })
@@ -400,6 +400,85 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
        }
      }

+     driver = create_driver(<<-CONFIG)
+       table foo
+       email foo@bar.example
+       private_key_path /path/to/key
+       project yourproject_id
+       dataset yourdataset_id
+
+       time_format %s
+       time_field time
+
+       auto_create_table true
+       schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+
+       time_partitioning_type day
+       time_partitioning_field time
+       time_partitioning_expiration 1h
+     CONFIG
+
+     stub_writer do |writer|
+       body = {
+         rows: [message],
+         skip_invalid_rows: false,
+         ignore_unknown_values: false,
+       }
+       mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
+         raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+       end.at_least(1)
+       mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
+
+       mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
+         table_reference: {
+           table_id: 'foo',
+         },
+         schema: {
+           fields: driver.instance.instance_variable_get(:@table_schema).to_a,
+         },
+         time_partitioning: {
+           type: 'DAY',
+           field: 'time',
+           expiration_ms: 3600000,
+         },
+       }, {})
+     end
+
+     assert_raise(RuntimeError) do
+       driver.run do
+         driver.feed("tag", Fluent::EventTime.now, message[:json])
+       end
+     end
+   end
+
+   def test_auto_create_clustered_table_by_bigquery_api
+     now = Time.now
+     message = {
+       json: {
+         time: now.to_i,
+         request: {
+           vhost: "bar",
+           path: "/path/to/baz",
+           method: "GET",
+           protocol: "HTTP/1.0",
+           agent: "libwww",
+           referer: "http://referer.example",
+           time: (now - 1).to_f,
+           bot_access: true,
+           loginsession: false,
+         },
+         remote: {
+           host: "remote.example",
+           ip: "192.168.1.1",
+           user: "nagachika",
+         },
+         response: {
+           status: 200,
+           bytes: 72,
+         },
+       }
+     }
+
      driver = create_driver(<<-CONFIG)
        table foo
        email foo@bar.example
@@ -417,6 +496,11 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
        time_partitioning_field time
        time_partitioning_expiration 1h
        time_partitioning_require_partition_filter true
+
+       clustering_fields [
+         "time",
+         "vhost"
+       ]
      CONFIG

      stub_writer do |writer|
@@ -441,7 +525,12 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
            type: 'DAY',
            field: 'time',
            expiration_ms: 3600000,
-           require_partition_filter: true
+         },
+         clustering: {
+           fields: [
+             'time',
+             'vhost',
+           ],
          },
        }, {})
      end
@@ -158,7 +158,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
        stub!.job_reference.stub!.job_id { "dummy_job_id" }
      end

-     mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
+     mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
        stub! do |s|
          s.id { 'dummy_job_id' }
          s.configuration.stub! do |_s|
@@ -241,7 +241,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
        stub!.job_reference.stub!.job_id { "dummy_job_id" }
      end

-     mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
+     mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
        stub! do |s|
          s.id { 'dummy_job_id' }
          s.configuration.stub! do |_s|
@@ -27,6 +27,11 @@ class RecordSchemaTest < Test::Unit::TestCase
          "name" => "argv",
          "type" => "STRING",
          "mode" => "REPEATED"
+       },
+       {
+         "name" => "utilisation",
+         "type" => "NUMERIC",
+         "mode" => "NULLABLE"
        }
      ]
    end
@@ -58,6 +63,11 @@ class RecordSchemaTest < Test::Unit::TestCase
          "type" => "STRING",
          "mode" => "REPEATED"
        },
+       {
+         "name" => "utilisation",
+         "type" => "NUMERIC",
+         "mode" => "NULLABLE"
+       },
        {
          "name" => "new_column",
          "type" => "STRING",
@@ -93,6 +103,11 @@ class RecordSchemaTest < Test::Unit::TestCase
          "type" => "STRING",
          "mode" => "REPEATED"
        },
+       {
+         "name" => "utilisation",
+         "type" => "NUMERIC",
+         "mode" => "NULLABLE"
+       }
      ]
    end

@@ -142,12 +157,12 @@ class RecordSchemaTest < Test::Unit::TestCase
      time = Time.local(2016, 2, 7, 19, 0, 0).utc

      formatted = fields.format_one({
-       "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42]
+       "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42], "utilisation" => "0.837"
      })
      assert_equal(
        formatted,
        {
-         "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"]
+         "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"], "utilisation" => "0.837"
        }
      )
    end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: fluent-plugin-bigquery
  version: !ruby/object:Gem::Version
-   version: 2.1.0
+   version: 2.2.0
  platform: ruby
  authors:
  - Naoya Ito
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-11-05 00:00:00.000000000 Z
+ date: 2019-08-20 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rake
@@ -183,8 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
    - !ruby/object:Gem::Version
      version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.7
+ rubygems_version: 3.0.3
  signing_key:
  specification_version: 4
  summary: Fluentd plugin to store data on Google BigQuery