fluent-plugin-bigquery 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: eca8d5626366799524377edcdd7ced540cc633ef
4
- data.tar.gz: f39debef79fbb52c9c8c7068d4e68f8cc7bbc036
3
+ metadata.gz: e55e2b00afb7ffd8b32cc9b7fa0a305ab66af863
4
+ data.tar.gz: b93594427c232d70db4f7ceca1a28554732b5122
5
5
  SHA512:
6
- metadata.gz: bc6e6af3faf9b1eb9a6dc0fe96a17b8534828fbbcc1133f3e6f9d7c956cca5a65c15319dd2c8df6e4f55a674b561edbb88087ca7c5e298c24ee50db3924e546b
7
- data.tar.gz: 6eff9dcacc9e6b956cc2eb0b55b91f6db0b3418474f941bd83a18eb1f7cef33c68541b6a7d650c42f053edb7ae7464a460d737d09b71bb47e19591521758450f
6
+ metadata.gz: 5662de9f1d839a5ef38fd04df24bd4b90de8bd4a494020ecde58e20191df1317322b903eaef985f7cdaf68f24260ed86b802a6aca53dfb3e3825772547cafaf9
7
+ data.tar.gz: 03a4f811ccc74d36530a4d341218a31c9bc913f530321565696a6e70227c94067bc61a91c2ea97bd5ba5ebf2c617bc1da51e0c1b8c71e97037c444092f62c18c
data/README.md CHANGED
@@ -163,6 +163,26 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
163
163
  Note that the timestamp of logs and the date in the table id do not always match,
164
164
  because there is a time lag between collection and transmission of logs.
165
165
 
166
+ ### Dynamic table creation
167
+
168
+ When `auto_create_table` is set to `true`, the plugin tries to create the table using the BigQuery API when an insertion fails with code=404 "Not Found: Table ...".
169
+ The next retry of the insertion is expected to succeed.
170
+
171
+ NOTE: The `auto_create_table` option cannot be used together with `fetch_schema`. You should create the table in advance to use `fetch_schema`.
172
+
173
+ ```apache
174
+ <match dummy>
175
+ type bigquery
176
+
177
+ ...
178
+
179
+ auto_create_table true
180
+ table accesslog_%Y_%m
181
+
182
+ ...
183
+ </match>
184
+ ```
185
+
166
186
  ### Table schema
167
187
 
168
188
  There are three methods to describe the schema of the target table.
@@ -1,6 +1,6 @@
1
1
  module Fluent
2
2
  module BigQueryPlugin
3
- VERSION = "0.2.6"
3
+ VERSION = "0.2.7"
4
4
  end
5
5
  end
6
6
 
@@ -40,12 +40,12 @@ module Fluent
40
40
  # Available methods are:
41
41
  # * private_key -- Use service account credential
42
42
  # * compute_engine -- Use access token available in instances of ComputeEngine
43
- config_param :auth_method, :string, :default => 'private_key'
43
+ config_param :auth_method, :string, default: 'private_key'
44
44
 
45
45
  ### Service Account credential
46
- config_param :email, :string, :default => nil
47
- config_param :private_key_path, :string, :default => nil
48
- config_param :private_key_passphrase, :string, :default => 'notasecret'
46
+ config_param :email, :string, default: nil
47
+ config_param :private_key_path, :string, default: nil
48
+ config_param :private_key_passphrase, :string, default: 'notasecret'
49
49
 
50
50
  # see as simple reference
51
51
  # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
@@ -58,32 +58,34 @@ module Fluent
58
58
 
59
59
  # table_id
60
60
  # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
61
- config_param :table, :string, :default => nil
62
- config_param :tables, :string, :default => nil
63
-
64
- config_param :schema_path, :string, :default => nil
65
- config_param :fetch_schema, :bool, :default => false
66
- config_param :field_string, :string, :default => nil
67
- config_param :field_integer, :string, :default => nil
68
- config_param :field_float, :string, :default => nil
69
- config_param :field_boolean, :string, :default => nil
70
- config_param :field_timestamp, :string, :default => nil
61
+ config_param :table, :string, default: nil
62
+ config_param :tables, :string, default: nil
63
+
64
+ config_param :auto_create_table, :bool, default: false
65
+
66
+ config_param :schema_path, :string, default: nil
67
+ config_param :fetch_schema, :bool, default: false
68
+ config_param :field_string, :string, default: nil
69
+ config_param :field_integer, :string, default: nil
70
+ config_param :field_float, :string, default: nil
71
+ config_param :field_boolean, :string, default: nil
72
+ config_param :field_timestamp, :string, default: nil
71
73
  ### TODO: record field stream inserts doesn't works well?
72
74
  ### At table creation, table type json + field type record -> field type validation fails
73
75
  ### At streaming inserts, schema cannot be specified
74
- # config_param :field_record, :string, :defualt => nil
75
- # config_param :optional_data_field, :string, :default => nil
76
+ # config_param :field_record, :string, defualt: nil
77
+ # config_param :optional_data_field, :string, default: nil
76
78
 
77
- config_param :time_format, :string, :default => nil
78
- config_param :localtime, :bool, :default => nil
79
- config_param :utc, :bool, :default => nil
80
- config_param :time_field, :string, :default => nil
79
+ config_param :time_format, :string, default: nil
80
+ config_param :localtime, :bool, default: nil
81
+ config_param :utc, :bool, default: nil
82
+ config_param :time_field, :string, default: nil
81
83
 
82
- config_param :insert_id_field, :string, :default => nil
84
+ config_param :insert_id_field, :string, default: nil
83
85
 
84
- config_param :method, :string, :default => 'insert' # or 'load' # TODO: not implemented now
86
+ config_param :method, :string, default: 'insert' # or 'load' # TODO: not implemented now
85
87
 
86
- config_param :load_size_limit, :integer, :default => 1000**4 # < 1TB (1024^4) # TODO: not implemented now
88
+ config_param :load_size_limit, :integer, default: 1000**4 # < 1TB (1024^4) # TODO: not implemented now
87
89
  ### method: 'load'
88
90
  # https://developers.google.com/bigquery/loading-data-into-bigquery
89
91
  # Maximum File Sizes:
@@ -92,9 +94,9 @@ module Fluent
92
94
  # Without new-lines in strings: 1 TB
93
95
  # JSON 1 GB 1 TB
94
96
 
95
- config_param :row_size_limit, :integer, :default => 100*1000 # < 100KB # configurable in google ?
96
- # config_param :insert_size_limit, :integer, :default => 1000**2 # < 1MB
97
- # config_param :rows_per_second_limit, :integer, :default => 1000 # spike limit
97
+ config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
98
+ # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
99
+ # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
98
100
  ### method: ''Streaming data inserts support
99
101
  # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
100
102
  # Maximum row size: 100 KB
@@ -137,7 +139,7 @@ module Fluent
137
139
 
138
140
  case @auth_method
139
141
  when 'private_key'
140
- if !@email || !@private_key_path
142
+ unless @email && @private_key_path
141
143
  raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
142
144
  end
143
145
  when 'compute_engine'
@@ -146,7 +148,7 @@ module Fluent
146
148
  raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
147
149
  end
148
150
 
149
- if (!@table && !@tables) || (@table && @tables)
151
+ unless @table.nil? ^ @tables.nil?
150
152
  raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
151
153
  end
152
154
 
@@ -156,53 +158,34 @@ module Fluent
156
158
  if @schema_path
157
159
  @fields.load_schema(JSON.parse(File.read(@schema_path)))
158
160
  end
159
- if @field_string
160
- @field_string.split(',').each do |fieldname|
161
- @fields.register_field fieldname.strip, :string
162
- end
163
- end
164
- if @field_integer
165
- @field_integer.split(',').each do |fieldname|
166
- @fields.register_field fieldname.strip, :integer
167
- end
168
- end
169
- if @field_float
170
- @field_float.split(',').each do |fieldname|
171
- @fields.register_field fieldname.strip, :float
172
- end
173
- end
174
- if @field_boolean
175
- @field_boolean.split(',').each do |fieldname|
176
- @fields.register_field fieldname.strip, :boolean
177
- end
178
- end
179
- if @field_timestamp
180
- @field_timestamp.split(',').each do |fieldname|
181
- @fields.register_field fieldname.strip, :timestamp
182
- end
183
- end
184
161
 
185
- if @localtime.nil?
186
- if @utc
187
- @localtime = false
162
+ types = %w(string integer float boolean timestamp)
163
+ types.each do |type|
164
+ raw_fields = instance_variable_get("@field_#{type}")
165
+ next unless raw_fields
166
+ raw_fields.split(',').each do |field|
167
+ @fields.register_field field.strip, type.to_sym
188
168
  end
189
169
  end
170
+
171
+ @localtime = false if @localtime.nil? && @utc
172
+
190
173
  @timef = TimeFormatter.new(@time_format, @localtime)
191
174
 
192
175
  if @time_field
193
176
  keys = @time_field.split('.')
194
177
  last_key = keys.pop
195
- @add_time_field = lambda {|record, time|
178
+ @add_time_field = ->(record, time) {
196
179
  keys.inject(record) { |h, k| h[k] ||= {} }[last_key] = @timef.format(time)
197
180
  record
198
181
  }
199
182
  else
200
- @add_time_field = lambda {|record, time| record }
183
+ @add_time_field = ->(record, time) { record }
201
184
  end
202
185
 
203
186
  if @insert_id_field
204
187
  insert_id_keys = @insert_id_field.split('.')
205
- @get_insert_id = lambda {|record|
188
+ @get_insert_id = ->(record) {
206
189
  insert_id_keys.inject(record) {|h, k| h[k] }
207
190
  }
208
191
  else
@@ -223,17 +206,12 @@ module Fluent
223
206
  fetch_schema() if @fetch_schema
224
207
  end
225
208
 
226
- def shutdown
227
- super
228
- # nothing to do
229
- end
230
-
231
209
  def client
232
210
  return @cached_client if @cached_client && @cached_client_expiration > Time.now
233
211
 
234
212
  client = Google::APIClient.new(
235
- :application_name => 'Fluentd BigQuery plugin',
236
- :application_version => Fluent::BigQueryPlugin::VERSION
213
+ application_name: 'Fluentd BigQuery plugin',
214
+ application_version: Fluent::BigQueryPlugin::VERSION
237
215
  )
238
216
 
239
217
  case @auth_method
@@ -264,17 +242,20 @@ module Fluent
264
242
  current_time.strftime(table_id_format)
265
243
  end
266
244
 
267
- def insert(table_id_format, rows)
268
- table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
245
+ def create_table(table_id)
269
246
  res = client().execute(
270
- :api_method => @bq.tabledata.insert_all,
247
+ :api_method => @bq.tables.insert,
271
248
  :parameters => {
272
249
  'projectId' => @project,
273
250
  'datasetId' => @dataset,
274
- 'tableId' => table_id,
275
251
  },
276
252
  :body_object => {
277
- "rows" => rows
253
+ 'tableReference' => {
254
+ 'tableId' => table_id,
255
+ },
256
+ 'schema' => {
257
+ 'fields' => @fields.to_a,
258
+ },
278
259
  }
279
260
  )
280
261
  unless res.success?
@@ -289,8 +270,42 @@ module Fluent
289
270
  rescue => e
290
271
  log.warn "Parse error: google api error response body", :body => res.body
291
272
  end
273
+ if res_obj and res_obj['code'] == 409 and /Already Exists:/ =~ message
274
+ # ignore 'Already Exists' error
275
+ return
276
+ end
292
277
  end
293
- log.error "tabledata.insertAll API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => res.status, :message => message
278
+ log.error "tables.insert API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => res.status, :message => message
279
+ raise "failed to create table in bigquery" # TODO: error class
280
+ end
281
+ end
282
+
283
+ def insert(table_id_format, rows)
284
+ table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
285
+ res = client().execute(
286
+ api_method: @bq.tabledata.insert_all,
287
+ parameters: {
288
+ 'projectId' => @project,
289
+ 'datasetId' => @dataset,
290
+ 'tableId' => table_id,
291
+ },
292
+ body_object: {
293
+ "rows" => rows
294
+ }
295
+ )
296
+ unless res.success?
297
+ # api_error? -> client cache clear
298
+ @cached_client = nil
299
+
300
+ res_obj = extract_response_obj(res.body)
301
+ message = res_obj['error']['message'] || res.body
302
+ if res_obj
303
+ if @auto_create_table and res_obj and res_obj['error']['code'] == 404 and /Not Found: Table/ =~ message.to_s
304
+ # Table Not Found: Auto Create Table
305
+ create_table(table_id)
306
+ end
307
+ end
308
+ log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: res.status, message: message
294
309
  raise "failed to insert into bigquery" # TODO: error class
295
310
  end
296
311
  end
@@ -335,8 +350,8 @@ module Fluent
335
350
  table_id_format = @tablelist[0]
336
351
  table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
337
352
  res = client.execute(
338
- :api_method => @bq.tables.get,
339
- :parameters => {
353
+ api_method: @bq.tables.get,
354
+ parameters: {
340
355
  'projectId' => @project,
341
356
  'datasetId' => @dataset,
342
357
  'tableId' => table_id,
@@ -346,17 +361,8 @@ module Fluent
346
361
  unless res.success?
347
362
  # api_error? -> client cache clear
348
363
  @cached_client = nil
349
-
350
- message = res.body
351
- if res.body =~ /^\{/
352
- begin
353
- res_obj = JSON.parse(res.body)
354
- message = res_obj['error']['message'] || res.body
355
- rescue => e
356
- log.warn "Parse error: google api error response body", :body => res.body
357
- end
358
- end
359
- log.error "tables.get API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => res.status, :message => message
364
+ message = extract_error_message(res.body)
365
+ log.error "tables.get API", project_id: @project, dataset: @dataset, table: table_id, code: res.status, message: message
360
366
  raise "failed to fetch schema from bigquery" # TODO: error class
361
367
  end
362
368
 
@@ -370,19 +376,33 @@ module Fluent
370
376
  # raise NotImplementedError, "OAuth needs browser authentication..."
371
377
  #
372
378
  # client = Google::APIClient.new(
373
- # :application_name => 'Example Ruby application',
374
- # :application_version => '1.0.0'
379
+ # application_name: 'Example Ruby application',
380
+ # application_version: '1.0.0'
375
381
  # )
376
382
  # bigquery = client.discovered_api('bigquery', 'v2')
377
383
  # flow = Google::APIClient::InstalledAppFlow.new(
378
- # :client_id => @client_id
379
- # :client_secret => @client_secret
380
- # :scope => ['https://www.googleapis.com/auth/bigquery']
384
+ # client_id: @client_id
385
+ # client_secret: @client_secret
386
+ # scope: ['https://www.googleapis.com/auth/bigquery']
381
387
  # )
382
388
  # client.authorization = flow.authorize # browser authentication !
383
389
  # client
384
390
  # end
385
391
 
392
+ def extract_response_obj(response_body)
393
+ return nil unless response_body =~ /^\{/
394
+ JSON.parse(response_body)
395
+ rescue
396
+ log.warn "Parse error: google api error response body", body: response_body
397
+ return nil
398
+ end
399
+
400
+ def extract_error_message(response_body)
401
+ res_obj = extract_response_obj(response_body)
402
+ return response_body if res_obj.nil?
403
+ res_obj['error']['message'] || response_body
404
+ end
405
+
386
406
  class FieldSchema
387
407
  def initialize(name, mode = :nullable)
388
408
  unless [:nullable, :required, :repeated].include?(mode)
@@ -419,6 +439,14 @@ module Fluent
419
439
  def format_one(value)
420
440
  raise NotImplementedError, "Must implement in a subclass"
421
441
  end
442
+
443
+ def to_h
444
+ {
445
+ 'name' => name,
446
+ 'type' => type.to_s.upcase,
447
+ 'mode' => mode.to_s.upcase,
448
+ }
449
+ end
422
450
  end
423
451
 
424
452
  class StringFieldSchema < FieldSchema
@@ -473,12 +501,12 @@ module Fluent
473
501
 
474
502
  class RecordSchema < FieldSchema
475
503
  FIELD_TYPES = {
476
- :string => StringFieldSchema,
477
- :integer => IntegerFieldSchema,
478
- :float => FloatFieldSchema,
479
- :boolean => BooleanFieldSchema,
480
- :timestamp => TimestampFieldSchema,
481
- :record => RecordSchema
504
+ string: StringFieldSchema,
505
+ integer: IntegerFieldSchema,
506
+ float: FloatFieldSchema,
507
+ boolean: BooleanFieldSchema,
508
+ timestamp: TimestampFieldSchema,
509
+ record: RecordSchema
482
510
  }.freeze
483
511
 
484
512
  def initialize(name, mode = :nullable)
@@ -494,6 +522,21 @@ module Fluent
494
522
  @fields[name]
495
523
  end
496
524
 
525
+ def to_a
526
+ @fields.map do |_, field_schema|
527
+ field_schema.to_h
528
+ end
529
+ end
530
+
531
+ def to_h
532
+ {
533
+ 'name' => name,
534
+ 'type' => type.to_s.upcase,
535
+ 'mode' => mode.to_s.upcase,
536
+ 'fields' => self.to_a,
537
+ }
538
+ end
539
+
497
540
  def load_schema(schema, allow_overwrite=true)
498
541
  schema.each do |field|
499
542
  raise ConfigError, 'field must have type' unless field.key?('type')
@@ -717,6 +717,103 @@ class BigQueryOutputTest < Test::Unit::TestCase
717
717
  assert_equal 'foo_2014_08_11', table_id
718
718
  end
719
719
 
720
+ def test_auto_create_table_by_bigquery_api
721
+ now = Time.now
722
+ message = {
723
+ "json" => {
724
+ "time" => now.to_i,
725
+ "request" => {
726
+ "vhost" => "bar",
727
+ "path" => "/path/to/baz",
728
+ "method" => "GET",
729
+ "protocol" => "HTTP/1.0",
730
+ "agent" => "libwww",
731
+ "referer" => "http://referer.example",
732
+ "time" => (now - 1).to_f,
733
+ "bot_access" => true,
734
+ "loginsession" => false,
735
+ },
736
+ "remote" => {
737
+ "host" => "remote.example",
738
+ "ip" => "192.168.1.1",
739
+ "user" => "nagachika",
740
+ },
741
+ "response" => {
742
+ "status" => 200,
743
+ "bytes" => 72,
744
+ },
745
+ }
746
+ }
747
+
748
+ driver = create_driver(<<-CONFIG)
749
+ table foo
750
+ email foo@bar.example
751
+ private_key_path /path/to/key
752
+ project yourproject_id
753
+ dataset yourdataset_id
754
+
755
+ time_format %s
756
+ time_field time
757
+
758
+ auto_create_table true
759
+ schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
760
+ CONFIG
761
+ mock_client(driver) do |expect|
762
+ expect.discovered_api("bigquery", "v2") {
763
+ mock! {
764
+ tables.mock!.insert { Object.new }
765
+ tabledata.mock!.insert_all { Object.new }
766
+ }
767
+ }
768
+ expect.execute(
769
+ :api_method => anything,
770
+ :parameters => {
771
+ 'projectId' => 'yourproject_id',
772
+ 'datasetId' => 'yourdataset_id',
773
+ 'tableId' => 'foo'
774
+ },
775
+ :body_object => {
776
+ "rows" => [ message ]
777
+ }
778
+ ) {
779
+ s = stub!
780
+ s.success? { false }
781
+ s.body { JSON.generate({
782
+ 'error' => { "code" => 404, "message" => "Not Found: Table yourproject_id:yourdataset_id.foo" }
783
+ }) }
784
+ s.status { 404 }
785
+ s
786
+ }
787
+ expect.execute(
788
+ :api_method => anything,
789
+ :parameters => {
790
+ 'projectId' => 'yourproject_id',
791
+ 'datasetId' => 'yourdataset_id',
792
+ },
793
+ :body_object => {
794
+ 'tableReference' => {
795
+ 'tableId' => 'foo',
796
+ },
797
+ 'schema' => {
798
+ 'fields' => JSON.parse(File.read(File.join(File.dirname(__FILE__), "testdata", "apache.schema")))
799
+ }
800
+ }
801
+ ) {
802
+ s = stub!
803
+ s.success? { true }
804
+ s
805
+ }
806
+ end
807
+ chunk = Fluent::MemoryBufferChunk.new("my.tag")
808
+ chunk << message.to_msgpack
809
+
810
+ driver.instance.start
811
+ assert_raise(RuntimeError) {
812
+ driver.instance.write(chunk)
813
+ }
814
+ driver.instance.shutdown
815
+ end
816
+
720
817
  private
721
818
 
722
819
  def sudo_schema_response
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naoya Ito
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-22 00:00:00.000000000 Z
11
+ date: 2015-01-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake