fluent-plugin-bigquery-custom 0.3.2 → 0.3.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: cf7ef29505d6d6d1a7c0b1bb4418d33be7e81b1f
- data.tar.gz: 53193d13414cc8a1ef05f3825469a2c5cba0fb36
+ metadata.gz: 4af83e7241135e5fa4386ddd2342bc990448fd58
+ data.tar.gz: 94d7f710bcf578befd9fc1a0ba4cf35eaf79e72b
  SHA512:
- metadata.gz: bc5e2572202bcb9f99531cb1a6f03645310e91d5edc6133a5d509e5263e9f2efb3a203acfbd01cdcda14e1482f14251db89a053045cea2c6714b601c0f9b7c0a
- data.tar.gz: b30ddefbaa82d17732f483733d924962ffadb85677b9734e3794a0381de0865ea9804e569a9718ac4bc0c5111f9d15f5ab091130fe9a48e6e137c64cd9eef712
+ metadata.gz: aad322ae03a689b9bd9459f7eae50f8990801e95bd0f8816a9ce80948b6f962da2724ef35a8728d71d36c266b8a878507953e865f80928503b2ff925c4109f9d
+ data.tar.gz: 407e7510c1739175cfc40dc476df07bc01526221cc9a77712e9db0d21545ce175143d91e3d21dfe609790e0fa47598f7d9c303f818df9ac0a03837b6c0b6027e
data/README.md CHANGED
@@ -27,7 +27,15 @@ OAuth flow for installed applications.
  - `max_bad_records`
  - `ignore_unknown_values`
  - `prevent_duplicate_load`
+ - `template_suffix`
+ - `schema_cache_expire`
  - Improve error handling
+ - Add templateSuffix feature
+   - `template_suffix` accepts the same placeholders as `table`
+   - With the load method, the templateSuffix process is emulated, but it behaves slightly differently from Streaming Insert:
+     1. Fetch the schema from the base table once per `schema_cache_expire` period
+     2. If the target table exists, insert the load job without schema data
+     3. If the target table does not exist, insert the load job with the fetched schema data
 
  ## Configuration
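
The `template_suffix` option shares the placeholder expansion rules of `table`, so a strftime-style pattern resolves against the chunk's time. A minimal Ruby sketch of the expansion (plain `strftime` with illustrative names; the plugin's `generate_table_id` additionally supports chunk-based placeholders):

```ruby
require "time"

# Illustrative: `table` and `template_suffix` use the same placeholder rules,
# so a time-based suffix expands alongside the base table name.
table           = "access_log"
template_suffix = "_%Y%m%d"
now             = Time.utc(2016, 2, 9, 12, 0, 0)

suffix      = now.strftime(template_suffix)  # => "_20160209"
destination = "#{table}#{suffix}"            # => "access_log_20160209"
puts destination
```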
 
@@ -1,6 +1,6 @@
  module Fluent
    module BigQueryPlugin
-     VERSION = "0.3.2"
+     VERSION = "0.3.6"
    end
  end
 
lib/fluent/plugin/out_bigquery.rb CHANGED
@@ -63,6 +63,7 @@ module Fluent
  # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
  config_param :table, :string, default: nil
  config_param :tables, :string, default: nil
+ config_param :template_suffix, :string, default: nil
 
  config_param :auto_create_table, :bool, default: false
 
@@ -82,6 +83,7 @@ module Fluent
 
  config_param :schema_path, :string, default: nil
  config_param :fetch_schema, :bool, default: false
+ config_param :schema_cache_expire, :time, default: 600
  config_param :field_string, :string, default: nil
  config_param :field_integer, :string, default: nil
  config_param :field_float, :string, default: nil
@@ -171,7 +173,7 @@ module Fluent
  require 'digest/sha1'
  extend(LoadImplementation)
  else
- raise Fluend::ConfigError "'method' must be 'insert' or 'load'"
+ raise Fluend::ConfigError, "'method' must be 'insert' or 'load'"
  end
 
  case @auth_method
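
The `extend(InsertImplementation)` / `extend(LoadImplementation)` dispatch mixes the chosen delivery strategy into the single plugin instance at configure time. A standalone sketch of that pattern (illustrative names, not plugin code):

```ruby
# Per-instance strategy selection via Object#extend, as in configure above.
module InsertStrategy
  def deliver; "streaming insert"; end
end

module LoadStrategy
  def deliver; "load job"; end
end

class Writer
  def initialize(method)
    case method
    when "insert" then extend(InsertStrategy)
    when "load"   then extend(LoadStrategy)
    else raise ArgumentError, "'method' must be 'insert' or 'load'"
    end
  end
end

puts Writer.new("load").deliver   # => "load job"
puts Writer.new("insert").deliver # => "streaming insert"
```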
@@ -253,8 +255,10 @@ module Fluent
 
  @tables_queue = @tablelist.dup.shuffle
  @tables_mutex = Mutex.new
+ @fetch_schema_mutex = Mutex.new
 
- fetch_schema() if @fetch_schema
+ @last_fetch_schema_time = 0
+ fetch_schema(false) if @fetch_schema
  end
 
  def client
@@ -356,32 +360,52 @@ module Fluent
  t
  end
  table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now), chunk)
- _write(chunk, table_id)
+ template_suffix = @template_suffix ? generate_table_id(@template_suffix, Time.at(Fluent::Engine.now), chunk) : nil
+ _write(chunk, table_id, template_suffix)
  end
 
- def fetch_schema
-   table_id_format = @tablelist[0]
-   table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
-   res = client.get_table(@project, @dataset, table_id)
-
-   schema = res.schema.fields.as_json
-   log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
-   @fields.load_schema(schema, false)
+ def fetch_schema(allow_overwrite = true)
+   table_id = nil
+   @fetch_schema_mutex.synchronize do
+     if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
+       table_id_format = @tablelist[0]
+       table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
+       res = client.get_table(@project, @dataset, table_id)
+
+       schema = res.schema.fields.as_json
+       log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
+       if allow_overwrite
+         fields = RecordSchema.new("record")
+         fields.load_schema(schema, allow_overwrite)
+         @fields = fields
+       else
+         @fields.load_schema(schema, allow_overwrite)
+       end
+       @last_fetch_schema_time = Fluent::Engine.now
+     end
+   end
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
    # api_error? -> client cache clear
    @cached_client = nil
    message = e.message
    log.error "tables.get API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
-   raise "failed to fetch schema from bigquery" # TODO: error class
+   if @fields.empty?
+     raise "failed to fetch schema from bigquery" # TODO: error class
+   else
+     log.warn "Use previous schema"
+     @last_fetch_schema_time = Fluent::Engine.now
+   end
  end
 
  module InsertImplementation
    def format(tag, time, record)
-     buf = ''
+     fetch_schema if @template_suffix
 
      if @replace_record_key
        record = replace_record_key(record)
      end
+
+     buf = String.new
      row = @fields.format(@add_time_field.call(record, time))
      unless row.empty?
        row = {"json" => row}
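
The rewritten `fetch_schema` combines three things: a mutex so concurrent flush threads do not fetch twice, a timestamp guard so the schema is refetched at most once per `schema_cache_expire` seconds, and a fallback that keeps the previous schema when the API call fails. A self-contained sketch of that caching pattern (illustrative; the block stands in for the `tables.get` call):

```ruby
require "thread"

# Sketch of the expire-guarded, mutex-protected cache used by fetch_schema.
class SchemaCache
  def initialize(expire_seconds, &fetch)
    @expire = expire_seconds
    @fetch  = fetch              # stands in for client.get_table
    @mutex  = Mutex.new
    @last_fetch_time = 0
    @schema = nil
  end

  def schema
    @mutex.synchronize do
      if Time.now.to_i - @last_fetch_time > @expire
        begin
          @schema = @fetch.call
          @last_fetch_time = Time.now.to_i
        rescue => e
          raise "failed to fetch schema" if @schema.nil?  # nothing cached yet
          warn "Use previous schema (#{e.message})"       # keep stale schema
          @last_fetch_time = Time.now.to_i                # back off until next expiry
        end
      end
    end
    @schema
  end
end

cache = SchemaCache.new(600) { [{ "name" => "time", "type" => "TIMESTAMP" }] }
p cache.schema
```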
@@ -391,18 +415,20 @@ module Fluent
  buf
  end
 
- def _write(chunk, table_id)
+ def _write(chunk, table_id, template_suffix)
    rows = []
    chunk.msgpack_each do |row_object|
      # TODO: row size limit
      rows << row_object.deep_symbolize_keys
    end
 
-   res = client.insert_all_table_data(@project, @dataset, table_id, {
+   body = {
      rows: rows,
      skip_invalid_rows: @skip_invalid_rows,
      ignore_unknown_values: @ignore_unknown_values,
-   }, {})
+   }
+   body.merge!(template_suffix: template_suffix) if template_suffix
+   res = client.insert_all_table_data(@project, @dataset, table_id, body, {})
 
    if res.insert_errors
      reasons = []
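
For streaming inserts, templateSuffix support amounts to one optional key in the request body. A hedged sketch of the body assembled above (hash only, no API call; google-api-client maps the snake_case key to BigQuery's `templateSuffix` field):

```ruby
# Illustrative request body for tabledata.insertAll with a template suffix.
rows = [{ json: { user: "tagomoris", status: 200 } }]

body = {
  rows: rows,
  skip_invalid_rows: false,
  ignore_unknown_values: true,
}
template_suffix = "_20160209"
body.merge!(template_suffix: template_suffix) if template_suffix

p body
# BigQuery streams these rows into "#{table_id}#{template_suffix}", creating
# the suffixed table from the base table's schema when it does not exist yet.
```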
@@ -428,7 +454,7 @@ module Fluent
  end
 
  reason = e.respond_to?(:reason) ? e.reason : nil
- log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
+ log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
 
  raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class
 
@@ -441,11 +467,13 @@ module Fluent
 
  module LoadImplementation
    def format(tag, time, record)
-     buf = ''
+     fetch_schema if @template_suffix
 
      if @replace_record_key
        record = replace_record_key(record)
      end
+
+     buf = String.new
      row = @fields.format(@add_time_field.call(record, time))
      unless row.empty?
        buf << MultiJson.dump(row) + "\n"
@@ -453,7 +481,7 @@ module Fluent
  buf
  end
 
- def _write(chunk, table_id)
+ def _write(chunk, table_id, template_suffix)
    res = nil
    job_id = nil
 
@@ -461,25 +489,7 @@ module Fluent
  if @prevent_duplicate_load
    job_id = create_job_id(upload_source.path, @dataset, @table, @fields.to_a, @max_bad_records, @ignore_unknown_values)
  end
- configuration = {
-   configuration: {
-     load: {
-       destination_table: {
-         project_id: @project,
-         dataset_id: @dataset,
-         table_id: table_id,
-       },
-       schema: {
-         fields: @fields.to_a,
-       },
-       write_disposition: "WRITE_APPEND",
-       source_format: "NEWLINE_DELIMITED_JSON",
-       ignore_unknown_values: @ignore_unknown_values,
-       max_bad_records: @max_bad_records,
-     }
-   }
- }
- configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+ configuration = load_configuration(table_id, template_suffix, upload_source)
  res = client.insert_job(@project, configuration, {upload_source: upload_source, content_type: "application/octet-stream"})
  end
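
`create_job_id` itself is not shown in this diff. With `prevent_duplicate_load`, the idea is a deterministic job id derived from the chunk and load parameters, so retrying the same chunk reuses the same BigQuery job instead of loading the data twice. A purely hypothetical sketch (the real method may differ):

```ruby
require "digest/sha1"

# Hypothetical: a deterministic job id means an identical retry collides with
# the already-submitted job rather than loading the same chunk again.
def create_job_id_sketch(upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values)
  key = [upload_source_path, dataset, table, schema, max_bad_records, ignore_unknown_values].join(":")
  "fluentd_job_#{Digest::SHA1.hexdigest(key)}"
end

p create_job_id_sketch("/tmp/buf.chunk", "yourdataset_id", "access_log_20160209", "[]", 0, true)
```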
 
@@ -502,6 +512,45 @@ module Fluent
 
  private
 
+ def load_configuration(table_id, template_suffix, upload_source)
+   job_id = nil
+   if @prevent_duplicate_load
+     job_id = create_job_id(upload_source.path, @dataset, "#{table_id}#{template_suffix}", @fields.to_a, @max_bad_records, @ignore_unknown_values)
+   end
+
+   configuration = {
+     configuration: {
+       load: {
+         destination_table: {
+           project_id: @project,
+           dataset_id: @dataset,
+           table_id: "#{table_id}#{template_suffix}",
+         },
+         schema: {
+           fields: @fields.to_a,
+         },
+         write_disposition: "WRITE_APPEND",
+         source_format: "NEWLINE_DELIMITED_JSON",
+         ignore_unknown_values: @ignore_unknown_values,
+         max_bad_records: @max_bad_records,
+       }
+     }
+   }
+   configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+
+   # If the target table already exists, omit the schema configuration,
+   # because that makes schema changes easier.
+   begin
+     if template_suffix && client.get_table(@project, @dataset, "#{table_id}#{template_suffix}")
+       configuration[:configuration][:load].delete(:schema)
+     end
+   rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError
+     raise "Schema is empty" if @fields.empty?
+   end
+
+   configuration
+ end
+
  def wait_load(job_id)
    wait_interval = 10
    _response = client.get_job(@project, job_id)
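
The begin/rescue at the end of `load_configuration` implements the README's emulation rules 2 and 3: if the suffixed table already exists, the schema is dropped from the job so BigQuery applies the table's live schema, and if `tables.get` fails (typically 404 for a missing table), the fetched schema stays in. A condensed sketch of that decision (illustrative, no API calls):

```ruby
# Illustrative condensation of the schema-omission rule in load_configuration.
def maybe_strip_schema(configuration, template_suffix, table_exists)
  if template_suffix && table_exists
    # Existing table: omit schema so online schema changes win.
    configuration[:configuration][:load].delete(:schema)
  end
  configuration
end

conf = { configuration: { load: { schema: { fields: [] }, write_disposition: "WRITE_APPEND" } } }
p maybe_strip_schema(conf, "_20160209", true)
# => {:configuration=>{:load=>{:write_disposition=>"WRITE_APPEND"}}}
```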
@@ -581,8 +630,12 @@ module Fluent
  when :nullable
    format_one(value) unless value.nil?
  when :required
-   raise "Required field #{name} cannot be null" if value.nil?
-   format_one(value)
+   if value.nil?
+     log.warn "Required field #{name} cannot be null"
+     nil
+   else
+     format_one(value)
+   end
  when :repeated
    value.nil? ? [] : value.map {|v| format_one(v) }
  end
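
This is a behavioral change worth noting: a nil value in a REQUIRED field no longer raises and aborts formatting of the whole buffer; the field is dropped with a warning, which is also why `test_empty_value_in_required` disappears from the test suite further down. A standalone sketch of the new branch, with plain `warn` standing in for the plugin's `log.warn`:

```ruby
# Sketch of the new :required handling (illustrative method, not plugin code).
def format_required(name, value)
  if value.nil?
    warn "Required field #{name} cannot be null"  # record survives; field is omitted
    nil                                           # nil results are skipped by the caller
  else
    value.to_s
  end
end

p format_required("user", nil)         # warns, returns nil
p format_required("user", "tagomoris") # => "tagomoris"
```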
@@ -642,12 +695,32 @@ module Fluent
  end
 
  class TimestampFieldSchema < FieldSchema
+   INTEGER_REGEXP = /\A-?[[:digit:]]+\z/.freeze
+   FLOAT_REGEXP = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/.freeze
+
    def type
      :timestamp
    end
 
    def format_one(value)
-     value
+     case value
+     when Time
+       value.strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
+     when String
+       if value =~ INTEGER_REGEXP
+         value.to_i
+       elsif value =~ FLOAT_REGEXP
+         value.to_f
+       else
+         begin
+           Time.parse(value).strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
+         rescue
+           value
+         end
+       end
+     else
+       value
+     end
    end
  end
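
`TimestampFieldSchema#format_one` now normalizes heterogeneous inputs: `Time` objects become zoned strings, numeric strings pass through as epoch numbers, and anything else is parsed best-effort. A standalone sketch mirroring that logic (same regexps and format string as above):

```ruby
require "time"

INTEGER_REGEXP = /\A-?[[:digit:]]+\z/
FLOAT_REGEXP   = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/

def format_timestamp(value)
  case value
  when Time
    value.strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
  when String
    if value =~ INTEGER_REGEXP
      value.to_i                       # epoch seconds pass through as Integer
    elsif value =~ FLOAT_REGEXP
      value.to_f                       # fractional epoch passes through as Float
    else
      begin
        Time.parse(value).strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
      rescue ArgumentError
        value                          # unparseable strings are left untouched
      end
    end
  else
    value
  end
end

p format_timestamp("1455012345")            # => 1455012345
p format_timestamp("2016-02-09T12:00:00Z")  # => e.g. "2016-02-09 12:00:00.000000 +00:00"
```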
 
@@ -674,6 +747,10 @@ module Fluent
  @fields[name]
  end
 
+ def empty?
+   @fields.empty?
+ end
+
  def to_a
    @fields.map do |_, field_schema|
      field_schema.to_h
@@ -729,11 +806,10 @@ module Fluent
 
  def format_one(record)
    out = {}
-   @fields.each do |key, schema|
-     value = record[key]
-     formatted = schema.format(value)
-     next if formatted.nil? # field does not exists, or null value
-     out[key] = formatted
+   record.each do |key, value|
+     next if value.nil?
+     schema = @fields[key]
+     out[key] = schema ? schema.format(value) : value
    end
    out
  end
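
`RecordSchema#format_one` now iterates over the record instead of the schema, which changes two behaviors: nil values are skipped, and fields without a schema entry pass through unformatted instead of being dropped (exercised by the new `test_format_one_with_extra_column` below). In miniature, with illustrative lambdas standing in for `FieldSchema` objects:

```ruby
# Miniature of the new format_one iteration: record-driven, with pass-through
# for keys the schema does not know about.
schema = { "status" => ->(v) { v.to_i } }   # stands in for the @fields lookup
record = { "status" => "200", "extra" => "kept as-is", "missing" => nil }

out = {}
record.each do |key, value|
  next if value.nil?                        # null values are skipped
  formatter = schema[key]
  out[key] = formatter ? formatter.call(value) : value
end
p out  # => {"status"=>200, "extra"=>"kept as-is"}
```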
test/plugin/test_out_bigquery.rb CHANGED
@@ -275,9 +275,15 @@ class BigQueryOutputTest < Test::Unit::TestCase
  "requesttime" => (now - 1).to_f.to_s.to_f,
  "bot_access" => true,
  "loginsession" => false,
+ "something-else" => "would be ignored",
+ "yet-another" => {
+   "foo" => "bar",
+   "baz" => 1,
+ },
  "remote" => {
    "host" => "remote.example",
    "ip" => "192.0.2.1",
+   "port" => 12345,
    "user" => "tagomoris",
  }
  }
@@ -429,12 +435,18 @@ class BigQueryOutputTest < Test::Unit::TestCase
  "remote" => {
    "host" => "remote.example",
    "ip" => "192.0.2.1",
+   "port" => 12345,
    "user" => "tagomoris",
  },
  "response" => {
    "status" => 1,
    "bytes" => 3,
  },
+ "something-else" => "would be ignored",
+ "yet-another" => {
+   "foo" => "bar",
+   "baz" => 1,
+ },
  }
  }
 
@@ -739,38 +751,6 @@ class BigQueryOutputTest < Test::Unit::TestCase
  assert_equal expected, buf
  end
 
- def test_empty_value_in_required
-   now = Time.now
-   input = [
-     now,
-     {
-       "tty" => "pts/1",
-       "pwd" => "/home/yugui",
-       "user" => nil,
-       "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
-     }
-   ]
-
-   driver = create_driver(<<-CONFIG)
-     table foo
-     email foo@bar.example
-     private_key_path /path/to/key
-     project yourproject_id
-     dataset yourdataset_id
-
-     time_format %s
-     time_field time
-
-     schema_path #{File.join(File.dirname(__FILE__), "testdata", "sudo.schema")}
-     field_integer time
-   CONFIG
-   driver.instance.start
-   assert_raises(RuntimeError.new("Required field user cannot be null")) do
-     driver.instance.format_stream("my.tag", [input])
-   end
-   driver.instance.shutdown
- end
-
  def test_replace_record_key
    now = Time.now
    input = [
test/plugin/test_record_schema.rb ADDED
@@ -0,0 +1,173 @@
+ require 'helper'
+ require 'active_support/json'
+ require 'active_support/core_ext/hash'
+ require 'active_support/core_ext/object/json'
+
+ class RecordSchemaTest < Test::Unit::TestCase
+   def base_schema
+     [
+       {
+         "name" => "time",
+         "type" => "TIMESTAMP",
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "tty",
+         "type" => "STRING",
+         "mode" => "NULLABLE"
+       },
+       {
+         "name" => "pwd",
+         "type" => "STRING",
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "user",
+         "type" => "STRING",
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "argv",
+         "type" => "STRING",
+         "mode" => "REPEATED"
+       }
+     ]
+   end
+
+   def base_schema_with_new_column
+     [
+       {
+         "name" => "time",
+         "type" => "TIMESTAMP",
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "tty",
+         "type" => "STRING",
+         "mode" => "NULLABLE"
+       },
+       {
+         "name" => "pwd",
+         "type" => "STRING",
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "user",
+         "type" => "STRING",
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "argv",
+         "type" => "STRING",
+         "mode" => "REPEATED"
+       },
+       {
+         "name" => "new_column",
+         "type" => "STRING",
+         "mode" => "REQUIRED"
+       }
+     ]
+   end
+
+   def base_schema_with_type_changed_column
+     [
+       {
+         "name" => "time",
+         "type" => "INTEGER", # change type
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "tty",
+         "type" => "STRING",
+         "mode" => "NULLABLE"
+       },
+       {
+         "name" => "pwd",
+         "type" => "STRING",
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "user",
+         "type" => "STRING",
+         "mode" => "REQUIRED"
+       },
+       {
+         "name" => "argv",
+         "type" => "STRING",
+         "mode" => "REPEATED"
+       },
+     ]
+   end
+
+   def test_load_schema
+     fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+     fields.load_schema(base_schema, true)
+     assert { fields.to_a.as_json == base_schema }
+   end
+
+   def test_load_schema_allow_overwrite_with_type_changed_column
+     fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+     fields.load_schema(base_schema, true)
+
+     fields.load_schema(base_schema_with_type_changed_column, true)
+     assert { fields.to_a.as_json == base_schema_with_type_changed_column }
+   end
+
+   def test_load_schema_allow_overwrite_with_new_column
+     fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+     fields.load_schema(base_schema, true)
+
+     fields.load_schema(base_schema_with_new_column, true)
+     assert { fields.to_a.as_json == base_schema_with_new_column }
+   end
+
+   def test_load_schema_not_allow_overwrite_with_type_changed_column
+     fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+     fields.load_schema(base_schema, false)
+
+     fields.load_schema(base_schema_with_type_changed_column, false)
+     assert { fields.to_a.as_json == base_schema }
+   end
+
+   def test_load_schema_no_allow_overwrite_with_new_column
+     fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+     fields.load_schema(base_schema, false)
+
+     fields.load_schema(base_schema_with_new_column, false)
+     assert { fields.to_a.as_json == base_schema_with_new_column }
+   end
+
+   def test_format_one
+     fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+     fields.load_schema(base_schema, false)
+
+     time = Time.local(2016, 2, 7, 19, 0, 0).utc
+
+     formatted = fields.format_one({
+       "time" => time, "tty" => nil, "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", 42]
+     })
+     assert_equal(
+       formatted,
+       {
+         "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", "42"]
+       }
+     )
+   end
+
+   def test_format_one_with_extra_column
+     fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+     fields.load_schema(base_schema, false)
+
+     time = Time.local(2016, 2, 7, 19, 0, 0).utc
+
+     formatted = fields.format_one({
+       "time" => time, "tty" => nil, "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", 42.195], "extra" => "extra_data"
+     })
+     assert_equal(
+       formatted,
+       {
+         "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", "42.195"], "extra" => "extra_data"
+       }
+     )
+   end
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: fluent-plugin-bigquery-custom
  version: !ruby/object:Gem::Version
- version: 0.3.2
+ version: 0.3.6
  platform: ruby
  authors:
  - Tomohiro Hashidate
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-01-22 00:00:00.000000000 Z
+ date: 2016-02-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rake
@@ -183,6 +183,7 @@ files:
  - lib/fluent/plugin/out_bigquery.rb
  - test/helper.rb
  - test/plugin/test_out_bigquery.rb
+ - test/plugin/test_record_schema.rb
  - test/plugin/testdata/apache.schema
  - test/plugin/testdata/json_key.json
  - test/plugin/testdata/sudo.schema
@@ -213,6 +214,7 @@ summary: Fluentd plugin to store data on Google BigQuery
  test_files:
  - test/helper.rb
  - test/plugin/test_out_bigquery.rb
+ - test/plugin/test_record_schema.rb
  - test/plugin/testdata/apache.schema
  - test/plugin/testdata/json_key.json
  - test/plugin/testdata/sudo.schema