fluent-plugin-bigquery-custom 0.3.2 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: cf7ef29505d6d6d1a7c0b1bb4418d33be7e81b1f
- data.tar.gz: 53193d13414cc8a1ef05f3825469a2c5cba0fb36
+ metadata.gz: 4af83e7241135e5fa4386ddd2342bc990448fd58
+ data.tar.gz: 94d7f710bcf578befd9fc1a0ba4cf35eaf79e72b
  SHA512:
- metadata.gz: bc5e2572202bcb9f99531cb1a6f03645310e91d5edc6133a5d509e5263e9f2efb3a203acfbd01cdcda14e1482f14251db89a053045cea2c6714b601c0f9b7c0a
- data.tar.gz: b30ddefbaa82d17732f483733d924962ffadb85677b9734e3794a0381de0865ea9804e569a9718ac4bc0c5111f9d15f5ab091130fe9a48e6e137c64cd9eef712
+ metadata.gz: aad322ae03a689b9bd9459f7eae50f8990801e95bd0f8816a9ce80948b6f962da2724ef35a8728d71d36c266b8a878507953e865f80928503b2ff925c4109f9d
+ data.tar.gz: 407e7510c1739175cfc40dc476df07bc01526221cc9a77712e9db0d21545ce175143d91e3d21dfe609790e0fa47598f7d9c303f818df9ac0a03837b6c0b6027e
data/README.md CHANGED
@@ -27,7 +27,15 @@ OAuth flow for installed applications.
  - `max_bad_records`
  - `ignore_unknown_values`
  - `prevent_duplicate_load`
+ - `template_suffix`
+ - `schema_cache_expire`
  - Improve error handling
+ - Add templateSuffix feature
+ - `template_suffix` accepts the same placeholders as `table`
+ - When the load method is used, the templateSuffix behaviour is emulated, with slight differences from Streaming Insert:
+ 1. Fetch the schema from the base table once per `schema_cache_expire` interval.
+ 1. If the target table exists, submit the load job without schema data.
+ 1. If the target table does not exist, submit the load job with the fetched schema data.

  ## Configuration

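The two new options listed above share the `table` placeholder rules. A minimal, illustrative sketch (not taken from the package; the table and suffix templates here are made up) of how the same strftime-style placeholders expand for both the base table and `template_suffix`, which the streaming path sends to `tabledata.insertAll` and the load path emulates by writing into the concatenated table name:

```ruby
require "time"

# Illustrative only: "accesslog_%Y%m" and "_%Y%m%d" are hypothetical templates,
# but `table` and `template_suffix` are expanded with the same placeholders.
now = Time.utc(2016, 2, 9)

table_id        = now.strftime("accesslog_%Y%m")   # => "accesslog_201602"
template_suffix = now.strftime("_%Y%m%d")           # => "_20160209"

# Streaming inserts pass the suffix to tabledata.insertAll (templateSuffix);
# the load method emulates it by loading into the concatenated table name.
puts "#{table_id}#{template_suffix}"                 # => "accesslog_201602_20160209"
```

In a Fluentd `<match>` section the options would then be set as plain key/value pairs, e.g. `template_suffix _%Y%m%d` together with `schema_cache_expire 600` (600 seconds being the default added below).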
@@ -1,6 +1,6 @@
  module Fluent
  module BigQueryPlugin
- VERSION = "0.3.2"
+ VERSION = "0.3.6"
  end
  end

@@ -63,6 +63,7 @@ module Fluent
  # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
  config_param :table, :string, default: nil
  config_param :tables, :string, default: nil
+ config_param :template_suffix, :string, default: nil

  config_param :auto_create_table, :bool, default: false

@@ -82,6 +83,7 @@ module Fluent

  config_param :schema_path, :string, default: nil
  config_param :fetch_schema, :bool, default: false
+ config_param :schema_cache_expire, :time, default: 600
  config_param :field_string, :string, default: nil
  config_param :field_integer, :string, default: nil
  config_param :field_float, :string, default: nil
@@ -171,7 +173,7 @@ module Fluent
  require 'digest/sha1'
  extend(LoadImplementation)
  else
- raise Fluend::ConfigError "'method' must be 'insert' or 'load'"
+ raise Fluend::ConfigError, "'method' must be 'insert' or 'load'"
  end

  case @auth_method
@@ -253,8 +255,10 @@ module Fluent

  @tables_queue = @tablelist.dup.shuffle
  @tables_mutex = Mutex.new
+ @fetch_schema_mutex = Mutex.new

- fetch_schema() if @fetch_schema
+ @last_fetch_schema_time = 0
+ fetch_schema(false) if @fetch_schema
  end

  def client
@@ -356,32 +360,52 @@ module Fluent
  t
  end
  table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now), chunk)
- _write(chunk, table_id)
+ template_suffix = @template_suffix ? generate_table_id(@template_suffix, Time.at(Fluent::Engine.now), chunk) : nil
+ _write(chunk, table_id, template_suffix)
  end

- def fetch_schema
- table_id_format = @tablelist[0]
- table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
- res = client.get_table(@project, @dataset, table_id)
-
- schema = res.schema.fields.as_json
- log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
- @fields.load_schema(schema, false)
+ def fetch_schema(allow_overwrite = true)
+ table_id = nil
+ @fetch_schema_mutex.synchronize do
+ if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
+ table_id_format = @tablelist[0]
+ table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
+ res = client.get_table(@project, @dataset, table_id)
+
+ schema = res.schema.fields.as_json
+ log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
+ if allow_overwrite
+ fields = RecordSchema.new("record")
+ fields.load_schema(schema, allow_overwrite)
+ @fields = fields
+ else
+ @fields.load_schema(schema, allow_overwrite)
+ end
+ @last_fetch_schema_time = Fluent::Engine.now
+ end
+ end
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
  # api_error? -> client cache clear
  @cached_client = nil
  message = e.message
  log.error "tables.get API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
- raise "failed to fetch schema from bigquery" # TODO: error class
+ if @fields.empty?
+ raise "failed to fetch schema from bigquery" # TODO: error class
+ else
+ log.warn "Use previous schema"
+ @last_fetch_schema_time = Fluent::Engine.now
+ end
  end

  module InsertImplementation
  def format(tag, time, record)
- buf = ''
+ fetch_schema if @template_suffix

  if @replace_record_key
  record = replace_record_key(record)
  end
+
+ buf = String.new
  row = @fields.format(@add_time_field.call(record, time))
  unless row.empty?
  row = {"json" => row}
@@ -391,18 +415,20 @@ module Fluent
  buf
  end

- def _write(chunk, table_id)
+ def _write(chunk, table_id, template_suffix)
  rows = []
  chunk.msgpack_each do |row_object|
  # TODO: row size limit
  rows << row_object.deep_symbolize_keys
  end

- res = client.insert_all_table_data(@project, @dataset, table_id, {
+ body = {
  rows: rows,
  skip_invalid_rows: @skip_invalid_rows,
  ignore_unknown_values: @ignore_unknown_values,
- }, {})
+ }
+ body.merge!(template_suffix: template_suffix) if template_suffix
+ res = client.insert_all_table_data(@project, @dataset, table_id, body, {})

  if res.insert_errors
  reasons = []
@@ -428,7 +454,7 @@ module Fluent
  end

  reason = e.respond_to?(:reason) ? e.reason : nil
- log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
+ log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: reason

  raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class

@@ -441,11 +467,13 @@

  module LoadImplementation
  def format(tag, time, record)
- buf = ''
+ fetch_schema if @template_suffix

  if @replace_record_key
  record = replace_record_key(record)
  end
+
+ buf = String.new
  row = @fields.format(@add_time_field.call(record, time))
  unless row.empty?
  buf << MultiJson.dump(row) + "\n"
@@ -453,7 +481,7 @@
  buf
  end

- def _write(chunk, table_id)
+ def _write(chunk, table_id, template_suffix)
  res = nil
  job_id = nil

@@ -461,25 +489,7 @@
  if @prevent_duplicate_load
  job_id = create_job_id(upload_source.path, @dataset, @table, @fields.to_a, @max_bad_records, @ignore_unknown_values)
  end
- configuration = {
- configuration: {
- load: {
- destination_table: {
- project_id: @project,
- dataset_id: @dataset,
- table_id: table_id,
- },
- schema: {
- fields: @fields.to_a,
- },
- write_disposition: "WRITE_APPEND",
- source_format: "NEWLINE_DELIMITED_JSON",
- ignore_unknown_values: @ignore_unknown_values,
- max_bad_records: @max_bad_records,
- }
- }
- }
- configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+ configuration = load_configuration(table_id, template_suffix, upload_source)
  res = client.insert_job(@project, configuration, {upload_source: upload_source, content_type: "application/octet-stream"})
  end

@@ -502,6 +512,45 @@

  private

+ def load_configuration(table_id, template_suffix, upload_source)
+ job_id = nil
+ if @prevent_duplicate_load
+ job_id = create_job_id(upload_source.path, @dataset, "#{table_id}#{template_suffix}", @fields.to_a, @max_bad_records, @ignore_unknown_values)
+ end
+
+ configuration = {
+ configuration: {
+ load: {
+ destination_table: {
+ project_id: @project,
+ dataset_id: @dataset,
+ table_id: "#{table_id}#{template_suffix}",
+ },
+ schema: {
+ fields: @fields.to_a,
+ },
+ write_disposition: "WRITE_APPEND",
+ source_format: "NEWLINE_DELIMITED_JSON",
+ ignore_unknown_values: @ignore_unknown_values,
+ max_bad_records: @max_bad_records,
+ }
+ }
+ }
+ configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+
+ # If target table is already exist, omit schema configuration.
+ # Because schema changing is easier.
+ begin
+ if template_suffix && client.get_table(@project, @dataset, "#{table_id}#{template_suffix}")
+ configuration[:configuration][:load].delete(:schema)
+ end
+ rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError
+ raise "Schema is empty" if @fields.empty?
+ end
+
+ configuration
+ end
+
  def wait_load(job_id)
  wait_interval = 10
  _response = client.get_job(@project, job_id)
@@ -581,8 +630,12 @@
  when :nullable
  format_one(value) unless value.nil?
  when :required
- raise "Required field #{name} cannot be null" if value.nil?
- format_one(value)
+ if value.nil?
+ log.warn "Required field #{name} cannot be null"
+ nil
+ else
+ format_one(value)
+ end
  when :repeated
  value.nil? ? [] : value.map {|v| format_one(v) }
  end
@@ -642,12 +695,32 @@
  end

  class TimestampFieldSchema < FieldSchema
+ INTEGER_REGEXP = /\A-?[[:digit:]]+\z/.freeze
+ FLOAT_REGEXP = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/.freeze
+
  def type
  :timestamp
  end

  def format_one(value)
- value
+ case value
+ when Time
+ value.strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
+ when String
+ if value =~ INTEGER_REGEXP
+ value.to_i
+ elsif value =~ FLOAT_REGEXP
+ value.to_f
+ else
+ begin
+ Time.parse(value).strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
+ rescue
+ value
+ end
+ end
+ else
+ value
+ end
  end
  end

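The `TimestampFieldSchema#format_one` rewrite above coerces each supported input shape instead of passing every value through. A standalone sketch of that behaviour, reusing the regexes and format string from the hunk above (the helper name and sample inputs are made up for illustration):

```ruby
require "time"

# Standalone sketch of the coercion in TimestampFieldSchema#format_one (illustrative).
INTEGER_REGEXP = /\A-?[[:digit:]]+\z/
FLOAT_REGEXP   = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%6L %:z"

def format_timestamp(value)
  case value
  when Time
    value.strftime(TIMESTAMP_FORMAT)                  # Time objects are rendered directly
  when String
    if value =~ INTEGER_REGEXP
      value.to_i                                      # "1455001200" -> 1455001200 (epoch seconds)
    elsif value =~ FLOAT_REGEXP
      value.to_f                                      # "1455001200.25" -> 1455001200.25
    else
      begin
        Time.parse(value).strftime(TIMESTAMP_FORMAT)  # parseable strings are normalized
      rescue
        value                                         # unparseable strings fall through unchanged
      end
    end
  else
    value                                             # integers, floats, nil, etc. are untouched
  end
end

puts format_timestamp(Time.utc(2016, 2, 9))   # => "2016-02-09 00:00:00.000000 +00:00"
puts format_timestamp("2016/02/09 12:00:00")  # normalized in the local zone
puts format_timestamp("1455001200")           # => 1455001200
```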
@@ -674,6 +747,10 @@
  @fields[name]
  end

+ def empty?
+ @fields.empty?
+ end
+
  def to_a
  @fields.map do |_, field_schema|
  field_schema.to_h
@@ -729,11 +806,10 @@

  def format_one(record)
  out = {}
- @fields.each do |key, schema|
- value = record[key]
- formatted = schema.format(value)
- next if formatted.nil? # field does not exists, or null value
- out[key] = formatted
+ record.each do |key, value|
+ next if value.nil?
+ schema = @fields[key]
+ out[key] = schema ? schema.format(value) : value
  end
  out
  end
@@ -275,9 +275,15 @@ class BigQueryOutputTest < Test::Unit::TestCase
  "requesttime" => (now - 1).to_f.to_s.to_f,
  "bot_access" => true,
  "loginsession" => false,
+ "something-else" => "would be ignored",
+ "yet-another" => {
+ "foo" => "bar",
+ "baz" => 1,
+ },
  "remote" => {
  "host" => "remote.example",
  "ip" => "192.0.2.1",
+ "port" => 12345,
  "user" => "tagomoris",
  }
  }
@@ -429,12 +435,18 @@ class BigQueryOutputTest < Test::Unit::TestCase
  "remote" => {
  "host" => "remote.example",
  "ip" => "192.0.2.1",
+ "port" => 12345,
  "user" => "tagomoris",
  },
  "response" => {
  "status" => 1,
  "bytes" => 3,
  },
+ "something-else" => "would be ignored",
+ "yet-another" => {
+ "foo" => "bar",
+ "baz" => 1,
+ },
  }
  }

@@ -739,38 +751,6 @@ class BigQueryOutputTest < Test::Unit::TestCase
  assert_equal expected, buf
  end

- def test_empty_value_in_required
- now = Time.now
- input = [
- now,
- {
- "tty" => "pts/1",
- "pwd" => "/home/yugui",
- "user" => nil,
- "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
- }
- ]
-
- driver = create_driver(<<-CONFIG)
- table foo
- email foo@bar.example
- private_key_path /path/to/key
- project yourproject_id
- dataset yourdataset_id
-
- time_format %s
- time_field time
-
- schema_path #{File.join(File.dirname(__FILE__), "testdata", "sudo.schema")}
- field_integer time
- CONFIG
- driver.instance.start
- assert_raises(RuntimeError.new("Required field user cannot be null")) do
- driver.instance.format_stream("my.tag", [input])
- end
- driver.instance.shutdown
- end
-
  def test_replace_record_key
  now = Time.now
  input = [
@@ -0,0 +1,173 @@
+ require 'helper'
+ require 'active_support/json'
+ require 'active_support/core_ext/hash'
+ require 'active_support/core_ext/object/json'
+
+ class RecordSchemaTest < Test::Unit::TestCase
+ def base_schema
+ [
+ {
+ "name" => "time",
+ "type" => "TIMESTAMP",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "tty",
+ "type" => "STRING",
+ "mode" => "NULLABLE"
+ },
+ {
+ "name" => "pwd",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "user",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "argv",
+ "type" => "STRING",
+ "mode" => "REPEATED"
+ }
+ ]
+ end
+
+ def base_schema_with_new_column
+ [
+ {
+ "name" => "time",
+ "type" => "TIMESTAMP",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "tty",
+ "type" => "STRING",
+ "mode" => "NULLABLE"
+ },
+ {
+ "name" => "pwd",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "user",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "argv",
+ "type" => "STRING",
+ "mode" => "REPEATED"
+ },
+ {
+ "name" => "new_column",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ }
+ ]
+ end
+
+ def base_schema_with_type_changed_column
+ [
+ {
+ "name" => "time",
+ "type" => "INTEGER", # change type
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "tty",
+ "type" => "STRING",
+ "mode" => "NULLABLE"
+ },
+ {
+ "name" => "pwd",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "user",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "argv",
+ "type" => "STRING",
+ "mode" => "REPEATED"
+ },
+ ]
+ end
+
+ def test_load_schema
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, true)
+ assert { fields.to_a.as_json == base_schema }
+ end
+
+ def test_load_schema_allow_overwrite_with_type_changed_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, true)
+
+ fields.load_schema(base_schema_with_type_changed_column, true)
+ assert { fields.to_a.as_json == base_schema_with_type_changed_column }
+ end
+
+ def test_load_schema_allow_overwrite_with_new_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, true)
+
+ fields.load_schema(base_schema_with_new_column, true)
+ assert { fields.to_a.as_json == base_schema_with_new_column }
+ end
+
+ def test_load_schema_not_allow_overwrite_with_type_changed_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, false)
+
+ fields.load_schema(base_schema_with_type_changed_column, false)
+ assert { fields.to_a.as_json == base_schema }
+ end
+
+ def test_load_schema_no_allow_overwrite_with_new_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, false)
+
+ fields.load_schema(base_schema_with_new_column, false)
+ assert { fields.to_a.as_json == base_schema_with_new_column }
+ end
+
+ def test_format_one
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, false)
+
+ time = Time.local(2016, 2, 7, 19, 0, 0).utc
+
+ formatted = fields.format_one({
+ "time" => time, "tty" => nil, "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", 42]
+ })
+ assert_equal(
+ formatted,
+ {
+ "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", "42"]
+ }
+ )
+ end
+
+ def test_format_one_with_extra_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, false)
+
+ time = Time.local(2016, 2, 7, 19, 0, 0).utc
+
+ formatted = fields.format_one({
+ "time" => time, "tty" => nil, "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", 42.195], "extra" => "extra_data"
+ })
+ assert_equal(
+ formatted,
+ {
+ "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", "42.195"], "extra" => "extra_data"
+ }
+ )
+ end
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: fluent-plugin-bigquery-custom
  version: !ruby/object:Gem::Version
- version: 0.3.2
+ version: 0.3.6
  platform: ruby
  authors:
  - Tomohiro Hashidate
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-01-22 00:00:00.000000000 Z
+ date: 2016-02-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rake
@@ -183,6 +183,7 @@ files:
  - lib/fluent/plugin/out_bigquery.rb
  - test/helper.rb
  - test/plugin/test_out_bigquery.rb
+ - test/plugin/test_record_schema.rb
  - test/plugin/testdata/apache.schema
  - test/plugin/testdata/json_key.json
  - test/plugin/testdata/sudo.schema
@@ -213,6 +214,7 @@ summary: Fluentd plugin to store data on Google BigQuery
  test_files:
  - test/helper.rb
  - test/plugin/test_out_bigquery.rb
+ - test/plugin/test_record_schema.rb
  - test/plugin/testdata/apache.schema
  - test/plugin/testdata/json_key.json
  - test/plugin/testdata/sudo.schema