fluent-plugin-bigquery-custom 0.3.2 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: cf7ef29505d6d6d1a7c0b1bb4418d33be7e81b1f
- data.tar.gz: 53193d13414cc8a1ef05f3825469a2c5cba0fb36
+ metadata.gz: 4af83e7241135e5fa4386ddd2342bc990448fd58
+ data.tar.gz: 94d7f710bcf578befd9fc1a0ba4cf35eaf79e72b
  SHA512:
- metadata.gz: bc5e2572202bcb9f99531cb1a6f03645310e91d5edc6133a5d509e5263e9f2efb3a203acfbd01cdcda14e1482f14251db89a053045cea2c6714b601c0f9b7c0a
- data.tar.gz: b30ddefbaa82d17732f483733d924962ffadb85677b9734e3794a0381de0865ea9804e569a9718ac4bc0c5111f9d15f5ab091130fe9a48e6e137c64cd9eef712
+ metadata.gz: aad322ae03a689b9bd9459f7eae50f8990801e95bd0f8816a9ce80948b6f962da2724ef35a8728d71d36c266b8a878507953e865f80928503b2ff925c4109f9d
+ data.tar.gz: 407e7510c1739175cfc40dc476df07bc01526221cc9a77712e9db0d21545ce175143d91e3d21dfe609790e0fa47598f7d9c303f818df9ac0a03837b6c0b6027e
data/README.md CHANGED
@@ -27,7 +27,15 @@ OAuth flow for installed applications.
  - `max_bad_records`
  - `ignore_unknown_values`
  - `prevent_duplicate_load`
+ - `template_suffix`
+ - `schema_cache_expire`
  - Improve error handling
+ - Add templateSuffix feature
+ - `template_suffix` accepts the same placeholders as `table`
+ - When the load method is used, the templateSuffix behaviour is emulated, with slight differences from Streaming Insert:
+ 1. Fetch the schema from the base table once per `schema_cache_expire` interval.
+ 1. If the target table exists, submit the load job without schema data.
+ 1. If the target table does not exist, submit the load job with the fetched schema data.

  ## Configuration

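The two new options listed above share the `table` placeholder rules. A minimal, illustrative sketch (not taken from the package; the table and suffix templates here are made up) of how the same strftime-style placeholders expand for both the base table and `template_suffix`, which the streaming path sends to `tabledata.insertAll` and the load path emulates by writing into the concatenated table name:

```ruby
require "time"

# Illustrative only: "accesslog_%Y%m" and "_%Y%m%d" are hypothetical templates,
# but `table` and `template_suffix` are expanded with the same placeholders.
now = Time.utc(2016, 2, 9)

table_id        = now.strftime("accesslog_%Y%m")   # => "accesslog_201602"
template_suffix = now.strftime("_%Y%m%d")           # => "_20160209"

# Streaming inserts pass the suffix to tabledata.insertAll (templateSuffix);
# the load method emulates it by loading into the concatenated table name.
puts "#{table_id}#{template_suffix}"                 # => "accesslog_201602_20160209"
```

In a Fluentd `<match>` section the options would then be set as plain key/value pairs, e.g. `template_suffix _%Y%m%d` together with `schema_cache_expire 600` (600 seconds being the default added below).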
@@ -1,6 +1,6 @@
  module Fluent
  module BigQueryPlugin
- VERSION = "0.3.2"
+ VERSION = "0.3.6"
  end
  end

@@ -63,6 +63,7 @@ module Fluent
  # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
  config_param :table, :string, default: nil
  config_param :tables, :string, default: nil
+ config_param :template_suffix, :string, default: nil

  config_param :auto_create_table, :bool, default: false

@@ -82,6 +83,7 @@ module Fluent

  config_param :schema_path, :string, default: nil
  config_param :fetch_schema, :bool, default: false
+ config_param :schema_cache_expire, :time, default: 600
  config_param :field_string, :string, default: nil
  config_param :field_integer, :string, default: nil
  config_param :field_float, :string, default: nil
@@ -171,7 +173,7 @@ module Fluent
  require 'digest/sha1'
  extend(LoadImplementation)
  else
- raise Fluend::ConfigError "'method' must be 'insert' or 'load'"
+ raise Fluend::ConfigError, "'method' must be 'insert' or 'load'"
  end

  case @auth_method
@@ -253,8 +255,10 @@ module Fluent

  @tables_queue = @tablelist.dup.shuffle
  @tables_mutex = Mutex.new
+ @fetch_schema_mutex = Mutex.new

- fetch_schema() if @fetch_schema
+ @last_fetch_schema_time = 0
+ fetch_schema(false) if @fetch_schema
  end

  def client
@@ -356,32 +360,52 @@ module Fluent
  t
  end
  table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now), chunk)
- _write(chunk, table_id)
+ template_suffix = @template_suffix ? generate_table_id(@template_suffix, Time.at(Fluent::Engine.now), chunk) : nil
+ _write(chunk, table_id, template_suffix)
  end

- def fetch_schema
- table_id_format = @tablelist[0]
- table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
- res = client.get_table(@project, @dataset, table_id)
-
- schema = res.schema.fields.as_json
- log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
- @fields.load_schema(schema, false)
+ def fetch_schema(allow_overwrite = true)
+ table_id = nil
+ @fetch_schema_mutex.synchronize do
+ if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
+ table_id_format = @tablelist[0]
+ table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
+ res = client.get_table(@project, @dataset, table_id)
+
+ schema = res.schema.fields.as_json
+ log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
+ if allow_overwrite
+ fields = RecordSchema.new("record")
+ fields.load_schema(schema, allow_overwrite)
+ @fields = fields
+ else
+ @fields.load_schema(schema, allow_overwrite)
+ end
+ @last_fetch_schema_time = Fluent::Engine.now
+ end
+ end
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
  # api_error? -> client cache clear
  @cached_client = nil
  message = e.message
  log.error "tables.get API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
- raise "failed to fetch schema from bigquery" # TODO: error class
+ if @fields.empty?
+ raise "failed to fetch schema from bigquery" # TODO: error class
+ else
+ log.warn "Use previous schema"
+ @last_fetch_schema_time = Fluent::Engine.now
+ end
  end

  module InsertImplementation
  def format(tag, time, record)
- buf = ''
+ fetch_schema if @template_suffix

  if @replace_record_key
  record = replace_record_key(record)
  end
+
+ buf = String.new
  row = @fields.format(@add_time_field.call(record, time))
  unless row.empty?
  row = {"json" => row}
@@ -391,18 +415,20 @@ module Fluent
  buf
  end

- def _write(chunk, table_id)
+ def _write(chunk, table_id, template_suffix)
  rows = []
  chunk.msgpack_each do |row_object|
  # TODO: row size limit
  rows << row_object.deep_symbolize_keys
  end

- res = client.insert_all_table_data(@project, @dataset, table_id, {
+ body = {
  rows: rows,
  skip_invalid_rows: @skip_invalid_rows,
  ignore_unknown_values: @ignore_unknown_values,
- }, {})
+ }
+ body.merge!(template_suffix: template_suffix) if template_suffix
+ res = client.insert_all_table_data(@project, @dataset, table_id, body, {})

  if res.insert_errors
  reasons = []
@@ -428,7 +454,7 @@ module Fluent
  end

  reason = e.respond_to?(:reason) ? e.reason : nil
- log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
+ log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: reason

  raise "failed to insert into bigquery, retry" if reason == "backendError" # backendError is retryable. TODO: error class

@@ -441,11 +467,13 @@

  module LoadImplementation
  def format(tag, time, record)
- buf = ''
+ fetch_schema if @template_suffix

  if @replace_record_key
  record = replace_record_key(record)
  end
+
+ buf = String.new
  row = @fields.format(@add_time_field.call(record, time))
  unless row.empty?
  buf << MultiJson.dump(row) + "\n"
@@ -453,7 +481,7 @@
  buf
  end

- def _write(chunk, table_id)
+ def _write(chunk, table_id, template_suffix)
  res = nil
  job_id = nil

@@ -461,25 +489,7 @@
  if @prevent_duplicate_load
  job_id = create_job_id(upload_source.path, @dataset, @table, @fields.to_a, @max_bad_records, @ignore_unknown_values)
  end
- configuration = {
- configuration: {
- load: {
- destination_table: {
- project_id: @project,
- dataset_id: @dataset,
- table_id: table_id,
- },
- schema: {
- fields: @fields.to_a,
- },
- write_disposition: "WRITE_APPEND",
- source_format: "NEWLINE_DELIMITED_JSON",
- ignore_unknown_values: @ignore_unknown_values,
- max_bad_records: @max_bad_records,
- }
- }
- }
- configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+ configuration = load_configuration(table_id, template_suffix, upload_source)
  res = client.insert_job(@project, configuration, {upload_source: upload_source, content_type: "application/octet-stream"})
  end

@@ -502,6 +512,45 @@

  private

+ def load_configuration(table_id, template_suffix, upload_source)
+ job_id = nil
+ if @prevent_duplicate_load
+ job_id = create_job_id(upload_source.path, @dataset, "#{table_id}#{template_suffix}", @fields.to_a, @max_bad_records, @ignore_unknown_values)
+ end
+
+ configuration = {
+ configuration: {
+ load: {
+ destination_table: {
+ project_id: @project,
+ dataset_id: @dataset,
+ table_id: "#{table_id}#{template_suffix}",
+ },
+ schema: {
+ fields: @fields.to_a,
+ },
+ write_disposition: "WRITE_APPEND",
+ source_format: "NEWLINE_DELIMITED_JSON",
+ ignore_unknown_values: @ignore_unknown_values,
+ max_bad_records: @max_bad_records,
+ }
+ }
+ }
+ configuration.merge!({job_reference: {project_id: @project, job_id: job_id}}) if job_id
+
+ # If target table is already exist, omit schema configuration.
+ # Because schema changing is easier.
+ begin
+ if template_suffix && client.get_table(@project, @dataset, "#{table_id}#{template_suffix}")
+ configuration[:configuration][:load].delete(:schema)
+ end
+ rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError
+ raise "Schema is empty" if @fields.empty?
+ end
+
+ configuration
+ end
+
  def wait_load(job_id)
  wait_interval = 10
  _response = client.get_job(@project, job_id)
@@ -581,8 +630,12 @@
  when :nullable
  format_one(value) unless value.nil?
  when :required
- raise "Required field #{name} cannot be null" if value.nil?
- format_one(value)
+ if value.nil?
+ log.warn "Required field #{name} cannot be null"
+ nil
+ else
+ format_one(value)
+ end
  when :repeated
  value.nil? ? [] : value.map {|v| format_one(v) }
  end
@@ -642,12 +695,32 @@
  end

  class TimestampFieldSchema < FieldSchema
+ INTEGER_REGEXP = /\A-?[[:digit:]]+\z/.freeze
+ FLOAT_REGEXP = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/.freeze
+
  def type
  :timestamp
  end

  def format_one(value)
- value
+ case value
+ when Time
+ value.strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
+ when String
+ if value =~ INTEGER_REGEXP
+ value.to_i
+ elsif value =~ FLOAT_REGEXP
+ value.to_f
+ else
+ begin
+ Time.parse(value).strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
+ rescue
+ value
+ end
+ end
+ else
+ value
+ end
  end
  end

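The `TimestampFieldSchema#format_one` rewrite above coerces each supported input shape instead of passing every value through. A standalone sketch of that behaviour, reusing the regexes and format string from the hunk above (the helper name and sample inputs are made up for illustration):

```ruby
require "time"

# Standalone sketch of the coercion in TimestampFieldSchema#format_one (illustrative).
INTEGER_REGEXP = /\A-?[[:digit:]]+\z/
FLOAT_REGEXP   = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%6L %:z"

def format_timestamp(value)
  case value
  when Time
    value.strftime(TIMESTAMP_FORMAT)                  # Time objects are rendered directly
  when String
    if value =~ INTEGER_REGEXP
      value.to_i                                      # "1455001200" -> 1455001200 (epoch seconds)
    elsif value =~ FLOAT_REGEXP
      value.to_f                                      # "1455001200.25" -> 1455001200.25
    else
      begin
        Time.parse(value).strftime(TIMESTAMP_FORMAT)  # parseable strings are normalized
      rescue
        value                                         # unparseable strings fall through unchanged
      end
    end
  else
    value                                             # integers, floats, nil, etc. are untouched
  end
end

puts format_timestamp(Time.utc(2016, 2, 9))   # => "2016-02-09 00:00:00.000000 +00:00"
puts format_timestamp("2016/02/09 12:00:00")  # normalized in the local zone
puts format_timestamp("1455001200")           # => 1455001200
```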
@@ -674,6 +747,10 @@
  @fields[name]
  end

+ def empty?
+ @fields.empty?
+ end
+
  def to_a
  @fields.map do |_, field_schema|
  field_schema.to_h
@@ -729,11 +806,10 @@

  def format_one(record)
  out = {}
- @fields.each do |key, schema|
- value = record[key]
- formatted = schema.format(value)
- next if formatted.nil? # field does not exists, or null value
- out[key] = formatted
+ record.each do |key, value|
+ next if value.nil?
+ schema = @fields[key]
+ out[key] = schema ? schema.format(value) : value
  end
  out
  end
@@ -275,9 +275,15 @@ class BigQueryOutputTest < Test::Unit::TestCase
  "requesttime" => (now - 1).to_f.to_s.to_f,
  "bot_access" => true,
  "loginsession" => false,
+ "something-else" => "would be ignored",
+ "yet-another" => {
+ "foo" => "bar",
+ "baz" => 1,
+ },
  "remote" => {
  "host" => "remote.example",
  "ip" => "192.0.2.1",
+ "port" => 12345,
  "user" => "tagomoris",
  }
  }
@@ -429,12 +435,18 @@ class BigQueryOutputTest < Test::Unit::TestCase
  "remote" => {
  "host" => "remote.example",
  "ip" => "192.0.2.1",
+ "port" => 12345,
  "user" => "tagomoris",
  },
  "response" => {
  "status" => 1,
  "bytes" => 3,
  },
+ "something-else" => "would be ignored",
+ "yet-another" => {
+ "foo" => "bar",
+ "baz" => 1,
+ },
  }
  }

@@ -739,38 +751,6 @@ class BigQueryOutputTest < Test::Unit::TestCase
  assert_equal expected, buf
  end

- def test_empty_value_in_required
- now = Time.now
- input = [
- now,
- {
- "tty" => "pts/1",
- "pwd" => "/home/yugui",
- "user" => nil,
- "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
- }
- ]
-
- driver = create_driver(<<-CONFIG)
- table foo
- email foo@bar.example
- private_key_path /path/to/key
- project yourproject_id
- dataset yourdataset_id
-
- time_format %s
- time_field time
-
- schema_path #{File.join(File.dirname(__FILE__), "testdata", "sudo.schema")}
- field_integer time
- CONFIG
- driver.instance.start
- assert_raises(RuntimeError.new("Required field user cannot be null")) do
- driver.instance.format_stream("my.tag", [input])
- end
- driver.instance.shutdown
- end
-
  def test_replace_record_key
  now = Time.now
  input = [
@@ -0,0 +1,173 @@
+ require 'helper'
+ require 'active_support/json'
+ require 'active_support/core_ext/hash'
+ require 'active_support/core_ext/object/json'
+
+ class RecordSchemaTest < Test::Unit::TestCase
+ def base_schema
+ [
+ {
+ "name" => "time",
+ "type" => "TIMESTAMP",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "tty",
+ "type" => "STRING",
+ "mode" => "NULLABLE"
+ },
+ {
+ "name" => "pwd",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "user",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "argv",
+ "type" => "STRING",
+ "mode" => "REPEATED"
+ }
+ ]
+ end
+
+ def base_schema_with_new_column
+ [
+ {
+ "name" => "time",
+ "type" => "TIMESTAMP",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "tty",
+ "type" => "STRING",
+ "mode" => "NULLABLE"
+ },
+ {
+ "name" => "pwd",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "user",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "argv",
+ "type" => "STRING",
+ "mode" => "REPEATED"
+ },
+ {
+ "name" => "new_column",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ }
+ ]
+ end
+
+ def base_schema_with_type_changed_column
+ [
+ {
+ "name" => "time",
+ "type" => "INTEGER", # change type
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "tty",
+ "type" => "STRING",
+ "mode" => "NULLABLE"
+ },
+ {
+ "name" => "pwd",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "user",
+ "type" => "STRING",
+ "mode" => "REQUIRED"
+ },
+ {
+ "name" => "argv",
+ "type" => "STRING",
+ "mode" => "REPEATED"
+ },
+ ]
+ end
+
+ def test_load_schema
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, true)
+ assert { fields.to_a.as_json == base_schema }
+ end
+
+ def test_load_schema_allow_overwrite_with_type_changed_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, true)
+
+ fields.load_schema(base_schema_with_type_changed_column, true)
+ assert { fields.to_a.as_json == base_schema_with_type_changed_column }
+ end
+
+ def test_load_schema_allow_overwrite_with_new_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, true)
+
+ fields.load_schema(base_schema_with_new_column, true)
+ assert { fields.to_a.as_json == base_schema_with_new_column }
+ end
+
+ def test_load_schema_not_allow_overwrite_with_type_changed_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, false)
+
+ fields.load_schema(base_schema_with_type_changed_column, false)
+ assert { fields.to_a.as_json == base_schema }
+ end
+
+ def test_load_schema_no_allow_overwrite_with_new_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, false)
+
+ fields.load_schema(base_schema_with_new_column, false)
+ assert { fields.to_a.as_json == base_schema_with_new_column }
+ end
+
+ def test_format_one
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, false)
+
+ time = Time.local(2016, 2, 7, 19, 0, 0).utc
+
+ formatted = fields.format_one({
+ "time" => time, "tty" => nil, "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", 42]
+ })
+ assert_equal(
+ formatted,
+ {
+ "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", "42"]
+ }
+ )
+ end
+
+ def test_format_one_with_extra_column
+ fields = Fluent::BigQueryOutput::RecordSchema.new("record")
+ fields.load_schema(base_schema, false)
+
+ time = Time.local(2016, 2, 7, 19, 0, 0).utc
+
+ formatted = fields.format_one({
+ "time" => time, "tty" => nil, "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", 42.195], "extra" => "extra_data"
+ })
+ assert_equal(
+ formatted,
+ {
+ "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "pwd" => "/home", "user" => "joker1007", "argv" => ["foo", "42.195"], "extra" => "extra_data"
+ }
+ )
+ end
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: fluent-plugin-bigquery-custom
  version: !ruby/object:Gem::Version
- version: 0.3.2
+ version: 0.3.6
  platform: ruby
  authors:
  - Tomohiro Hashidate
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-01-22 00:00:00.000000000 Z
+ date: 2016-02-09 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rake
@@ -183,6 +183,7 @@ files:
  - lib/fluent/plugin/out_bigquery.rb
  - test/helper.rb
  - test/plugin/test_out_bigquery.rb
+ - test/plugin/test_record_schema.rb
  - test/plugin/testdata/apache.schema
  - test/plugin/testdata/json_key.json
  - test/plugin/testdata/sudo.schema
@@ -213,6 +214,7 @@ summary: Fluentd plugin to store data on Google BigQuery
  test_files:
  - test/helper.rb
  - test/plugin/test_out_bigquery.rb
+ - test/plugin/test_record_schema.rb
  - test/plugin/testdata/apache.schema
  - test/plugin/testdata/json_key.json
  - test/plugin/testdata/sudo.schema