RubyGems - fluent-plugin-bigquery - Versions diffs - 2.1.0 → 2.2.0 - Mend

fluent-plugin-bigquery 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/README.md +10 -5
data/lib/fluent/plugin/bigquery/schema.rb +11 -0
data/lib/fluent/plugin/bigquery/version.rb +1 -1
data/lib/fluent/plugin/bigquery/writer.rb +16 -2
data/lib/fluent/plugin/out_bigquery_base.rb +8 -2
data/test/plugin/test_out_bigquery_insert.rb +90 -1
data/test/plugin/test_out_bigquery_load.rb +2 -2
data/test/plugin/test_record_schema.rb +17 -2
metadata +3 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b463f412345eb71d1b263bf56e0cd51ebe1c2dacffaa223293edb8d4e5776e73
-  data.tar.gz: f5f7766b2d0f4498239389ef38eb29ef9d20dbe9b118890e8d651b23330d33ca
+  metadata.gz: 36b950bf0783d3ce350d7c7514f5b7946b10fe4b867aec015c9331656e86eb48
+  data.tar.gz: b4b8e92f41008043b09822b20698a7e29ca8daf9ba69c2a5c38c696553e86d71
 SHA512:
-  metadata.gz: 8d3851b83d9cbc7c802836dc5f5709d2f92009f980a3a6d3566730eea55fdaf697540c0370220441ed1d88687c27eb8677506e9897693469ef4fcb347d1e7825
-  data.tar.gz: 39223f99503c53a812549b4ff8de2a94c3b7db670e6dd9819840d86d561fe68c922f82c18f0201abe8625b2dbf79d0741413d21c56d9d0855b1889b68946a2f8
+  metadata.gz: 01d3d39d9247134ca9059b990d0d6a52f308b27711d8cd989de30dfeb4e91a1673f1047d4e9269d24447169d9ec4bbac1d0d9b9f7d93b08b7be5d6c170593f1f
+  data.tar.gz: f226de7925fb048ba5533bf9b7c626f43e4b63eeb92c119d700737d1ae44611fb6fe6294e1ed5f989456de2ee3e1f98334c2d4cd1d89c49b52ef945a3674c8ce

data/README.md CHANGED

@@ -44,6 +44,7 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 | private_key_path                              | string        | yes (private_key)                            | no           | nil                        | GCP Private Key file path                                                                              |
 | private_key_passphrase                        | string        | yes (private_key)                            | no           | nil                        | GCP Private Key Passphrase                                                                             |
 | json_key                                      | string        | yes (json_key)                               | no           | nil                        | GCP JSON Key file path or JSON Key string                                                              |
+| location                                      | string        | no                                           | no           | nil                        | BigQuery Data Location. The geographic location of the job. Required except for US and EU.             |
 | project                                       | string        | yes                                          | yes          | nil                        |                                                                                                        |
 | dataset                                       | string        | yes                                          | yes          | nil                        |                                                                                                        |
 | table                                         | string        | yes (either `tables`)                        | yes          | nil                        |                                                                                                        |
@@ -57,10 +58,10 @@ Because embbeded gem dependency sometimes restricts ruby environment.
 | schema_cache_expire                           | integer       | no                                           | no           | 600                        | Value is second. If current time is after expiration interval, re-fetch table schema definition.       |
 | request_timeout_sec                           | integer       | no                                           | no           | nil                        | Bigquery API response timeout                                                                          |
 | request_open_timeout_sec                      | integer       | no                                           | no           | 60                         | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value.       |
-| time_partitioning_type                        | enum          | no (either day)                              | no           | nil                        | Type of bigquery time partitioning feature(experimental feature on BigQuery).                          |
-| time_partitioning_field                       | string        | no                                           | no           | nil                        | Field used to determine how to create a time-based partition(experimental feature on BigQuery).        |
-| time_partitioning_expiration                  | time          | no                                           | no           | nil                        | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery)             |
-| time_partitioning_require_partition_filter    | bool          | no                                           | no           | false                      | If true, queries over this table require a partition filter that can be used for partition elimination to be specified. (experimental feature on BigQuery) |
+| time_partitioning_type                        | enum          | no (either day)                              | no           | nil                        | Type of bigquery time partitioning feature.                                                            |
+| time_partitioning_field                       | string        | no                                           | no           | nil                        | Field used to determine how to create a time-based partition.                                          |
+| time_partitioning_expiration                  | time          | no                                           | no           | nil                        | Expiration milliseconds for bigquery time partitioning.                                                |
+| clustering_fields                             | array(string) | no                                           | no           | nil                        | One or more fields on which data should be clustered. The order of the specified columns determines the sort order of the data. |
 #### bigquery_insert
@@ -433,7 +434,7 @@ Use placeholder.
 ```apache
 <match dummy>
-  @type bigquery_insert
+  @type bigquery_load
   ...
   table   accesslog$%Y%m%d
@@ -446,6 +447,8 @@ Use placeholder.
 ```
 However, dynamic table creation doesn't support date-partitioned tables yet.
+Also, streaming insert cannot write to a table name that has the `$%Y%m%d` suffix.
+If you use a date-partitioned table with streaming insert, please omit the `$%Y%m%d` suffix from `table`.
 ### Dynamic table creating
@@ -467,6 +470,8 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
 </match>
 ```
+You can also create a clustered table by using `clustering_fields`.
 ### Table schema
 There are three methods to describe the schema of the target table.

data/lib/fluent/plugin/bigquery/schema.rb CHANGED

@@ -86,6 +86,16 @@ module Fluent
       end
     end
+    class NumericFieldSchema < FieldSchema
+      def type
+        :numeric
+      end
+      def format_one(value)
+        value.to_s
+      end
+    end
     class BooleanFieldSchema < FieldSchema
       def type
         :boolean
@@ -169,6 +179,7 @@ module Fluent
         string: StringFieldSchema,
         integer: IntegerFieldSchema,
         float: FloatFieldSchema,
+        numeric: NumericFieldSchema,
         boolean: BooleanFieldSchema,
         timestamp: TimestampFieldSchema,
         date: DateFieldSchema,

data/lib/fluent/plugin/bigquery/version.rb CHANGED

@@ -1,5 +1,5 @@
 module Fluent
   module BigQueryPlugin
-    VERSION = "2.1.0".freeze
+    VERSION = "2.2.0".freeze
   end
 end

data/lib/fluent/plugin/bigquery/writer.rb CHANGED

@@ -35,6 +35,7 @@ module Fluent
           }
           definition.merge!(time_partitioning: time_partitioning) if time_partitioning
+          definition.merge!(clustering: clustering) if clustering
           client.insert_table(project, dataset, definition, {})
           log.debug "create table", project_id: project, dataset: dataset, table: table_id
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
@@ -149,6 +150,7 @@ module Fluent
             raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
             configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
             configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
+            configuration[:configuration][:load].merge!(clustering: clustering) if clustering
           end
         end
@@ -174,8 +176,9 @@ module Fluent
       def fetch_load_job(job_reference)
         project = job_reference.project_id
         job_id = job_reference.job_id
+        location = @options[:location]
-        res = client.get_job(project, job_id)
+        res = client.get_job(project, job_id, location: location)
         log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
         if res.status.state == "DONE"
@@ -309,13 +312,24 @@ module Fluent
             type: @options[:time_partitioning_type].to_s.upcase,
             field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
             expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
-            require_partition_filter: @options[:time_partitioning_require_partition_filter],
           }.reject { |_, v| v.nil? }
         else
           @time_partitioning
         end
       end
+      def clustering
+        return @clustering if instance_variable_defined?(:@clustering)
+        if @options[:clustering_fields]
+          @clustering = {
+            fields: @options[:clustering_fields]
+          }
+        else
+          @clustering
+        end
+      end
       def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
         try_count ||= 1
         res = client.insert_all_table_data(project, dataset, table_id, body, {})

data/lib/fluent/plugin/out_bigquery_base.rb CHANGED

@@ -29,6 +29,9 @@ module Fluent
       config_param :private_key_path, :string, default: nil
       config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
       config_param :json_key, default: nil, secret: true
+      # The geographic location of the job. Required except for US and EU.
+      # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
+      config_param :location, :string, default: nil
       # see as simple reference
       #   https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
@@ -69,7 +72,9 @@ module Fluent
       config_param :time_partitioning_type, :enum, list: [:day], default: nil
       config_param :time_partitioning_field, :string, default: nil
       config_param :time_partitioning_expiration, :time, default: nil
-      config_param :time_partitioning_require_partition_filter, :bool, default: false
+      ## Clustering
+      config_param :clustering_fields, :array, default: nil
       ## Formatter
       config_section :format do
@@ -132,6 +137,7 @@ module Fluent
           private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
           email: @email,
           json_key: @json_key,
+          location: @location,
           source_format: @source_format,
           skip_invalid_rows: @skip_invalid_rows,
           ignore_unknown_values: @ignore_unknown_values,
@@ -142,7 +148,7 @@ module Fluent
           time_partitioning_type: @time_partitioning_type,
           time_partitioning_field: @time_partitioning_field,
           time_partitioning_expiration: @time_partitioning_expiration,
-          time_partitioning_require_partition_filter: @time_partitioning_require_partition_filter,
+          clustering_fields: @clustering_fields,
           timeout_sec: @request_timeout_sec,
           open_timeout_sec: @request_open_timeout_sec,
         })

data/test/plugin/test_out_bigquery_insert.rb CHANGED

@@ -400,6 +400,85 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       }
     }
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+      time_format %s
+      time_field  time
+      auto_create_table true
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+      time_partitioning_type day
+      time_partitioning_field time
+      time_partitioning_expiration 1h
+    CONFIG
+    stub_writer do |writer|
+      body = {
+        rows: [message],
+        skip_invalid_rows: false,
+        ignore_unknown_values: false,
+      }
+      mock(writer.client).insert_all_table_data('yourproject_id', 'yourdataset_id', 'foo', body, {}) do
+        raise Google::Apis::ClientError.new("notFound: Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404)
+      end.at_least(1)
+      mock(writer).sleep(instance_of(Numeric)) { nil }.at_least(1)
+      mock(writer.client).insert_table('yourproject_id', 'yourdataset_id', {
+        table_reference: {
+          table_id: 'foo',
+        },
+        schema: {
+          fields: driver.instance.instance_variable_get(:@table_schema).to_a,
+        },
+        time_partitioning: {
+          type: 'DAY',
+          field: 'time',
+          expiration_ms: 3600000,
+        },
+      }, {})
+    end
+    assert_raise(RuntimeError) do
+      driver.run do
+        driver.feed("tag", Fluent::EventTime.now, message[:json])
+      end
+    end
+  end
+  def test_auto_create_clustered_table_by_bigquery_api
+    now = Time.now
+    message = {
+      json: {
+        time: now.to_i,
+        request: {
+          vhost: "bar",
+          path: "/path/to/baz",
+          method: "GET",
+          protocol: "HTTP/1.0",
+          agent: "libwww",
+          referer: "http://referer.example",
+          time: (now - 1).to_f,
+          bot_access: true,
+          loginsession: false,
+        },
+        remote: {
+          host: "remote.example",
+          ip: "192.168.1.1",
+          user: "nagachika",
+        },
+        response: {
+          status: 200,
+          bytes: 72,
+        },
+      }
+    }
     driver = create_driver(<<-CONFIG)
       table foo
       email foo@bar.example
@@ -417,6 +496,11 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
       time_partitioning_field time
       time_partitioning_expiration 1h
       time_partitioning_require_partition_filter true
+      clustering_fields [
+        "time",
+        "vhost"
+      ]
     CONFIG
     stub_writer do |writer|
@@ -441,7 +525,12 @@ class BigQueryInsertOutputTest < Test::Unit::TestCase
           type: 'DAY',
           field: 'time',
           expiration_ms: 3600000,
-          require_partition_filter: true
+        },
+        clustering: {
+          fields: [
+            'time',
+            'vhost',
+          ],
         },
       }, {})
     end

data/test/plugin/test_out_bigquery_load.rb CHANGED

@@ -158,7 +158,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
         stub!.job_reference.stub!.job_id { "dummy_job_id" }
       end
-      mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
+      mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
         stub! do |s|
           s.id { 'dummy_job_id' }
           s.configuration.stub! do |_s|
@@ -241,7 +241,7 @@ class BigQueryLoadOutputTest < Test::Unit::TestCase
         stub!.job_reference.stub!.job_id { "dummy_job_id" }
       end
-      mock(writer.client).get_job('yourproject_id', 'dummy_job_id') do
+      mock(writer.client).get_job('yourproject_id', 'dummy_job_id', {:location=>nil}) do
         stub! do |s|
           s.id { 'dummy_job_id' }
           s.configuration.stub! do |_s|

data/test/plugin/test_record_schema.rb CHANGED

@@ -27,6 +27,11 @@ class RecordSchemaTest < Test::Unit::TestCase
         "name" => "argv",
         "type" => "STRING",
         "mode" => "REPEATED"
+      },
+      {
+        "name" => "utilisation",
+        "type" => "NUMERIC",
+        "mode" => "NULLABLE"
       }
     ]
   end
@@ -58,6 +63,11 @@ class RecordSchemaTest < Test::Unit::TestCase
         "type" => "STRING",
         "mode" => "REPEATED"
       },
+      {
+        "name" => "utilisation",
+        "type" => "NUMERIC",
+        "mode" => "NULLABLE"
+      },
       {
         "name" => "new_column",
         "type" => "STRING",
@@ -93,6 +103,11 @@ class RecordSchemaTest < Test::Unit::TestCase
         "type" => "STRING",
         "mode" => "REPEATED"
       },
+      {
+        "name" => "utilisation",
+        "type" => "NUMERIC",
+        "mode" => "NULLABLE"
+      }
     ]
   end
@@ -142,12 +157,12 @@ class RecordSchemaTest < Test::Unit::TestCase
     time = Time.local(2016, 2, 7, 19, 0, 0).utc
     formatted = fields.format_one({
-      "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42]
+      "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42], "utilisation" => "0.837"
     })
     assert_equal(
       formatted,
       {
-        "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"]
+        "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"], "utilisation" => "0.837"
       }
     )
   end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 2.1.0
+  version: 2.2.0
 platform: ruby
 authors:
 - Naoya Ito
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-11-05 00:00:00.000000000 Z
+date: 2019-08-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -183,8 +183,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.7
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: Fluentd plugin to store data on Google BigQuery