fluent-plugin-bigquery 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +102 -65
- data/lib/fluent/plugin/bigquery/schema.rb +52 -1
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +21 -5
- data/lib/fluent/plugin/out_bigquery.rb +15 -11
- data/test/plugin/test_out_bigquery.rb +120 -238
- data/test/plugin/test_record_schema.rb +17 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7d4074dc903c423acbebd56b2b4d6fc0ce110510
+  data.tar.gz: 4d17cd1b2ee3768b83845105b5b9a714835e0a4c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7f99c64e394650b7eac03e6872dcfafb36981f48a726d8aba9d87fc83b45329ebac925c7d1239113995597e4697d8afcf1f9397c8583d0a3bbe11d47aedd668b
+  data.tar.gz: 52554bcd622e75486fc8a10ceeebd8af958ac5523869f2ae964324c1348b734fcef00c4232766484a0d3112c15b50eb06f334b6efaf2cd55394321139bc1df9e
data/README.md
CHANGED
@@ -21,47 +21,48 @@ If you use ruby-2.1 or earlier, you must use activesupport-4.2.x or earlier.
 
 ### Options
 
-| name | type | required?
-| :------------------------------------- | :------------ | :-----------
-| method | string | no
-| buffer_type | string | no
-| buffer_chunk_limit | integer | no
-| buffer_queue_limit | integer | no
-| buffer_chunk_records_limit | integer | no
-| flush_interval | float | no
-| try_flush_interval | float | no
-| auth_method | enum | yes
-| email | string | yes (private_key)
-| private_key_path | string | yes (private_key)
-| private_key_passphrase | string | yes (private_key)
-| json_key | string | yes (json_key)
-| project | string | yes
-| table | string | yes (either `tables`)
-| tables | string | yes (either `table`)
-| template_suffix | string | no
-| auto_create_table | bool | no
-| skip_invalid_rows | bool | no
-| max_bad_records | integer | no
-| ignore_unknown_values | bool | no
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+| name | type | required? | default | description |
+| :------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
+| method | string | no | insert | `insert` (Streaming Insert) or `load` (load job) |
+| buffer_type | string | no | lightening (insert) or file (load) | |
+| buffer_chunk_limit | integer | no | 1MB (insert) or 1GB (load) | |
+| buffer_queue_limit | integer | no | 1024 (insert) or 32 (load) | |
+| buffer_chunk_records_limit | integer | no | 500 | |
+| flush_interval | float | no | 0.25 (*insert) or default of time sliced output (load) | |
+| try_flush_interval | float | no | 0.05 (*insert) or default of time sliced output (load) | |
+| auth_method | enum | yes | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+| email | string | yes (private_key) | nil | GCP Service Account Email |
+| private_key_path | string | yes (private_key) | nil | GCP Private Key file path |
+| private_key_passphrase | string | yes (private_key) | nil | GCP Private Key Passphrase |
+| json_key | string | yes (json_key) | nil | GCP JSON Key file path or JSON Key string |
+| project | string | yes | nil | |
+| table | string | yes (either `tables`) | nil | |
+| tables | string | yes (either `table`) | nil | can set multi table names splitted by `,` |
+| template_suffix | string | no | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
+| auto_create_table | bool | no | false | If true, creates table automatically |
+| skip_invalid_rows | bool | no | false | Only `insert` method. |
+| max_bad_records | integer | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
+| ignore_unknown_values | bool | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+| schema | array | yes (either `fetch_schema` or `schema_path`) | nil | Schema Definition. It is formatted by JSON. |
+| schema_path | string | yes (either `fetch_schema`) | nil | Schema Definition file path. It is formatted by JSON. |
+| fetch_schema | bool | yes (either `schema_path`) | false | If true, fetch table schema definition from Bigquery table automatically. |
+| fetch_schema_table | string | no | nil | If set, fetch table schema definition from this table, If fetch_schema is false, this param is ignored |
+| schema_cache_expire | integer | no | 600 | Value is second. If current time is after expiration interval, re-fetch table schema definition. |
+| field_string (deprecated) | string | no | nil | see examples. |
+| field_integer (deprecated) | string | no | nil | see examples. |
+| field_float (deprecated) | string | no | nil | see examples. |
+| field_boolean (deprecated) | string | no | nil | see examples. |
+| field_timestamp (deprecated) | string | no | nil | see examples. |
+| time_field | string | no | nil | If this param is set, plugin set formatted time string to this field. |
+| time_format | string | no | nil | ex. `%s`, `%Y/%m%d %H:%M:%S` |
+| replace_record_key | bool | no | false | see examples. |
+| replace_record_key_regexp{1-10} | string | no | nil | see examples. |
+| convert_hash_to_json (deprecated) | bool | no | false | If true, converts Hash value of record to JSON String. |
+| insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
+| request_timeout_sec | integer | no | nil | Bigquery API response timeout |
+| request_open_timeout_sec | integer | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
+| time_partitioning_type | enum | no (either day) | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
+| time_partitioning_expiration | time | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
 
 ### Standard Options
 
@@ -96,10 +97,25 @@ Configure insert specifications with target table schema, with your credentials.
   time_format %s
   time_field  time
 
-
-
-
-
+  schema [
+    {"name": "time", "type": "INTEGER"},
+    {"name": "status", "type": "INTEGER"},
+    {"name": "bytes", "type": "INTEGER"},
+    {"name": "vhost", "type": "STRING"},
+    {"name": "path", "type": "STRING"},
+    {"name": "method", "type": "STRING"},
+    {"name": "protocol", "type": "STRING"},
+    {"name": "agent", "type": "STRING"},
+    {"name": "referer", "type": "STRING"},
+    {"name": "remote", "type": "RECORD", "fields": [
+      {"name": "host", "type": "STRING"},
+      {"name": "ip", "type": "STRING"},
+      {"name": "user", "type": "STRING"}
+    ]},
+    {"name": "requesttime", "type": "FLOAT"},
+    {"name": "bot_access", "type": "BOOLEAN"},
+    {"name": "loginsession", "type": "BOOLEAN"}
+  ]
 </match>
 ```
 
@@ -130,10 +146,25 @@ For high rate inserts over streaming inserts, you should specify flush intervals
   time_format %s
   time_field  time
 
-
-
-
-
+  schema [
+    {"name": "time", "type": "INTEGER"},
+    {"name": "status", "type": "INTEGER"},
+    {"name": "bytes", "type": "INTEGER"},
+    {"name": "vhost", "type": "STRING"},
+    {"name": "path", "type": "STRING"},
+    {"name": "method", "type": "STRING"},
+    {"name": "protocol", "type": "STRING"},
+    {"name": "agent", "type": "STRING"},
+    {"name": "referer", "type": "STRING"},
+    {"name": "remote", "type": "RECORD", "fields": [
+      {"name": "host", "type": "STRING"},
+      {"name": "ip", "type": "STRING"},
+      {"name": "user", "type": "STRING"}
+    ]},
+    {"name": "requesttime", "type": "FLOAT"},
+    {"name": "bot_access", "type": "BOOLEAN"},
+    {"name": "loginsession", "type": "BOOLEAN"}
+  ]
 </match>
 ```
 
@@ -266,11 +297,7 @@ Compute Engine instance, then you can configure fluentd like this.
 
   time_format %s
   time_field  time
-
-  field_integer time,status,bytes
-  field_string  rhost,vhost,path,method,protocol,agent,referer
-  field_float   requesttime
-  field_boolean bot_access,loginsession
+  ...
 </match>
 ```
 
@@ -419,10 +446,25 @@ you can also specify nested fields by prefixing their belonging record fields.
   time_format %s
   time_field  time
 
-
-
-
-
+  schema [
+    {"name": "time", "type": "INTEGER"},
+    {"name": "status", "type": "INTEGER"},
+    {"name": "bytes", "type": "INTEGER"},
+    {"name": "vhost", "type": "STRING"},
+    {"name": "path", "type": "STRING"},
+    {"name": "method", "type": "STRING"},
+    {"name": "protocol", "type": "STRING"},
+    {"name": "agent", "type": "STRING"},
+    {"name": "referer", "type": "STRING"},
+    {"name": "remote", "type": "RECORD", "fields": [
+      {"name": "host", "type": "STRING"},
+      {"name": "ip", "type": "STRING"},
+      {"name": "user", "type": "STRING"}
+    ]},
+    {"name": "requesttime", "type": "FLOAT"},
+    {"name": "bot_access", "type": "BOOLEAN"},
+    {"name": "loginsession", "type": "BOOLEAN"}
+  ]
 </match>
 ```
 
@@ -459,10 +501,9 @@ The second method is to specify a path to a BigQuery schema file instead of list
   time_field  time
 
   schema_path /path/to/httpd.schema
-  field_integer time
 </match>
 ```
-where /path/to/httpd.schema is a path to the JSON-encoded schema file which you used for creating the table on BigQuery.
+where /path/to/httpd.schema is a path to the JSON-encoded schema file which you used for creating the table on BigQuery. By using external schema file you are able to write full schema that does support NULLABLE/REQUIRED/REPEATED, this feature is really useful and adds full flexbility.
 
 The third method is to set `fetch_schema` to `true` to enable fetch a schema using BigQuery API. In this case, your fluent.conf looks like:
 
@@ -477,7 +518,6 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
 
   fetch_schema true
   # fetch_schema_table other_table # if you want to fetch schema from other table
-  field_integer time
 </match>
 ```
 
@@ -498,17 +538,14 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
   ...
 
   insert_id_field uuid
-
+  schema [{"name": "uuid", "type": "STRING"}]
 </match>
 ```
 
 ## TODO
 
-* support optional data fields
-* support NULLABLE/REQUIRED/REPEATED field options in field list style of configuration
 * OAuth installed application credentials support
 * Google API discovery expiration
-* Error classes
 * check row size limits
 
 ## Authors
data/lib/fluent/plugin/bigquery/schema.rb
CHANGED
@@ -1,3 +1,5 @@
+require 'multi_json'
+
 module Fluent
   module BigQuery
     class FieldSchema
@@ -56,7 +58,11 @@ module Fluent
       end
 
       def format_one(value)
-        value.
+        if value.is_a?(Hash) || value.is_a?(Array)
+          MultiJson.dump(value)
+        else
+          value.to_s
+        end
       end
     end
 
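With this change a `string` column no longer needs `convert_hash_to_json`: the field schema itself serializes Hash and Array values to JSON. A minimal standalone sketch of that coercion (plain Ruby, not the plugin class; the helper name is made up for illustration):

```ruby
require 'multi_json'

# Sketch of the coercion the new StringFieldSchema#format_one applies:
# Hash/Array values become JSON strings, everything else falls back to to_s.
def format_string_field(value)
  if value.is_a?(Hash) || value.is_a?(Array)
    MultiJson.dump(value)
  else
    value.to_s
  end
end

format_string_field({"user" => "joker1007", "uid" => 10000})  # => '{"user":"joker1007","uid":10000}'
format_string_field(["tty1", "tty2", "tty3"])                 # => '["tty1","tty2","tty3"]'
format_string_field(42)                                        # => "42"
```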
@@ -116,6 +122,48 @@ module Fluent
         end
       end
 
+      class DateFieldSchema < FieldSchema
+        def type
+          :date
+        end
+
+        def format_one(value)
+          if value.respond_to?(:strftime)
+            value.strftime("%Y-%m-%d")
+          else
+            value
+          end
+        end
+      end
+
+      class DateTimeFieldSchema < FieldSchema
+        def type
+          :datetime
+        end
+
+        def format_one(value)
+          if value.respond_to?(:strftime)
+            value.strftime("%Y-%m-%dT%H:%M:%S.%6L")
+          else
+            value
+          end
+        end
+      end
+
+      class TimeFieldSchema < FieldSchema
+        def type
+          :time
+        end
+
+        def format_one(value)
+          if value.respond_to?(:strftime)
+            value.strftime("%H:%M:%S.%6L")
+          else
+            value
+          end
+        end
+      end
+
       class RecordSchema < FieldSchema
         FIELD_TYPES = {
           string: StringFieldSchema,
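The three new field classes add support for BigQuery's DATE, DATETIME and TIME column types: when a record value responds to `strftime` it is rendered with the patterns shown above, otherwise it is passed through unchanged. A small illustrative sketch (sample values are made up; only the format strings come from the diff):

```ruby
# Illustrative only: the strftime patterns used by the new DATE/DATETIME/TIME schemas.
t = Time.utc(2017, 1, 30, 12, 34, 56)

t.strftime("%Y-%m-%d")               # DATE,     e.g. "2017-01-30"
t.strftime("%Y-%m-%dT%H:%M:%S.%6L")  # DATETIME, e.g. "2017-01-30T12:34:56.000000"
t.strftime("%H:%M:%S.%6L")           # TIME,     e.g. "12:34:56.000000"

# A value that is already a String (e.g. "2017-01-30") does not respond to
# strftime and would be forwarded as-is.
```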
@@ -123,6 +171,9 @@ module Fluent
           float: FloatFieldSchema,
           boolean: BooleanFieldSchema,
           timestamp: TimestampFieldSchema,
+          date: DateFieldSchema,
+          datetime: DateTimeFieldSchema,
+          time: TimeFieldSchema,
           record: RecordSchema
         }.freeze
 
data/lib/fluent/plugin/bigquery/writer.rb
CHANGED
@@ -6,6 +6,7 @@ module Fluent
        @scope = "https://www.googleapis.com/auth/bigquery"
        @auth_options = auth_options
        @log = log
+       @num_errors_per_chunk = {}
 
        @cached_client_expiration = Time.now + 1800
      end
@@ -104,7 +105,7 @@ module Fluent
        raise Fluent::BigQuery::Error.wrap(e)
      end
 
-     def create_load_job(project, dataset, table_id, upload_source,
+     def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields, prevent_duplicate_load: false, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60, auto_create_table: nil, time_partitioning_type: nil, time_partitioning_expiration: nil)
        configuration = {
          configuration: {
            load: {
@@ -123,6 +124,8 @@ module Fluent
            }
          }
        }
+
+       job_id = create_job_id(chunk_id, dataset, table_id, fields.to_a, max_bad_records, ignore_unknown_values) if prevent_duplicate_load
        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if time_partitioning_type
        configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
@@ -148,7 +151,8 @@ module Fluent
            }
          }
        )
-       wait_load_job(project, dataset, res.job_reference.job_id, table_id)
+       wait_load_job(chunk_id, project, dataset, res.job_reference.job_id, table_id)
+       @num_errors_per_chunk.delete(chunk_id)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        @client = nil
 
@@ -161,12 +165,16 @@ module Fluent
          raise "table created. send rows next time."
        end
 
-
+       if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+         wait_load_job(chunk_id, project, dataset, job_id, table_id)
+         @num_errors_per_chunk.delete(chunk_id)
+         return
+       end
 
        raise Fluent::BigQuery::Error.wrap(e)
      end
 
-     def wait_load_job(project, dataset, job_id, table_id)
+     def wait_load_job(chunk_id, project, dataset, job_id, table_id)
        wait_interval = 10
        _response = client.get_job(project, job_id)
 
@@ -186,9 +194,11 @@ module Fluent
        error_result = _response.status.error_result
        if error_result
          log.error "job.insert API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
-         if
+         if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason)
+           @num_errors_per_chunk[chunk_id] = @num_errors_per_chunk[chunk_id].to_i + 1
            raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry")
          else
+           @num_errors_per_chunk.delete(chunk_id)
            raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry")
          end
        end
@@ -259,6 +269,12 @@ module Fluent
      def safe_table_id(table_id)
        table_id.gsub(/\$\d+$/, "")
      end
+
+     def create_job_id(chunk_id, dataset, table, schema, max_bad_records, ignore_unknown_values)
+       job_id_key = "#{chunk_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}#{@num_errors_per_chunk[chunk_id]}"
+       @log.debug "job_id_key: #{job_id_key}"
+       "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
+     end
    end
  end
 end
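The job-id generation for `prevent_duplicate_load` moves into the writer and now mixes the per-chunk error count into the SHA1 key, so an identical retry of the same chunk reuses the same BigQuery job id, while a retry after a recorded retryable failure gets a fresh one. A standalone sketch of that key construction (all values below are hypothetical):

```ruby
require 'digest/sha1'

# Hypothetical inputs; in the plugin they come from the buffer chunk and config.
chunk_id   = "\x01\x02fake-chunk-id"
dataset    = "yourdataset_id"
table      = "foo"
schema     = [{name: "time", type: "INTEGER"}]
num_errors = {} # stands in for @num_errors_per_chunk

job_id = lambda do
  key = "#{chunk_id}#{dataset}#{table}#{schema.to_s}0false#{num_errors[chunk_id]}"
  "fluentd_job_" + Digest::SHA1.hexdigest(key)
end

first_attempt = job_id.call
num_errors[chunk_id] = 1          # a retryable load error was recorded
second_attempt = job_id.call

first_attempt == second_attempt   # => false: the retried chunk gets a new job id
```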
data/lib/fluent/plugin/out_bigquery.rb
CHANGED
@@ -87,6 +87,7 @@ module Fluent
     # Default is false, which treats unknown values as errors.
     config_param :ignore_unknown_values, :bool, default: false
 
+    config_param :schema, :array, default: nil
     config_param :schema_path, :string, default: nil
     config_param :fetch_schema, :bool, default: false
     config_param :fetch_schema_table, :string, default: nil
@@ -213,7 +214,11 @@ module Fluent
 
      @tablelist = @tables ? @tables.split(',') : [@table]
 
+     legacy_schema_config_deprecation
      @fields = Fluent::BigQuery::RecordSchema.new('record')
+     if @schema
+       @fields.load_schema(@schema)
+     end
      if @schema_path
        @fields.load_schema(MultiJson.load(File.read(@schema_path)))
      end
@@ -259,6 +264,8 @@ module Fluent
      else
        @get_insert_id = nil
      end
+
+     warn "[DEPRECATION] `convert_hash_to_json` param is deprecated. If Hash value is inserted string field, plugin convert it to json automatically." if @convert_hash_to_json
    end
 
    def start
@@ -329,6 +336,12 @@ module Fluent
      record
    end
 
+   def legacy_schema_config_deprecation
+     if [@field_string, @field_integer, @field_float, @field_boolean, @field_timestamp].any?
+       warn "[DEPRECATION] `field_*` style schema config is deprecated. Instead of it, use `schema` config params that is array of json style."
+     end
+   end
+
    def write(chunk)
      table_id_format = @tables_mutex.synchronize do
        t = @tables_queue.shift
@@ -455,14 +468,9 @@ module Fluent
    def load(chunk, table_id)
      res = nil
 
-     if @prevent_duplicate_load
-       job_id = create_job_id(chunk, @dataset, table_id, @fields.to_a, @max_bad_records, @ignore_unknown_values)
-     else
-       job_id = nil
-     end
-
      create_upload_source(chunk) do |upload_source|
-       res = writer.create_load_job(@project, @dataset, table_id, upload_source,
+       res = writer.create_load_job(chunk.unique_id, @project, @dataset, table_id, upload_source, @fields, {
+         prevent_duplicate_load: @prevent_duplicate_load,
          ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
          timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec, auto_create_table: @auto_create_table,
          time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration
@@ -494,10 +502,6 @@ module Fluent
        end
      end
    end
-
-   def create_job_id(chunk, dataset, table, schema, max_bad_records, ignore_unknown_values)
-     "fluentd_job_" + Digest::SHA1.hexdigest("#{chunk.unique_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
-   end
   end
  end
 end
data/test/plugin/test_out_bigquery.rb
CHANGED
@@ -22,16 +22,31 @@ class BigQueryOutputTest < Test::Unit::TestCase
     time_format %s
     time_field  time
 
-
-
-
-
+    schema [
+      {"name": "time", "type": "INTEGER"},
+      {"name": "status", "type": "INTEGER"},
+      {"name": "bytes", "type": "INTEGER"},
+      {"name": "vhost", "type": "STRING"},
+      {"name": "path", "type": "STRING"},
+      {"name": "method", "type": "STRING"},
+      {"name": "protocol", "type": "STRING"},
+      {"name": "agent", "type": "STRING"},
+      {"name": "referer", "type": "STRING"},
+      {"name": "remote", "type": "RECORD", "fields": [
+        {"name": "host", "type": "STRING"},
+        {"name": "ip", "type": "STRING"},
+        {"name": "user", "type": "STRING"}
+      ]},
+      {"name": "requesttime", "type": "FLOAT"},
+      {"name": "bot_access", "type": "BOOLEAN"},
+      {"name": "loginsession", "type": "BOOLEAN"}
+    ]
   ]
 
   API_SCOPE = "https://www.googleapis.com/auth/bigquery"
 
   def create_driver(conf = CONFIG)
-    Fluent::Test::TimeSlicedOutputTestDriver.new(Fluent::BigQueryOutput).configure(conf)
+    Fluent::Test::TimeSlicedOutputTestDriver.new(Fluent::BigQueryOutput).configure(conf, true)
   end
 
   def stub_writer(driver)
@@ -91,7 +106,11 @@ class BigQueryOutputTest < Test::Unit::TestCase
       auth_method compute_engine
       project yourproject_id
       dataset yourdataset_id
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
     ])
     mock.proxy(Fluent::BigQuery::Writer).new(duck_type(:info, :error, :warn), driver.instance.auth_method, is_a(Hash))
     driver.instance.writer
@@ -114,7 +133,11 @@ class BigQueryOutputTest < Test::Unit::TestCase
       json_key #{json_key_path}
       project yourproject_id
       dataset yourdataset_id
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
     ])
     mock.proxy(Fluent::BigQuery::Writer).new(duck_type(:info, :error, :warn), driver.instance.auth_method, is_a(Hash))
     driver.instance.writer
@@ -134,7 +157,11 @@ class BigQueryOutputTest < Test::Unit::TestCase
       json_key #{json_key_path}
       project yourproject_id
       dataset yourdataset_id
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
     ])
     assert_raises(Errno::EACCES) do
       driver.instance.writer.client
@@ -147,9 +174,8 @@ class BigQueryOutputTest < Test::Unit::TestCase
   def test_configure_auth_json_key_as_string
     json_key = '{"private_key": "X", "client_email": "' + 'x' * 255 + '@developer.gserviceaccount.com"}'
     json_key_io = StringIO.new(json_key)
-    mock(StringIO).new(json_key) { json_key_io }
     authorization = Object.new
-    mock(Google::Auth::ServiceAccountCredentials).make_creds(json_key_io: json_key_io, scope: API_SCOPE) { authorization }
+    mock(Google::Auth::ServiceAccountCredentials).make_creds(json_key_io: satisfy {|arg| JSON.parse(arg.read) == JSON.parse(json_key_io.read) }, scope: API_SCOPE) { authorization }
 
     mock.proxy(Google::Apis::BigqueryV2::BigqueryService).new.with_any_args do |cl|
       mock(cl).__send__(:authorization=, authorization) {}
@@ -162,7 +188,11 @@ class BigQueryOutputTest < Test::Unit::TestCase
       json_key #{json_key}
       project yourproject_id
       dataset yourdataset_id
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
     ])
     mock.proxy(Fluent::BigQuery::Writer).new(duck_type(:info, :error, :warn), driver.instance.auth_method, is_a(Hash))
     driver.instance.writer
@@ -183,7 +213,11 @@ class BigQueryOutputTest < Test::Unit::TestCase
       auth_method application_default
       project yourproject_id
       dataset yourdataset_id
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
     ])
 
     mock.proxy(Fluent::BigQuery::Writer).new(duck_type(:info, :error, :warn), driver.instance.auth_method, is_a(Hash))
@@ -191,186 +225,6 @@ class BigQueryOutputTest < Test::Unit::TestCase
     assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
   end
 
-  def test_configure_fieldname_stripped
-    driver = create_driver(%[
-      table foo
-      email foo@bar.example
-      private_key_path /path/to/key
-      project yourproject_id
-      dataset yourdataset_id
-
-      time_format %s
-      time_field  time
-
-      field_integer time , status , bytes
-      field_string  _log_name, vhost, path, method, protocol, agent, referer, remote.host, remote.ip, remote.user
-      field_float   requesttime
-      field_boolean bot_access , loginsession
-    ])
-    fields = driver.instance.instance_eval{ @fields }
-
-    assert (not fields['time ']), "tailing spaces must be stripped"
-    assert fields['time']
-    assert fields['status']
-    assert fields['bytes']
-    assert fields['_log_name']
-    assert fields['vhost']
-    assert fields['protocol']
-    assert fields['agent']
-    assert fields['referer']
-    assert fields['remote']['host']
-    assert fields['remote']['ip']
-    assert fields['remote']['user']
-    assert fields['requesttime']
-    assert fields['bot_access']
-    assert fields['loginsession']
-  end
-
-  def test_configure_invalid_fieldname
-    base = %[
-      table foo
-      email foo@bar.example
-      private_key_path /path/to/key
-      project yourproject_id
-      dataset yourdataset_id
-
-      time_format %s
-      time_field  time
-    ]
-
-    assert_raises(Fluent::ConfigError) do
-      create_driver(base + "field_integer time field\n")
-    end
-    assert_raises(Fluent::ConfigError) do
-      create_driver(base + "field_string my name\n")
-    end
-    assert_raises(Fluent::ConfigError) do
-      create_driver(base + "field_string remote.host name\n")
-    end
-    assert_raises(Fluent::ConfigError) do
-      create_driver(base + "field_string 1column\n")
-    end
-    assert_raises(Fluent::ConfigError) do
-      create_driver(base + "field_string #{'tenstrings' * 12 + '123456789'}\n")
-    end
-    assert_raises(Fluent::ConfigError) do
-      create_driver(base + "field_float request time\n")
-    end
-    assert_raises(Fluent::ConfigError) do
-      create_driver(base + "field_boolean login session\n")
-    end
-  end
-
-  def test_format_stream
-    now = Time.now
-    input = [
-      now,
-      {
-        "status" => "1",
-        "bytes" => 3.0,
-        "vhost" => :bar,
-        "path" => "/path/to/baz",
-        "method" => "GET",
-        "protocol" => "HTTP/0.9",
-        "agent" => "libwww",
-        "referer" => "http://referer.example",
-        "requesttime" => (now - 1).to_f.to_s,
-        "bot_access" => true,
-        "loginsession" => false,
-        "something-else" => "would be ignored",
-        "yet-another" => {
-          "foo" => "bar",
-          "baz" => 1,
-        },
-        "remote" => {
-          "host" => "remote.example",
-          "ip" => "192.0.2.1",
-          "port" => 12345,
-          "user" => "tagomoris",
-        }
-      }
-    ]
-    expected = {
-      "json" => {
-        "time" => now.to_i,
-        "status" => 1,
-        "bytes" => 3,
-        "vhost" => "bar",
-        "path" => "/path/to/baz",
-        "method" => "GET",
-        "protocol" => "HTTP/0.9",
-        "agent" => "libwww",
-        "referer" => "http://referer.example",
-        "requesttime" => (now - 1).to_f.to_s.to_f,
-        "bot_access" => true,
-        "loginsession" => false,
-        "something-else" => "would be ignored",
-        "yet-another" => {
-          "foo" => "bar",
-          "baz" => 1,
-        },
-        "remote" => {
-          "host" => "remote.example",
-          "ip" => "192.0.2.1",
-          "port" => 12345,
-          "user" => "tagomoris",
-        }
-      }
-    }
-
-    driver = create_driver(CONFIG)
-    driver.instance.start
-    buf = driver.instance.format_stream("my.tag", [input])
-    driver.instance.shutdown
-
-    assert_equal expected, MessagePack.unpack(buf)
-  end
-
-  [
-    # <time_format>, <time field type>, <time expectation generator>, <assertion>
-    [
-      "%s.%6N", "field_float",
-      lambda{|t| t.strftime("%s.%6N").to_f },
-      lambda{|recv, expected, actual|
-        recv.assert_in_delta(expected, actual, Float::EPSILON / 10**3)
-      }
-    ],
-    [
-      "%Y-%m-%dT%H:%M:%SZ", "field_string",
-      lambda{|t| t.iso8601 },
-      :assert_equal.to_proc
-    ],
-    [
-      "%a, %d %b %Y %H:%M:%S GMT", "field_string",
-      lambda{|t| t.httpdate },
-      :assert_equal.to_proc
-    ],
-  ].each do |format, type, expect_time, assert|
-    define_method("test_time_formats_#{format}") do
-      now = Time.now.utc
-      input = [ now, {} ]
-      expected = { "json" => { "time" => expect_time[now], } }
-
-      driver = create_driver(<<-CONFIG)
-        table foo
-        email foo@bar.example
-        private_key_path /path/to/key
-        project yourproject_id
-        dataset yourdataset_id
-
-        time_format #{format}
-        time_field  time
-        #{type} time
-      CONFIG
-
-      driver.instance.start
-      buf = driver.instance.format_stream("my.tag", [input])
-      driver.instance.shutdown
-
-      assert[self, expected["json"]["time"], MessagePack.unpack(buf)["json"]["time"]]
-    end
-  end
-
   def test_format_nested_time
     now = Time.now
     input = [
@@ -402,8 +256,13 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_format %s
       time_field  metadata.time
 
-
-
+      schema [
+        {"name": "metadata", "type": "RECORD", "fields": [
+          {"name": "time", "type": "INTEGER"},
+          {"name": "node", "type": "STRING"}
+        ]},
+        {"name": "log", "type": "STRING"}
+      ]
     CONFIG
 
     driver.instance.start
@@ -489,7 +348,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_field  time
 
       schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
-
+      schema [{"name": "time", "type": "INTEGER"}]
     CONFIG
     driver.instance.start
     buf = driver.instance.format_stream("my.tag", [input])
@@ -529,7 +388,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_field  time
 
       schema_path #{File.join(File.dirname(__FILE__), "testdata", "sudo.schema")}
-
+      schema [{"name": "time", "type": "INTEGER"}]
     CONFIG
     driver.instance.start
     buf = driver.instance.format_stream("my.tag", [input])
@@ -569,7 +428,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_field  time
 
       fetch_schema true
-
+      schema [{"name": "time", "type": "INTEGER"}]
     CONFIG
 
     writer = stub_writer(driver)
@@ -635,7 +494,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_field  time
 
       fetch_schema true
-
+      schema [{"name": "time", "type": "INTEGER"}]
     CONFIG
 
     writer = stub_writer(driver)
@@ -693,7 +552,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
       dataset yourdataset_id
 
       insert_id_field uuid
-
+      schema [{"name": "uuid", "type": "STRING"}]
     CONFIG
     driver.instance.start
     buf = driver.instance.format_stream("my.tag", [input])
@@ -729,7 +588,9 @@ class BigQueryOutputTest < Test::Unit::TestCase
       dataset yourdataset_id
 
       insert_id_field data.uuid
-
+      schema [{"name": "data", "type": "RECORD", "fields": [
+        {"name": "uuid", "type": "STRING"}
+      ]}]
     CONFIG
     driver.instance.start
     buf = driver.instance.format_stream("my.tag", [input])
@@ -758,7 +619,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
       project yourproject_id
       dataset yourdataset_id
 
-
+      schema [{"name": "uuid", "type": "STRING"}]
 
       buffer_type memory
     CONFIG
@@ -803,9 +664,13 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_format %s
       time_field  time
 
-
-
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "vhost", "type": "STRING"},
+        {"name": "refere", "type": "STRING"},
+        {"name": "bot_access", "type": "BOOLEAN"},
+        {"name": "login_session", "type": "BOOLEAN"}
+      ]
     CONFIG
     driver.instance.start
     buf = driver.instance.format_stream("my.tag", [input])
@@ -854,9 +719,13 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_format %s
       time_field  time
 
-
-
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "vhost", "type": "STRING"},
+        {"name": "refere", "type": "STRING"},
+        {"name": "bot_access", "type": "BOOLEAN"},
+        {"name": "loginsession", "type": "BOOLEAN"}
+      ]
     CONFIG
     driver.instance.start
     buf = driver.instance.format_stream("my.tag", [input])
@@ -906,10 +775,25 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_format %s
       time_field  time
 
-
-
-
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"},
+        {"name": "vhost", "type": "STRING"},
+        {"name": "path", "type": "STRING"},
+        {"name": "method", "type": "STRING"},
+        {"name": "protocol", "type": "STRING"},
+        {"name": "agent", "type": "STRING"},
+        {"name": "referer", "type": "STRING"},
+        {"name": "remote", "type": "RECORD", "fields": [
+          {"name": "host", "type": "STRING"},
+          {"name": "ip", "type": "STRING"},
+          {"name": "user", "type": "STRING"}
+        ]},
+        {"name": "requesttime", "type": "FLOAT"},
+        {"name": "bot_access", "type": "BOOLEAN"},
+        {"name": "loginsession", "type": "BOOLEAN"}
+      ]
     <secondary>
       type file
       path error
@@ -951,10 +835,25 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_format %s
       time_field  time
 
-
-
-
-
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"},
+        {"name": "vhost", "type": "STRING"},
+        {"name": "path", "type": "STRING"},
+        {"name": "method", "type": "STRING"},
+        {"name": "protocol", "type": "STRING"},
+        {"name": "agent", "type": "STRING"},
+        {"name": "referer", "type": "STRING"},
+        {"name": "remote", "type": "RECORD", "fields": [
+          {"name": "host", "type": "STRING"},
+          {"name": "ip", "type": "STRING"},
+          {"name": "user", "type": "STRING"}
+        ]},
+        {"name": "requesttime", "type": "FLOAT"},
+        {"name": "bot_access", "type": "BOOLEAN"},
+        {"name": "loginsession", "type": "BOOLEAN"}
+      ]
     <secondary>
       type file
      path error
@@ -1002,20 +901,16 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_field  time
 
       schema_path #{schema_path}
-      field_integer time
 
       buffer_type memory
     CONFIG
-    schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys)
-      h[0][:type] = "INTEGER"
-      h[0][:mode] = "NULLABLE"
-    end
+    schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys)
 
     writer = stub_writer(driver)
     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     io = StringIO.new("hello")
     mock(driver.instance).create_upload_source(chunk).yields(io)
-    mock(writer).wait_load_job("yourproject_id", "yourdataset_id", "dummy_job_id", "foo") { nil }
+    mock(writer).wait_load_job(is_a(String), "yourproject_id", "yourdataset_id", "dummy_job_id", "foo") { nil }
     mock(writer.client).insert_job('yourproject_id', {
       configuration: {
         load: {
@@ -1065,22 +960,17 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_field  time
 
       schema_path #{schema_path}
-      field_integer time
       prevent_duplicate_load true
 
       buffer_type memory
     CONFIG
-    schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys)
-      h[0][:type] = "INTEGER"
-      h[0][:mode] = "NULLABLE"
-    end
+    schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys)
 
     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     io = StringIO.new("hello")
     mock(driver.instance).create_upload_source(chunk).yields(io)
-    mock.proxy(driver.instance).create_job_id(duck_type(:unique_id), "yourdataset_id", "foo", driver.instance.instance_variable_get(:@fields).to_a, 0, false)
     writer = stub_writer(driver)
-    mock(writer).wait_load_job("yourproject_id", "yourdataset_id", "dummy_job_id", "foo") { nil }
+    mock(writer).wait_load_job(is_a(String), "yourproject_id", "yourdataset_id", "dummy_job_id", "foo") { nil }
     mock(writer.client).insert_job('yourproject_id', {
       configuration: {
         load: {
@@ -1131,14 +1021,10 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_field  time
 
      schema_path #{schema_path}
-      field_integer time
 
       buffer_type memory
     CONFIG
-    schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys)
-      h[0][:type] = "INTEGER"
-      h[0][:mode] = "NULLABLE"
-    end
+    schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys)
 
     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     io = StringIO.new("hello")
@@ -1209,7 +1095,6 @@ class BigQueryOutputTest < Test::Unit::TestCase
       time_field  time
 
       schema_path #{schema_path}
-      field_integer time
 
       buffer_type memory
     <secondary>
@@ -1218,10 +1103,7 @@ class BigQueryOutputTest < Test::Unit::TestCase
       utc
     </secondary>
     CONFIG
-    schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys)
-      h[0][:type] = "INTEGER"
-      h[0][:mode] = "NULLABLE"
-    end
+    schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys)
 
     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     io = StringIO.new("hello")
data/test/plugin/test_record_schema.rb
CHANGED
@@ -154,6 +154,23 @@ class RecordSchemaTest < Test::Unit::TestCase
     )
   end
 
+  def test_format_one_convert_array_or_hash_to_json
+    fields = Fluent::BigQuery::RecordSchema.new("record")
+    fields.load_schema(base_schema, false)
+
+    time = Time.local(2016, 2, 7, 19, 0, 0).utc
+
+    formatted = fields.format_one({
+      "time" => time, "tty" => ["tty1", "tty2", "tty3"], "pwd" => "/home", "user" => {name: "joker1007", uid: 10000}, "argv" => ["foo", 42]
+    })
+    assert_equal(
+      formatted,
+      {
+        "time" => time.strftime("%Y-%m-%d %H:%M:%S.%6L %:z"), "tty" => MultiJson.dump(["tty1", "tty2", "tty3"]), "pwd" => "/home", "user" => MultiJson.dump({name: "joker1007", uid: 10000}), "argv" => ["foo", "42"]
+      }
+    )
+  end
+
   def test_format_one_with_extra_column
     fields = Fluent::BigQuery::RecordSchema.new("record")
     fields.load_schema(base_schema, false)
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.4.0
 platform: ruby
 authors:
 - Naoya Ito
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2017-01-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake