fluent-plugin-bigquery 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +41 -39
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +21 -6
- data/lib/fluent/plugin/out_bigquery.rb +7 -2
- data/test/plugin/test_out_bigquery.rb +63 -1
- metadata +2 -2
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0f23c20f0c7feb488dc105d5b4ada3dcc27a4a3
+  data.tar.gz: 41497e9b9fa0e5b518f63dc4c664ae67b17f763f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a62ad2c42da6fabfc422457e06a40095957abc1b740cf3fa99337d2f03802b92dd79e314f60ebef1911ed4ba09ff972bf5adceccafc701cbd4d0d7e9411387e3
+  data.tar.gz: 6d399567ec70f674f9ce8725d16561f5362f38c64a5267ec4e519e72f40431b65852dfb03a2486a52bf1594b97615b94063e0cf101177be1da37544be823edf2
data/README.md CHANGED

@@ -10,7 +10,7 @@
 * load data
   * for data loading as batch jobs, for big amount of data
   * https://developers.google.com/bigquery/loading-data-into-bigquery
-
+
 Current version of this plugin supports Google API with Service Account Authentication, but does not support
 OAuth flow for installed applications.
 

@@ -57,6 +57,8 @@ OAuth flow for installed applications.
 | insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
 | request_timeout_sec | integer | no | nil | Bigquery API response timeout |
 | request_open_timeout_sec | integer | no | 60 | Bigquery API connection, and request timeout. If you send big data to Bigquery, set large value. |
+| time_partitioning_type | enum | no (either day) | nil | Type of bigquery time partitioning feature(experimental feature on BigQuery). |
+| time_partitioning_expiration | time | no | nil | Expiration milliseconds for bigquery time partitioning. (experimental feature on BigQuery) |
 
 ### Standard Options
 
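A note on the two new rows above: they are the only new user-facing options in this release, and the new test case later in this diff exercises them with the values below. They are shown here as a Ruby heredoc, the way the gem's test suite embeds Fluentd config; `partitioning_conf` is just a local name for this sketch.

```ruby
# Illustrative values only; the option names come from the table above, the values
# from the new test case added in this release.
partitioning_conf = <<-CONFIG
  auto_create_table true
  time_partitioning_type day          # enum: "day" is the only accepted value
  time_partitioning_expiration 1h     # Fluentd time value (1h = 3600 s); sent to BigQuery as milliseconds
CONFIG
puts partitioning_conf
```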
@@ -76,21 +78,21 @@ Configure insert specifications with target table schema, with your credentials.
 ```apache
 <match dummy>
   @type bigquery
-
+
   method insert # default
-
+
   auth_method private_key # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
   # private_key_passphrase notasecret # default
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime

@@ -103,28 +105,28 @@ For high rate inserts over streaming inserts, you should specify flush intervals
 ```apache
 <match dummy>
   @type bigquery
-
+
   method insert # default
-
+
   flush_interval 1 # flush as frequent as possible
-
+
   buffer_chunk_records_limit 300 # default rate limit for users is 100
   buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
-
+
   num_threads 16
-
+
   auth_method private_key # default
   email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
   private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
   # private_key_passphrase notasecret # default
-
+
   project yourproject_id
   dataset yourdataset_id
   tables accesslog1,accesslog2,accesslog3
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -214,10 +216,10 @@ download its JSON key and deploy the key with fluentd.
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method json_key
   json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename

@@ -231,10 +233,10 @@ You need to only include `private_key` and `client_email` key from JSON key file
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method json_key
   json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
@@ -252,16 +254,16 @@ Compute Engine instance, then you can configure fluentd like this.
 ```apache
 <match dummy>
   @type bigquery
-
+
   auth_method compute_engine
-
+
   project yourproject_id
   dataset yourdataset_id
   table tablename
-
+
   time_format %s
   time_field time
-
+
   field_integer time,status,bytes
   field_string rhost,vhost,path,method,protocol,agent,referer
   field_float requesttime
@@ -296,13 +298,13 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   project yourproject_id
   dataset yourdataset_id
   table accesslog_%Y_%m
-
+
   ...
 </match>
 ```
@@ -384,12 +386,12 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   auto_create_table true
   table accesslog_%Y_%m
-
+
   ...
 </match>
 ```
@@ -408,12 +410,12 @@ you can also specify nested fields by prefixing their belonging record fields.
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   field_integer time,response.status,response.bytes
   field_string request.vhost,request.path,request.method,request.protocol,request.agent,request.referer,remote.host,remote.ip,remote.user
   field_float request.time
@@ -447,12 +449,12 @@ The second method is to specify a path to a BigQuery schema file instead of list
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   schema_path /path/to/httpd.schema
   field_integer time
 </match>
@@ -464,12 +466,12 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   time_format %s
   time_field time
-
+
   fetch_schema true
   # fetch_schema_table other_table # if you want to fetch schema from other table
   field_integer time
@@ -489,9 +491,9 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 ```apache
 <match dummy>
   @type bigquery
-
+
   ...
-
+
   insert_id_field uuid
   field_string uuid
 </match>
data/lib/fluent/plugin/bigquery/writer.rb CHANGED

@@ -60,20 +60,28 @@ module Fluent
         @client = client
       end
 
-      def create_table(project, dataset, table_id, record_schema)
+      def create_table(project, dataset, table_id, record_schema, time_partitioning_type: nil, time_partitioning_expiration: nil)
         create_table_retry_limit = 3
         create_table_retry_wait = 1
         create_table_retry_count = 0
 
         begin
-
+          definition = {
             table_reference: {
               table_id: table_id,
             },
             schema: {
               fields: record_schema.to_a,
             }
-          }
+          }
+
+          if time_partitioning_type
+            definition[:time_partitioning] = {
+              type: time_partitioning_type.to_s.upcase,
+              expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
+            }.compact
+          end
+          client.insert_table(project, dataset, definition, {})
           log.debug "create table", project_id: project, dataset: dataset, table: table_id
           @client = nil
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
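To make the new branch above easier to read: the partitioning keywords become a `time_partitioning` entry in the table definition, with the type upcased for the API, the expiration converted from seconds to milliseconds, and `compact` dropping the key when no expiration is configured. A minimal plain-Ruby sketch of just that mapping (not the gem's exact code):

```ruby
# Sketch of the mapping performed in create_table above (plain Ruby >= 2.4 for Hash#compact).
def time_partitioning_definition(time_partitioning_type, time_partitioning_expiration)
  return nil unless time_partitioning_type
  {
    type: time_partitioning_type.to_s.upcase,   # :day -> "DAY"
    expiration_ms: time_partitioning_expiration ? time_partitioning_expiration * 1000 : nil
  }.compact                                     # omit expiration_ms entirely when unset
end

time_partitioning_definition(:day, 3600)  # => {:type=>"DAY", :expiration_ms=>3600000}
time_partitioning_definition(:day, nil)   # => {:type=>"DAY"}
```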
@@ -124,7 +132,7 @@ module Fluent
           options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
         })
         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
-        log.warn "insert errors", insert_errors: res.insert_errors if res.insert_errors && !res.insert_errors.empty?
+        log.warn "insert errors", insert_errors: res.insert_errors.to_s if res.insert_errors && !res.insert_errors.empty?
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         @client = nil
 
@@ -138,7 +146,7 @@ module Fluent
         end
       end
 
-      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60)
+      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60, auto_create_table: nil, time_partitioning_type: nil, time_partitioning_expiration: nil)
         configuration = {
           configuration: {
             load: {
@@ -157,6 +165,7 @@ module Fluent
             }
           }
         }
+        configuration[:configuration][:load].merge!(create_disposition: "CREATE_NEVER") if time_partitioning_type
         configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
 
         # If target table is already exist, omit schema configuration.
@@ -188,7 +197,13 @@ module Fluent
         reason = e.respond_to?(:reason) ? e.reason : nil
         log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
 
-
+        if auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+          # Table Not Found: Auto Create Table
+          create_table(project, dataset, table_id, fields, time_partitioning_type: time_partitioning_type, time_partitioning_expiration: time_partitioning_expiration)
+          raise "table created. send rows next time."
+        end
+
+        return wait_load_job(project, dataset, job_id, table_id) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
 
         if RETRYABLE_ERROR_REASON.include?(reason) || e.is_a?(Google::Apis::ServerError)
           raise RetryableError.new(nil, e)
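Taken together with the `create_disposition: "CREATE_NEVER"` line added earlier, the branch above gives load jobs the same auto-create behavior the insert path already had: the load job is not allowed to create a partitioned table itself, so a missing destination comes back as a 404, the table is created with the configured partitioning, and the method raises so Fluentd retries the chunk once the table exists. A self-contained sketch of that create-then-retry pattern (names like `load_rows` and `TableNotFound` are hypothetical, not the gem's API):

```ruby
# Hypothetical, self-contained illustration of the rescue flow above.
TableNotFound = Class.new(StandardError)  # stands in for the 404 returned by the API client

def load_rows(rows, table_exists:, auto_create_table:)
  raise TableNotFound, "Not Found: Table" unless table_exists
  "loaded #{rows.size} rows"
rescue TableNotFound
  raise unless auto_create_table
  # in the plugin, create_table(..., time_partitioning_type:, time_partitioning_expiration:) runs here
  raise "table created. send rows next time."   # Fluentd keeps the chunk and retries later
end

load_rows([{ n: 1 }], table_exists: true, auto_create_table: true)   # => "loaded 1 rows"

begin
  load_rows([{ n: 1 }], table_exists: false, auto_create_table: true)
rescue RuntimeError => e
  e.message   # => "table created. send rows next time."
end
```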
data/lib/fluent/plugin/out_bigquery.rb CHANGED

@@ -143,6 +143,10 @@ module Fluent
       config_param :request_timeout_sec, :time, default: nil
       config_param :request_open_timeout_sec, :time, default: 60
 
+      ## Partitioning
+      config_param :time_partitioning_type, :enum, list: [:day], default: nil
+      config_param :time_partitioning_expiration, :time, default: nil
+
       ### Table types
       # https://developers.google.com/bigquery/docs/tables
       #
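These two `config_param` declarations back the README rows shown earlier: `:enum` with `list: [:day]` accepts only `day`, and the `:time` type hands the writer a number of seconds. A plain-Ruby restatement of the values the new test case (further down) ends up asserting; the actual parsing is done by Fluentd's config types, not by this sketch:

```ruby
# Illustrative restatement of "time_partitioning_type day" / "time_partitioning_expiration 1h".
time_partitioning_type       = :day   # the only value permitted by list: [:day]
time_partitioning_expiration = 3600   # "1h" parsed into seconds by the :time type

# writer.rb (above) converts seconds into the milliseconds BigQuery expects:
time_partitioning_expiration * 1000   # => 3600000
```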
@@ -413,7 +417,7 @@ module Fluent
       rescue Fluent::BigQuery::Writer::Error => e
         if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
           # Table Not Found: Auto Create Table
-          writer.create_table(@project, @dataset, table_id, @fields)
+          writer.create_table(@project, @dataset, table_id, @fields, time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration)
           raise "table created. send rows next time."
         end
 
@@ -459,7 +463,8 @@ module Fluent
         create_upload_source(chunk) do |upload_source|
           res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
             ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
-            timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+            timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec, auto_create_table: @auto_create_table,
+            time_partitioning_type: @time_partitioning_type, time_partitioning_expiration: @time_partitioning_expiration
           })
         end
       rescue Fluent::BigQuery::Writer::Error => e
data/test/plugin/test_out_bigquery.rb CHANGED

@@ -1428,7 +1428,69 @@ class BigQueryOutputTest < Test::Unit::TestCase
       skip_invalid_rows: false,
       ignore_unknown_values: false,
     )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
-    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields))
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: nil, time_partitioning_expiration: nil)
+
+    chunk = Fluent::MemoryBufferChunk.new("my.tag")
+    chunk << message.to_msgpack
+
+    driver.instance.start
+
+    assert_raise(RuntimeError) {
+      driver.instance.write(chunk)
+    }
+    driver.instance.shutdown
+  end
+
+  def test_auto_create_partitioned_table_by_bigquery_api
+    now = Time.now
+    message = {
+      "json" => {
+        "time" => now.to_i,
+        "request" => {
+          "vhost" => "bar",
+          "path" => "/path/to/baz",
+          "method" => "GET",
+          "protocol" => "HTTP/1.0",
+          "agent" => "libwww",
+          "referer" => "http://referer.example",
+          "time" => (now - 1).to_f,
+          "bot_access" => true,
+          "loginsession" => false,
+        },
+        "remote" => {
+          "host" => "remote.example",
+          "ip" => "192.168.1.1",
+          "user" => "nagachika",
+        },
+        "response" => {
+          "status" => 200,
+          "bytes" => 72,
+        },
+      }
+    }.deep_symbolize_keys
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      time_format %s
+      time_field time
+
+      auto_create_table true
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+
+      time_partitioning_type day
+      time_partitioning_expiration 1h
+    CONFIG
+    writer = stub_writer(driver)
+    mock(writer).insert_rows('yourproject_id', 'yourdataset_id', 'foo', [message], hash_including(
+      skip_invalid_rows: false,
+      ignore_unknown_values: false,
+    )) { raise Fluent::BigQuery::Writer::RetryableError.new(nil, Google::Apis::ServerError.new("Not found: Table yourproject_id:yourdataset_id.foo", status_code: 404, body: "Not found: Table yourproject_id:yourdataset_id.foo")) }
+    mock(writer).create_table('yourproject_id', 'yourdataset_id', 'foo', driver.instance.instance_variable_get(:@fields), time_partitioning_type: :day, time_partitioning_expiration: 3600)
 
     chunk = Fluent::MemoryBufferChunk.new("my.tag")
     chunk << message.to_msgpack
metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Naoya Ito
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-10-
+date: 2016-10-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake