fluent-plugin-bigquery 1.2.0 → 2.0.0.beta

@@ -0,0 +1,211 @@
+require 'fluent/plugin/output'
+
+require 'fluent/plugin/bigquery/version'
+
+require 'fluent/plugin/bigquery/helper'
+require 'fluent/plugin/bigquery/errors'
+require 'fluent/plugin/bigquery/schema'
+require 'fluent/plugin/bigquery/writer'
+
+require 'multi_json'
+require 'google/apis/bigquery_v2'
+require 'googleauth'
+
+module Fluent
+  module Plugin
+    # This class is an abstract class
+    class BigQueryBaseOutput < Output
+      helpers :inject, :formatter
+
+      # Available methods are:
+      # * private_key -- Use service account credential from pkcs12 private key file
+      # * compute_engine -- Use access token available in instances of ComputeEngine
+      # * json_key -- Use service account credential from JSON key
+      # * application_default -- Use application default credential
+      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+      ### Service Account credential
+      config_param :email, :string, default: nil
+      config_param :private_key_path, :string, default: nil
+      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+      config_param :json_key, default: nil, secret: true
+
+      # see the following as a simple reference
+      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+      config_param :project, :string
+
+      # dataset_name
+      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+      # but it cannot start with a number or underscore, or have spaces.
+      config_param :dataset, :string
+
+      # table_id
+      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+      config_param :table, :string, default: nil
+      config_param :tables, :array, value_type: :string, default: nil
+
+      config_param :auto_create_table, :bool, default: false
+
+      # ignore_unknown_values
+      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+      # Default is false, which treats unknown values as errors.
+      config_param :ignore_unknown_values, :bool, default: false
+
+      config_param :schema, :array, default: nil
+      config_param :schema_path, :string, default: nil
+      config_param :fetch_schema, :bool, default: false
+      config_param :fetch_schema_table, :string, default: nil
+      config_param :schema_cache_expire, :time, default: 600
+
+      ## Timeout
+      # request_timeout_sec
+      # BigQuery API response timeout
+      # request_open_timeout_sec
+      # BigQuery API connection and request timeout
+      config_param :request_timeout_sec, :time, default: nil
+      config_param :request_open_timeout_sec, :time, default: 60
+
+      ## Partitioning
+      config_param :time_partitioning_type, :enum, list: [:day], default: nil
+      config_param :time_partitioning_expiration, :time, default: nil
+
+      ## Formatter
+      config_section :format do
+        config_set_default :@type, 'json'
+      end
+
+      def configure(conf)
+        super
+
+        case @auth_method
+        when :private_key
+          unless @email && @private_key_path
+            raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+          end
+        when :compute_engine
+          # Do nothing
+        when :json_key
+          unless @json_key
+            raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+          end
+        when :application_default
+          # Do nothing
+        else
+          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+        end
+
+        unless @table.nil? ^ @tables.nil?
+          raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+        end
+
+        @tablelist = @tables ? @tables : [@table]
+
+        @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+        if @schema
+          @table_schema.load_schema(@schema)
+        end
+        if @schema_path
+          @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
+        end
+
+        formatter_config = conf.elements("format")[0]
+        @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
+      end
+
+      def start
+        super
+
+        @tables_queue = @tablelist.shuffle
+        @tables_mutex = Mutex.new
+        @fetched_schemas = {}
+        @last_fetch_schema_time = Hash.new(0)
+      end
+
+      def multi_workers_ready?
+        true
+      end
+
+      def writer
+        @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+          email: @email,
+          json_key: @json_key,
+          source_format: @source_format,
+          skip_invalid_rows: @skip_invalid_rows,
+          ignore_unknown_values: @ignore_unknown_values,
+          max_bad_records: @max_bad_records,
+          allow_retry_insert_errors: @allow_retry_insert_errors,
+          prevent_duplicate_load: @prevent_duplicate_load,
+          auto_create_table: @auto_create_table,
+          time_partitioning_type: @time_partitioning_type,
+          time_partitioning_expiration: @time_partitioning_expiration,
+          timeout_sec: @request_timeout_sec,
+          open_timeout_sec: @request_open_timeout_sec,
+        })
+      end
+
+      def format(tag, time, record)
+        record = inject_values_to_record(tag, time, record)
+
+        meta = metadata(tag, time, record)
+        schema =
+          if @fetch_schema
+            fetch_schema(meta)
+          else
+            @table_schema
+          end
+
+        begin
+          row = schema.format(record)
+          return if row.empty?
+          @formatter.format(tag, time, row)
+        rescue
+          log.error("format error", record: record, schema: schema)
+          raise
+        end
+      end
+
+      def write(chunk)
+      end
+
+      def fetch_schema(metadata)
+        table_id = nil
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = fetch_schema_target_table(metadata)
+
+        if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+          schema = writer.fetch_schema(project, dataset, table_id)
+
+          if schema
+            table_schema = Fluent::BigQuery::RecordSchema.new("record")
+            table_schema.load_schema(schema)
+            @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
+          else
+            if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
+              raise "failed to fetch schema from bigquery"
+            else
+              log.warn "#{table_id} uses previous schema"
+            end
+          end
+
+          @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
+        end
+
+        @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
+      end
+
+      def fetch_schema_target_table(metadata)
+        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+      end
+
+      def get_schema(project, dataset, metadata)
+        if @fetch_schema
+          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+        else
+          @table_schema
+        end
+      end
+    end
+  end
+end
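
As a point of reference for the `schema`/`schema_path` options above: the file given by `schema_path` is read with `MultiJson.load` and handed to `RecordSchema#load_schema`, so it is expected to contain a BigQuery-style field definition list. A minimal sketch of such a file is shown below; the field names are purely illustrative and not part of the plugin.

  [
    {"name": "time",   "type": "TIMESTAMP", "mode": "REQUIRED"},
    {"name": "status", "type": "INTEGER",   "mode": "NULLABLE"},
    {"name": "path",   "type": "STRING",    "mode": "NULLABLE"},
    {
      "name": "user",
      "type": "RECORD",
      "fields": [
        {"name": "id",    "type": "STRING"},
        {"name": "agent", "type": "STRING"}
      ]
    }
  ]
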
@@ -0,0 +1,131 @@
+require 'fluent/plugin/out_bigquery_base'
+
+module Fluent
+  module Plugin
+    class BigQueryInsertOutput < BigQueryBaseOutput
+      Fluent::Plugin.register_output('bigquery_insert', self)
+
+      helpers :record_accessor
+
+      # template_suffix (only insert)
+      # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+      config_param :template_suffix, :string, default: nil
+
+      # skip_invalid_rows (only insert)
+      # Insert all valid rows of a request, even if invalid rows exist.
+      # The default value is false, which causes the entire request to fail if any invalid rows exist.
+      config_param :skip_invalid_rows, :bool, default: false
+
+      # insert_id_field (only insert)
+      config_param :insert_id_field, :string, default: nil
+
+      # add_insert_timestamp (only insert)
+      # adds a timestamp just before sending the rows to bigquery, so that
+      # buffering time is not taken into account. Gives a field in bigquery
+      # which represents the insert time of the row.
+      config_param :add_insert_timestamp, :string, default: nil
+
+      # allow_retry_insert_errors (only insert)
+      # If insert_id_field is not specified, setting this to true means retried inserts may produce duplicate rows
+      config_param :allow_retry_insert_errors, :bool, default: false
+
+      ## Buffer
+      config_section :buffer do
+        config_set_default :@type, "memory"
+        config_set_default :flush_mode, :interval
+        config_set_default :flush_interval, 1
+        config_set_default :flush_thread_interval, 0.05
+        config_set_default :flush_thread_burst_interval, 0.05
+        config_set_default :chunk_limit_size, 1 * 1024 ** 2 # 1MB
+        config_set_default :total_limit_size, 1 * 1024 ** 3 # 1GB
+        config_set_default :chunk_limit_records, 500
+      end
+
+      def configure(conf)
+        super
+
+        if @insert_id_field
+          if @insert_id_field !~ /^\$[\[\.]/ && @insert_id_field =~ /\./
+            warn "[BREAKING CHANGE] insert_id_field format is changed. Use fluentd record_accessor helper. (https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor)"
+          end
+          @get_insert_id = record_accessor_create(@insert_id_field)
+        end
+
+        formatter_config = conf.elements("format")[0]
+        if formatter_config && formatter_config['@type'] != "json"
+          raise ConfigError, "`bigquery_insert` supports only json formatter."
+        end
+        @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config)
+
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
+        placeholder_validate!(:bigquery_insert, placeholder_params)
+      end
+
+      # for Fluent::Plugin::Output#implement? method
+      def format(tag, time, record)
+        super
+      end
+
+      def write(chunk)
+        table_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
+        end
+
+        now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
+
+        rows = chunk.open do |io|
+          io.map do |line|
+            record = MultiJson.load(line)
+            record[@add_insert_timestamp] = now if @add_insert_timestamp
+            row = {"json" => record}
+            row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
+            Fluent::BigQuery::Helper.deep_symbolize_keys(row)
+          end
+        end
+
+        metadata = chunk.metadata
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = extract_placeholders(table_format, metadata)
+        template_suffix = @template_suffix ? extract_placeholders(@template_suffix, metadata) : nil
+        schema = get_schema(project, dataset, metadata)
+
+        insert(project, dataset, table_id, rows, schema, template_suffix)
+      end
+
+      def insert(project, dataset, table_id, rows, schema, template_suffix)
+        writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
+      rescue Fluent::BigQuery::Error => e
+        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+          # Table Not Found: Auto Create Table
+          writer.create_table(project, dataset, table_id, schema)
+          raise "table created. send rows next time."
+        end
+
+        raise if e.retryable?
+
+        if @secondary
+          # TODO: find better way
+          @retry = retry_state_create(
+            :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+            forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+            max_interval: @buffer_config.retry_max_interval,
+            secondary: true, secondary_threshold: Float::EPSILON,
+            randomize: @buffer_config.retry_randomize
+          )
+        else
+          @retry = retry_state_create(
+            :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+            forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+            max_interval: @buffer_config.retry_max_interval,
+            randomize: @buffer_config.retry_randomize
+          )
+        end
+
+        raise
+      end
+    end
+  end
+end
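
Since `bigquery_insert` is a regular Fluentd output, it is driven entirely by the parameters declared above. The following is a minimal configuration sketch assuming service-account JSON key auth; the match tag, file paths, project/dataset/table names, and the `$.uuid` insert-id field are placeholders, not values taken from the plugin.

  <match dummy>
    @type bigquery_insert

    auth_method json_key
    # placeholder path to a service account key
    json_key /path/to/your_credential.json

    # placeholder project/dataset/table names
    project yourproject_id
    dataset yourdataset_id
    table   accesslog

    # record_accessor syntax expected since 2.0 (see the configure check above)
    insert_id_field $.uuid

    # BigQuery schema JSON, as sketched after the base class
    schema_path /path/to/schema.json
  </match>
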
@@ -0,0 +1,220 @@
+require 'fluent/plugin/out_bigquery_base'
+
+module Fluent
+  module Plugin
+    class BigQueryLoadOutput < BigQueryBaseOutput
+      Fluent::Plugin.register_output('bigquery_load', self)
+
+      helpers :timer
+
+      config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json
+
+      # max_bad_records (only load)
+      # The maximum number of bad records that BigQuery can ignore when running the job.
+      # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+      # The default value is 0, which requires that all records are valid.
+      config_param :max_bad_records, :integer, default: 0
+
+      # prevent_duplicate_load (only load)
+      config_param :prevent_duplicate_load, :bool, default: false
+
+      config_param :use_delayed_commit, :bool, default: true
+      config_param :wait_job_interval, :time, default: 3
+
+      ## Buffer
+      config_section :buffer do
+        config_set_default :@type, "file"
+        config_set_default :flush_mode, :interval
+        config_set_default :flush_interval, 3600 # 1h
+        config_set_default :flush_thread_interval, 5
+        config_set_default :flush_thread_burst_interval, 5
+        config_set_default :chunk_limit_size, 1 * 1024 ** 3 # 1GB
+        config_set_default :total_limit_size, 32 * 1024 ** 3 # 32GB
+
+        config_set_default :delayed_commit_timeout, 1800 # 30m
+      end
+
+      def configure(conf)
+        super
+
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}"
+        placeholder_validate!(:bigquery_load, placeholder_params)
+      end
+
+      def start
+        super
+
+        if prefer_delayed_commit
+          @polling_targets = []
+          @polling_mutex = Mutex.new
+          log.debug("start load job polling")
+          timer_execute(:polling_bigquery_load_job, @wait_job_interval, &method(:poll))
+        end
+      end
+
+      def prefer_delayed_commit
+        @use_delayed_commit
+      end
+
+      # for Fluent::Plugin::Output#implement? method
+      def format(tag, time, record)
+        super
+      end
+
+      def write(chunk)
+        job_reference = do_write(chunk)
+
+        until response = writer.fetch_load_job(job_reference)
+          sleep @wait_job_interval
+        end
+
+        writer.commit_load_job(job_reference.chunk_id_hex, response)
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        @retry_mutex.synchronize do
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+        end
+
+        raise
+      end
+
+      def try_write(chunk)
+        job_reference = do_write(chunk)
+        @polling_mutex.synchronize do
+          @polling_targets << job_reference
+        end
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        @retry_mutex.synchronize do
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+        end
+
+        raise
+      end
+
+      private
+
+      def do_write(chunk)
+        table_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
+        end
+
+        metadata = chunk.metadata
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = extract_placeholders(table_format, metadata)
+        schema = get_schema(project, dataset, metadata)
+
+        create_upload_source(chunk) do |upload_source|
+          writer.create_load_job(chunk.unique_id, dump_unique_id_hex(chunk.unique_id), project, dataset, table_id, upload_source, schema)
+        end
+      end
+
+      def poll
+        job_reference = @polling_mutex.synchronize do
+          @polling_targets.shift
+        end
+        return unless job_reference
+
+        begin
+          response = writer.fetch_load_job(job_reference)
+          if response
+            writer.commit_load_job(job_reference.chunk_id_hex, response)
+            commit_write(job_reference.chunk_id)
+            log.debug("commit chunk", chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+          else
+            @polling_mutex.synchronize do
+              @polling_targets << job_reference
+            end
+          end
+        rescue Fluent::BigQuery::Error => e
+          # RetryableError comes only from `commit_load_job`
+          # if the error is retryable, take back the chunk and handle it in the next `try_flush`
+          # if the error is not retryable, create a custom retry_state, then take back the chunk for the next `try_flush`
+          if e.retryable?
+            log.warn("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+          else
+            log.error("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+            @retry_mutex.synchronize do
+              if @secondary
+                # TODO: find better way
+                @retry = retry_state_create(
+                  :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                  forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                  max_interval: @buffer_config.retry_max_interval,
+                  secondary: true, secondary_threshold: Float::EPSILON,
+                  randomize: @buffer_config.retry_randomize
+                )
+              else
+                @retry = retry_state_create(
+                  :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                  forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                  max_interval: @buffer_config.retry_max_interval,
+                  randomize: @buffer_config.retry_randomize
+                )
+              end
+            end
+          end
+
+          rollback_write(job_reference.chunk_id)
+        rescue => e
+          log.error("unexpected error while polling", error: e)
+          log.error_backtrace
+        end
+      end
+
+      def create_upload_source(chunk)
+        chunk_is_file = @buffer_config["@type"] == 'file'
+        if chunk_is_file
+          File.open(chunk.path) do |file|
+            yield file
+          end
+        else
+          Tempfile.open("chunk-tmp") do |file|
+            file.binmode
+            chunk.write_to(file)
+            file.sync
+            file.rewind
+            yield file
+          end
+        end
+      end
+    end
+  end
+end
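
For completeness, here is a minimal configuration sketch for the `bigquery_load` output, again assuming JSON-key auth; the match tag, key path, buffer path, and project/dataset/table names are placeholders. The buffer block mirrors the file-buffer defaults set in the code above (a file buffer flushed hourly), so `create_upload_source` can stream the chunk file directly to the load job.

  <match dummy>
    @type bigquery_load

    auth_method json_key
    # placeholder path to a service account key
    json_key /path/to/your_credential.json

    # placeholder project/dataset/table names
    project yourproject_id
    dataset yourdataset_id
    table   accesslog

    schema_path /path/to/schema.json

    <buffer>
      @type file
      # placeholder buffer path
      path /var/log/fluentd/bigquery_load.*.buffer
      flush_interval 3600
    </buffer>
  </match>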