fluent-plugin-bigquery-test 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
+ module Fluent
+   module BigQueryPlugin
+     VERSION = "2.2.0".freeze
+   end
+ end
@@ -0,0 +1,356 @@
+ module Fluent
+   module BigQuery
+     class Writer
+       def initialize(log, auth_method, options = {})
+         @auth_method = auth_method
+         @scope = "https://www.googleapis.com/auth/bigquery"
+         @options = options
+         @log = log
+         @num_errors_per_chunk = {}
+       end
+
+       def client
+         @client ||= Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
+           cl.authorization = get_auth
+           cl.client_options.open_timeout_sec = @options[:open_timeout_sec] if @options[:open_timeout_sec]
+           cl.client_options.read_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
+           cl.client_options.send_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
+         end
+       end
+
+       def create_table(project, dataset, table_id, record_schema)
+         create_table_retry_limit = 3
+         create_table_retry_wait = 1
+         create_table_retry_count = 0
+         table_id = safe_table_id(table_id)
+
+         begin
+           definition = {
+             table_reference: {
+               table_id: table_id,
+             },
+             schema: {
+               fields: record_schema.to_a,
+             }
+           }
+
+           definition.merge!(time_partitioning: time_partitioning) if time_partitioning
+           definition.merge!(clustering: clustering) if clustering
+           client.insert_table(project, dataset, definition, {})
+           log.debug "create table", project_id: project, dataset: dataset, table: table_id
+         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+           message = e.message
+           if e.status_code == 409 && /Already Exists:/ =~ message
+             log.debug "already created table", project_id: project, dataset: dataset, table: table_id
+             # ignore 'Already Exists' error
+             return
+           end
+
+           log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
+
+           if create_table_retry_count < create_table_retry_limit
+             sleep create_table_retry_wait
+             create_table_retry_wait *= 2
+             create_table_retry_count += 1
+             retry
+           else
+             raise Fluent::BigQuery::UnRetryableError.new("failed to create table in bigquery", e)
+           end
+         end
+       end
+
+       def fetch_schema(project, dataset, table_id)
+         res = client.get_table(project, dataset, table_id)
+         schema = Fluent::BigQuery::Helper.deep_stringify_keys(res.schema.to_h[:fields])
+         log.debug "Load schema from BigQuery: #{project}:#{dataset}.#{table_id} #{schema}"
+
+         schema
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         message = e.message
+         log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
+         nil
+       end
+
+       def insert_rows(project, dataset, table_id, rows, schema, template_suffix: nil)
+         body = {
+           rows: rows,
+           skip_invalid_rows: @options[:skip_invalid_rows],
+           ignore_unknown_values: @options[:ignore_unknown_values],
+         }
+         body.merge!(template_suffix: template_suffix) if template_suffix
+
+         if @options[:auto_create_table]
+           res = insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+         else
+           res = client.insert_all_table_data(project, dataset, table_id, body, {})
+         end
+         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
+
+         if res.insert_errors && !res.insert_errors.empty?
+           log.warn "insert errors", project_id: project, dataset: dataset, table: table_id, insert_errors: res.insert_errors.to_s
+           if @options[:allow_retry_insert_errors]
+             is_included_any_retryable_insert_error = res.insert_errors.any? do |insert_error|
+               insert_error.errors.any? { |error| Fluent::BigQuery::Error.retryable_insert_errors_reason?(error.reason) }
+             end
+             if is_included_any_retryable_insert_error
+               raise Fluent::BigQuery::RetryableError.new("failed to insert into bigquery(insert errors), retry")
+             else
+               raise Fluent::BigQuery::UnRetryableError.new("failed to insert into bigquery(insert errors), and cannot retry")
+             end
+           end
+         end
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message }
+         wrapped = Fluent::BigQuery::Error.wrap(e)
+         if wrapped.retryable?
+           log.warn "tabledata.insertAll API", error_data
+         else
+           log.error "tabledata.insertAll API", error_data
+         end
+
+         raise wrapped
+       end
+
+       JobReference = Struct.new(:chunk_id, :chunk_id_hex, :project_id, :dataset_id, :table_id, :job_id) do
+         def as_hash(*keys)
+           if keys.empty?
+             to_h
+           else
+             to_h.select { |k, _| keys.include?(k) }
+           end
+         end
+       end
+
+       def create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields)
+         configuration = {
+           configuration: {
+             load: {
+               destination_table: {
+                 project_id: project,
+                 dataset_id: dataset,
+                 table_id: table_id,
+               },
+               write_disposition: "WRITE_APPEND",
+               source_format: source_format,
+               ignore_unknown_values: @options[:ignore_unknown_values],
+               max_bad_records: @options[:max_bad_records],
+             }
+           }
+         }
+
+         job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
+         configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
+
+         begin
+           # Check table existence
+           client.get_table(project, dataset, table_id)
+         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+           if e.status_code == 404 && /Not Found: Table/i =~ e.message
+             raise Fluent::BigQuery::UnRetryableError.new("Table is not found") unless @options[:auto_create_table]
+             raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
+             configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
+             configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
+             configuration[:configuration][:load].merge!(clustering: clustering) if clustering
+           end
+         end
+
+         res = client.insert_job(
+           project,
+           configuration,
+           {
+             upload_source: upload_source,
+             content_type: "application/octet-stream",
+           }
+         )
+         JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id)
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message
+
+         if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+           return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id)
+         end
+
+         raise Fluent::BigQuery::Error.wrap(e)
+       end
+
+       def fetch_load_job(job_reference)
+         project = job_reference.project_id
+         job_id = job_reference.job_id
+         location = @options[:location]
+
+         res = client.get_job(project, job_id, location: location)
+         log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
+
+         if res.status.state == "DONE"
+           res
+         end
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         e = Fluent::BigQuery::Error.wrap(e)
+         raise e unless e.retryable?
+       end
+
+       def commit_load_job(chunk_id_hex, response)
+         job_id = response.id
+         project = response.configuration.load.destination_table.project_id
+         dataset = response.configuration.load.destination_table.dataset_id
+         table_id = response.configuration.load.destination_table.table_id
+
+         errors = response.status.errors
+         if errors
+           errors.each do |e|
+             log.error "job.load API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
+           end
+         end
+
+         error_result = response.status.error_result
+         if error_result
+           log.error "job.load API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
+           if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason)
+             @num_errors_per_chunk[chunk_id_hex] = @num_errors_per_chunk[chunk_id_hex].to_i + 1
+             raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry")
+           else
+             @num_errors_per_chunk.delete(chunk_id_hex)
+             raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry")
+           end
+         end
+
+         # `stats` can be nil if we receive a warning like "Warning: Load job succeeded with data imported, however statistics may be lost due to internal error."
+         stats = response.statistics.load
+         duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0
+         log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats&.input_file_bytes, input_files: stats&.input_files, output_bytes: stats&.output_bytes, output_rows: stats&.output_rows, bad_records: stats&.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id
+         @num_errors_per_chunk.delete(chunk_id_hex)
+       end
+
+       private
+
+       def log
+         @log
+       end
+
+       def get_auth
+         case @auth_method
+         when :private_key
+           get_auth_from_private_key
+         when :compute_engine
+           get_auth_from_compute_engine
+         when :json_key
+           get_auth_from_json_key
+         when :application_default
+           get_auth_from_application_default
+         else
+           raise ConfigError, "Unknown auth method: #{@auth_method}"
+         end
+       end
+
+       def get_auth_from_private_key
+         require 'google/api_client/auth/key_utils'
+         private_key_path = @options[:private_key_path]
+         private_key_passphrase = @options[:private_key_passphrase]
+         email = @options[:email]
+
+         key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase)
+         Signet::OAuth2::Client.new(
+           token_credential_uri: "https://accounts.google.com/o/oauth2/token",
+           audience: "https://accounts.google.com/o/oauth2/token",
+           scope: @scope,
+           issuer: email,
+           signing_key: key
+         )
+       end
+
+       def get_auth_from_compute_engine
+         Google::Auth::GCECredentials.new
+       end
+
+       def get_auth_from_json_key
+         json_key = @options[:json_key]
+
+         begin
+           JSON.parse(json_key)
+           key = StringIO.new(json_key)
+           Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope)
+         rescue JSON::ParserError
+           key = json_key
+           File.open(json_key) do |f|
+             Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope)
+           end
+         end
+       end
+
+       def get_auth_from_application_default
+         Google::Auth.get_application_default([@scope])
+       end
+
+       def safe_table_id(table_id)
+         table_id.gsub(/\$\d+$/, "")
+       end
+
+       def create_job_id(chunk_id_hex, dataset, table, schema)
+         job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id_hex]}"
+         @log.debug "job_id_key: #{job_id_key}"
+         "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
+       end
+
+       def source_format
+         case @options[:source_format]
+         when :json
+           "NEWLINE_DELIMITED_JSON"
+         when :avro
+           "AVRO"
+         when :csv
+           "CSV"
+         else
+           "NEWLINE_DELIMITED_JSON"
+         end
+       end
+
+       def time_partitioning
+         return @time_partitioning if instance_variable_defined?(:@time_partitioning)
+
+         if @options[:time_partitioning_type]
+           @time_partitioning = {
+             type: @options[:time_partitioning_type].to_s.upcase,
+             field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
+             expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
+           }.reject { |_, v| v.nil? }
+         else
+           @time_partitioning
+         end
+       end
+
+       def clustering
+         return @clustering if instance_variable_defined?(:@clustering)
+
+         if @options[:clustering_fields]
+           @clustering = {
+             fields: @options[:clustering_fields]
+           }
+         else
+           @clustering
+         end
+       end
+
+       def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+         try_count ||= 1
+         res = client.insert_all_table_data(project, dataset, table_id, body, {})
+       rescue Google::Apis::ClientError => e
+         if e.status_code == 404 && /Not Found: Table/i =~ e.message
+           if try_count == 1
+             # Table Not Found: Auto Create Table
+             create_table(project, dataset, table_id, schema)
+           elsif try_count > 10
+             raise "A new table was created but it is not found."
+           end
+
+           # Retry to insert several times because the created table is not visible from Streaming insert for a little while
+           # cf. https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts
+           try_count += 1
+           sleep 5
+           log.debug "Retry to insert rows", project_id: project, dataset: dataset, table: table_id
+           retry
+         end
+         raise
+       end
+     end
+   end
+ end
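
The Writer above is the low-level client shared by the concrete output plugins: insert_rows wraps the streaming tabledata.insertAll call (with optional table auto-creation and retry classification via Fluent::BigQuery::Error), while create_load_job, fetch_load_job, and commit_load_job drive the batch load-job path. The sketch below shows, roughly, how it would be driven directly for a streaming insert; it is illustrative only, and both the row shape and the logger are assumptions not defined in this diff (rows follow the tabledata.insertAll {json: {...}} form, and the logger must accept Fluentd's structured "message, key: value" calls, which the plugins satisfy by passing their own @log).

# Illustrative sketch only -- not part of the package.
# $log is Fluentd's global logger (available inside a Fluentd process);
# a plain stdlib Logger would not accept the structured log calls used by Writer.
writer = Fluent::BigQuery::Writer.new($log, :json_key, {
  json_key: File.read("/path/to/service_account.json"),
  skip_invalid_rows: false,
  ignore_unknown_values: false,
  auto_create_table: false,
  timeout_sec: 60,
  open_timeout_sec: 60,
})

# The schema argument is only consulted when auto_create_table is enabled,
# so nil is acceptable here; failures surface as RetryableError/UnRetryableError.
rows = [{ json: { time: Time.now.to_i, message: "hello" } }]
writer.insert_rows("my-project", "my_dataset", "access_log", rows, nil)
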
@@ -0,0 +1,221 @@
+ require 'fluent/plugin/output'
+
+ require 'fluent/plugin/bigquery/version'
+
+ require 'fluent/plugin/bigquery/helper'
+ require 'fluent/plugin/bigquery/errors'
+ require 'fluent/plugin/bigquery/schema'
+ require 'fluent/plugin/bigquery/writer'
+
+ require 'multi_json'
+ require 'google/apis/bigquery_v2'
+ require 'googleauth'
+
+ module Fluent
+   module Plugin
+     # This class is an abstract class
+     class BigQueryBaseOutput < Output
+       helpers :inject, :formatter
+
+       # Available methods are:
+       # * private_key -- Use service account credential from pkcs12 private key file
+       # * compute_engine -- Use access token available in instances of ComputeEngine
+       # * json_key -- Use service account credential from JSON key
+       # * application_default -- Use application default credential
+       config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+       ### Service Account credential
+       config_param :email, :string, default: nil
+       config_param :private_key_path, :string, default: nil
+       config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+       config_param :json_key, default: nil, secret: true
+       # The geographic location of the job. Required except for US and EU.
+       # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
+       config_param :location, :string, default: nil
+
+       # see, as a simple reference:
+       # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+       config_param :project, :string
+
+       # dataset_name
+       #   The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+       #   but it cannot start with a number or underscore, or have spaces.
+       config_param :dataset, :string
+
+       # table_id
+       #   In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+       config_param :table, :string, default: nil
+       config_param :tables, :array, value_type: :string, default: nil
+
+       config_param :auto_create_table, :bool, default: false
+
+       # ignore_unknown_values
+       #   Accept rows that contain values that do not match the schema. The unknown values are ignored.
+       #   Default is false, which treats unknown values as errors.
+       config_param :ignore_unknown_values, :bool, default: false
+
+       config_param :schema, :array, default: nil
+       config_param :schema_path, :string, default: nil
+       config_param :fetch_schema, :bool, default: false
+       config_param :fetch_schema_table, :string, default: nil
+       config_param :schema_cache_expire, :time, default: 600
+
+       ## Timeout
+       # request_timeout_sec
+       #   Bigquery API response timeout
+       # request_open_timeout_sec
+       #   Bigquery API connection, and request timeout
+       config_param :request_timeout_sec, :time, default: nil
+       config_param :request_open_timeout_sec, :time, default: 60
+
+       ## Partitioning
+       config_param :time_partitioning_type, :enum, list: [:day], default: nil
+       config_param :time_partitioning_field, :string, default: nil
+       config_param :time_partitioning_expiration, :time, default: nil
+
+       ## Clustering
+       config_param :clustering_fields, :array, default: nil
+
+       ## Formatter
+       config_section :format do
+         config_set_default :@type, 'json'
+       end
+
+       def configure(conf)
+         super
+
+         case @auth_method
+         when :private_key
+           unless @email && @private_key_path
+             raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+           end
+         when :compute_engine
+           # Do nothing
+         when :json_key
+           unless @json_key
+             raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+           end
+         when :application_default
+           # Do nothing
+         else
+           raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+         end
+
+         unless @table.nil? ^ @tables.nil?
+           raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+         end
+
+         @tablelist = @tables ? @tables : [@table]
+
+         @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+         if @schema
+           @table_schema.load_schema(@schema)
+         end
+         if @schema_path
+           @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
+         end
+
+         formatter_config = conf.elements("format")[0]
+         @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
+       end
+
+       def start
+         super
+
+         @tables_queue = @tablelist.shuffle
+         @tables_mutex = Mutex.new
+         @fetched_schemas = {}
+         @last_fetch_schema_time = Hash.new(0)
+       end
+
+       def multi_workers_ready?
+         true
+       end
+
+       def writer
+         @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+           private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+           email: @email,
+           json_key: @json_key,
+           location: @location,
+           source_format: @source_format,
+           skip_invalid_rows: @skip_invalid_rows,
+           ignore_unknown_values: @ignore_unknown_values,
+           max_bad_records: @max_bad_records,
+           allow_retry_insert_errors: @allow_retry_insert_errors,
+           prevent_duplicate_load: @prevent_duplicate_load,
+           auto_create_table: @auto_create_table,
+           time_partitioning_type: @time_partitioning_type,
+           time_partitioning_field: @time_partitioning_field,
+           time_partitioning_expiration: @time_partitioning_expiration,
+           clustering_fields: @clustering_fields,
+           timeout_sec: @request_timeout_sec,
+           open_timeout_sec: @request_open_timeout_sec,
+         })
+       end
+
+       def format(tag, time, record)
+         record = inject_values_to_record(tag, time, record)
+
+         meta = metadata(tag, time, record)
+         schema =
+           if @fetch_schema
+             fetch_schema(meta)
+           else
+             @table_schema
+           end
+
+         begin
+           row = schema.format(record)
+           return if row.empty?
+           @formatter.format(tag, time, row)
+         rescue
+           log.error("format error", record: record, schema: schema)
+           raise
+         end
+       end
+
+       def write(chunk)
+       end
+
+       def fetch_schema(metadata)
+         table_id = nil
+         project = extract_placeholders(@project, metadata)
+         dataset = extract_placeholders(@dataset, metadata)
+         table_id = fetch_schema_target_table(metadata)
+
+         if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+           schema = writer.fetch_schema(project, dataset, table_id)
+
+           if schema
+             table_schema = Fluent::BigQuery::RecordSchema.new("record")
+             table_schema.load_schema(schema)
+             @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
+           else
+             if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].nil?
+               raise "failed to fetch schema from bigquery"
+             else
+               log.warn "#{table_id} uses previous schema"
+             end
+           end
+
+           @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
+         end
+
+         @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
+       end
+
+       def fetch_schema_target_table(metadata)
+         extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+       end
+
+       def get_schema(project, dataset, metadata)
+         if @fetch_schema
+           @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+         else
+           @table_schema
+         end
+       end
+     end
+   end
+ end
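
BigQueryBaseOutput is the abstract base: write(chunk) is intentionally empty, and the concrete streaming-insert and load-job outputs that subclass it are not part of this diff. As a hedged illustration of how the parameters declared above map onto a Fluentd configuration (the plugin type name bigquery_insert, the partitioning field, and the clustering fields are assumptions, not defined in this diff):

<match events.**>
  # plugin type assumed; the concrete output class is not registered in this diff
  @type bigquery_insert

  auth_method json_key
  # either the JSON string itself or a path to the key file (see Writer#get_auth_from_json_key)
  json_key /etc/fluentd/service_account.json

  project my-project
  dataset my_dataset
  table access_log

  auto_create_table true
  schema_path /etc/fluentd/access_log_schema.json

  # mapped to the writer's time_partitioning / clustering hashes
  time_partitioning_type day
  time_partitioning_field time
  clustering_fields ["user_id", "path"]

  <format>
    @type json
  </format>
</match>
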