logstash-output-google_bigquery 4.0.0-java
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +71 -0
- data/CONTRIBUTORS +15 -0
- data/Gemfile +11 -0
- data/LICENSE +13 -0
- data/NOTICE.TXT +5 -0
- data/README.md +100 -0
- data/docs/index.asciidoc +348 -0
- data/lib/logstash-output-google_bigquery_jars.rb +38 -0
- data/lib/logstash/outputs/bigquery/batcher.rb +82 -0
- data/lib/logstash/outputs/bigquery/schema.rb +93 -0
- data/lib/logstash/outputs/bigquery/streamclient.rb +120 -0
- data/lib/logstash/outputs/google_bigquery.rb +280 -0
- data/logstash-output-google_bigquery.gemspec +31 -0
- data/spec/outputs/bigquery/batcher_spec.rb +110 -0
- data/spec/outputs/bigquery/schema_spec.rb +101 -0
- data/spec/outputs/google_bigquery_spec.rb +154 -0
- data/vendor/jar-dependencies/com/fasterxml/jackson/core/jackson-core/2.1.3/jackson-core-2.1.3.jar +0 -0
- data/vendor/jar-dependencies/com/google/api-client/google-api-client/1.23.0/google-api-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/api-common/1.5.0/api-common-1.5.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/gax-httpjson/0.40.0/gax-httpjson-0.40.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/gax/1.23.0/gax-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/grpc/proto-google-common-protos/1.7.0/proto-google-common-protos-1.7.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/grpc/proto-google-iam-v1/0.8.0/proto-google-iam-v1-0.8.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/apis/google-api-services-bigquery/v2-rev377-1.23.0/google-api-services-bigquery-v2-rev377-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auth/google-auth-library-credentials/0.9.0/google-auth-library-credentials-0.9.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auth/google-auth-library-oauth2-http/0.9.0/google-auth-library-oauth2-http-0.9.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auto/value/auto-value/1.4/auto-value-1.4.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-bigquery/1.24.1/google-cloud-bigquery-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-core-http/1.24.1/google-cloud-core-http-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-core/1.24.1/google-cloud-core-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/code/findbugs/jsr305/3.0.1/jsr305-3.0.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/code/gson/gson/2.7/gson-2.7.jar +0 -0
- data/vendor/jar-dependencies/com/google/errorprone/error_prone_annotations/2.2.0/error_prone_annotations-2.2.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/guava/guava/20.0/guava-20.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-appengine/1.23.0/google-http-client-appengine-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-jackson/1.23.0/google-http-client-jackson-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-jackson2/1.23.0/google-http-client-jackson2-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client/1.23.0/google-http-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/oauth-client/google-oauth-client/1.23.0/google-oauth-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/protobuf/protobuf-java-util/3.5.1/protobuf-java-util-3.5.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/protobuf/protobuf-java/3.5.1/protobuf-java-3.5.1.jar +0 -0
- data/vendor/jar-dependencies/commons-codec/commons-codec/1.9/commons-codec-1.9.jar +0 -0
- data/vendor/jar-dependencies/commons-logging/commons-logging/1.2/commons-logging-1.2.jar +0 -0
- data/vendor/jar-dependencies/io/grpc/grpc-context/1.9.0/grpc-context-1.9.0.jar +0 -0
- data/vendor/jar-dependencies/io/opencensus/opencensus-api/0.11.1/opencensus-api-0.11.1.jar +0 -0
- data/vendor/jar-dependencies/io/opencensus/opencensus-contrib-http-util/0.11.1/opencensus-contrib-http-util-0.11.1.jar +0 -0
- data/vendor/jar-dependencies/joda-time/joda-time/2.9.2/joda-time-2.9.2.jar +0 -0
- data/vendor/jar-dependencies/org/apache/httpcomponents/httpclient/4.5.2/httpclient-4.5.2.jar +0 -0
- data/vendor/jar-dependencies/org/apache/httpcomponents/httpcore/4.4.4/httpcore-4.4.4.jar +0 -0
- data/vendor/jar-dependencies/org/codehaus/jackson/jackson-core-asl/1.9.11/jackson-core-asl-1.9.11.jar +0 -0
- data/vendor/jar-dependencies/org/threeten/threetenbp/1.3.3/threetenbp-1.3.3.jar +0 -0
- metadata +178 -0

data/lib/logstash-output-google_bigquery_jars.rb
@@ -0,0 +1,38 @@
+# AUTOGENERATED BY THE GRADLE SCRIPT. DO NOT EDIT.
+
+require 'jar_dependencies'
+require_jar('com.google.cloud', 'google-cloud-bigquery', '1.24.1')
+require_jar('com.fasterxml.jackson.core', 'jackson-core', '2.1.3')
+require_jar('com.google.api', 'api-common', '1.5.0')
+require_jar('com.google.api-client', 'google-api-client', '1.23.0')
+require_jar('com.google.api', 'gax', '1.23.0')
+require_jar('com.google.api', 'gax-httpjson', '0.40.0')
+require_jar('com.google.api.grpc', 'proto-google-common-protos', '1.7.0')
+require_jar('com.google.api.grpc', 'proto-google-iam-v1', '0.8.0')
+require_jar('com.google.apis', 'google-api-services-bigquery', 'v2-rev377-1.23.0')
+require_jar('com.google.auth', 'google-auth-library-credentials', '0.9.0')
+require_jar('com.google.auth', 'google-auth-library-oauth2-http', '0.9.0')
+require_jar('com.google.auto.value', 'auto-value', '1.4')
+require_jar('com.google.cloud', 'google-cloud-core', '1.24.1')
+require_jar('com.google.cloud', 'google-cloud-core-http', '1.24.1')
+require_jar('com.google.code.findbugs', 'jsr305', '3.0.1')
+require_jar('com.google.code.gson', 'gson', '2.7')
+require_jar('com.google.errorprone', 'error_prone_annotations', '2.2.0')
+require_jar('com.google.guava', 'guava', '20.0')
+require_jar('com.google.http-client', 'google-http-client', '1.23.0')
+require_jar('com.google.http-client', 'google-http-client-appengine', '1.23.0')
+require_jar('com.google.http-client', 'google-http-client-jackson', '1.23.0')
+require_jar('com.google.http-client', 'google-http-client-jackson2', '1.23.0')
+require_jar('com.google.oauth-client', 'google-oauth-client', '1.23.0')
+require_jar('com.google.protobuf', 'protobuf-java', '3.5.1')
+require_jar('com.google.protobuf', 'protobuf-java-util', '3.5.1')
+require_jar('commons-codec', 'commons-codec', '1.9')
+require_jar('commons-logging', 'commons-logging', '1.2')
+require_jar('io.grpc', 'grpc-context', '1.9.0')
+require_jar('io.opencensus', 'opencensus-api', '0.11.1')
+require_jar('io.opencensus', 'opencensus-contrib-http-util', '0.11.1')
+require_jar('joda-time', 'joda-time', '2.9.2')
+require_jar('org.apache.httpcomponents', 'httpclient', '4.5.2')
+require_jar('org.apache.httpcomponents', 'httpcore', '4.4.4')
+require_jar('org.codehaus.jackson', 'jackson-core-asl', '1.9.11')
+require_jar('org.threeten', 'threetenbp', '1.3.3')

data/lib/logstash/outputs/bigquery/batcher.rb
@@ -0,0 +1,82 @@
+require 'thread'
+require 'java'
+require 'logstash-output-google_bigquery_jars.rb'
+
+module LogStash
+  module Outputs
+    module BigQuery
+      # Batcher is a queue that bundles messages in batches based on their
+      # size in bytes or count. It's used to provide guarantees around
+      # maximum data loss due to a fault while maintaining good upload
+      # throughput.
+      class Batcher
+        include_package 'java.util.concurrent.locks'
+
+        def initialize(max_length, max_bytes)
+          @lock = ReentrantReadWriteLock.new
+          @max_length = max_length
+          @max_bytes = max_bytes
+
+          clear
+        end
+
+        # enqueue_push calls enqueue and if a batch is ready to go pushes it to
+        # the provided queue.
+        def enqueue_push(message, queue)
+          batch = enqueue message
+
+          queue << batch unless batch.nil?
+        end
+
+        # enqueue adds a message to the batch. If the batch is ready to be sent
+        # out the internal state is reset and the array of messages is both
+        # yielded and returned.
+        # Otherwise nil is returned.
+        def enqueue(message)
+          @lock.write_lock.lock
+
+          begin
+            is_flush_request = message.nil?
+
+            unless is_flush_request
+              @batch_size_bytes += message.length
+              @batch << message
+            end
+
+            length_met = @batch.length >= @max_length
+            size_met = @batch_size_bytes >= @max_bytes
+
+            if is_flush_request || length_met || size_met
+              orig = @batch
+              clear
+
+              yield(orig) if block_given?
+              return orig
+            end
+
+            nil
+          ensure
+            @lock.write_lock.unlock
+          end
+        end
+
+        # removes all elements from the batch
+        def clear
+          @lock.write_lock.lock
+          @batch = []
+          @batch_size_bytes = 0
+          @lock.write_lock.unlock
+        end
+
+        def empty?
+          @lock.read_lock.lock
+          begin
+            @batch.empty? && @batch_size_bytes.zero?
+          ensure
+            @lock.read_lock.unlock
+          end
+        end
+      end
+    end
+  end
+end
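
For reference, a rough usage sketch of the Batcher above (not part of the diff; it assumes a JRuby runtime with this gem and its vendored jars on the load path, since batcher.rb loads Java lock classes):

require 'logstash/outputs/bigquery/batcher'

# Flush after 2 messages or 1 KiB of serialized data, whichever comes first.
batcher = LogStash::Outputs::BigQuery::Batcher.new(2, 1024)

batcher.enqueue('{"status":200}')          # => nil, batch not full yet
batch = batcher.enqueue('{"status":500}')  # => ['{"status":200}', '{"status":500}']

# A nil message is a flush request: whatever is buffered is yielded and returned.
batcher.enqueue('{"status":404}')
batcher.enqueue(nil) { |pending| puts pending.length }  # prints 1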

data/lib/logstash/outputs/bigquery/schema.rb
@@ -0,0 +1,93 @@
+require 'java'
+require 'logstash-output-google_bigquery_jars.rb'
+require 'logstash/json'
+
+module LogStash
+  module Outputs
+    module BigQuery
+      class Schema
+        include_package 'com.google.cloud.bigquery'
+
+        # Converts a CSV schema or JSON schema into a BigQuery Java Schema.
+        def self.parse_csv_or_json(csv_schema, json_schema)
+          csv_blank = csv_schema.nil? || csv_schema.empty?
+          json_blank = json_schema.nil? || json_schema.empty?
+
+          unless csv_blank ^ json_blank
+            raise ArgumentError.new("You must provide either json_schema OR csv_schema. csv: #{csv_schema}, json: #{json_schema}")
+          end
+
+          if csv_blank
+            schema = json_schema
+          else
+            schema = parse_csv_schema csv_schema
+          end
+
+          self.hash_to_java_schema schema
+        end
+
+        # Converts a CSV of field:type pairs into the JSON style schema.
+        def self.parse_csv_schema(csv_schema)
+          require 'csv'
+
+          fields = []
+
+          CSV.parse(csv_schema.gsub('\"', '""')).flatten.each do |field|
+            raise ArgumentError.new('csv_schema must follow the format <field-name>:<field-type>') if field.nil?
+
+            temp = field.strip.split(':')
+
+            if temp.length != 2
+              raise ArgumentError.new('csv_schema must follow the format <field-name>:<field-type>')
+            end
+
+            fields << { 'name' => temp[0], 'type' => temp[1] }
+          end
+
+          # Check that we have at least one field in the schema
+          raise ArgumentError.new('csv_schema must contain at least one field') if fields.empty?
+
+          { 'fields' => fields }
+        end
+
+        # Converts the Ruby hash style schema into a BigQuery Java schema
+        def self.hash_to_java_schema(schema_hash)
+          field_list = self.parse_field_list schema_hash['fields']
+          com.google.cloud.bigquery.Schema.of field_list
+        end
+
+        # Converts a list of fields into a BigQuery Java FieldList
+        def self.parse_field_list(fields)
+          fieldslist = fields.map {|field| self.parse_field field}
+
+          FieldList.of fieldslist
+        end
+
+        # Converts a single field definition into a BigQuery Java Field object.
+        # This includes any nested fields as well.
+        def self.parse_field(field)
+          type = LegacySQLTypeName.valueOfStrict(field['type'])
+          name = field['name']
+
+          if field.has_key? 'fields'
+            sub_fields = self.parse_field_list field['fields']
+            builder = Field.newBuilder(name, type, sub_fields)
+          else
+            builder = Field.newBuilder(name, type)
+          end
+
+          if field.has_key? 'description'
+            builder = builder.setDescription(field['description'])
+          end
+
+          if field.has_key? 'mode'
+            mode = Field::Mode.valueOf field['mode']
+            builder = builder.setMode(mode)
+          end
+
+          builder.build
+        end
+      end
+    end
+  end
+end
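
A rough sketch of how this Schema helper is called (not part of the diff; assumes a JRuby runtime with the gem installed). Exactly one of the two arguments may be set, and either form returns a com.google.cloud.bigquery.Schema Java object:

require 'logstash/outputs/bigquery/schema'

# CSV form: comma-separated field:type pairs.
csv_based = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json('path:STRING,status:INTEGER', nil)

# Hash form: supports nested RECORD fields, descriptions and modes.
hash_based = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json(nil, {
  'fields' => [
    { 'name' => 'path', 'type' => 'STRING', 'description' => 'Request route' },
    { 'name' => 'status', 'type' => 'INTEGER', 'mode' => 'NULLABLE' }
  ]
})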

data/lib/logstash/outputs/bigquery/streamclient.rb
@@ -0,0 +1,120 @@
+require 'java'
+require 'openssl'
+require 'logstash-output-google_bigquery_jars.rb'
+
+module LogStash
+  module Outputs
+    module BigQuery
+      # NOTE: This file uses _a lot_ of Java. Please keep the Java looking
+      # java-y so it's easy to tell the languages apart.
+
+      include_package 'com.google.cloud.bigquery'
+
+      # StreamingClient supports shipping data to BigQuery using streams.
+      class StreamingClient
+        def initialize(json_key_file, project_id, logger)
+          @logger = logger
+
+          @bigquery = initialize_google_client json_key_file, project_id
+        end
+
+        def table_exists?(dataset, table)
+          api_debug('Checking if table exists', dataset, table)
+          tbl = @bigquery.getTable dataset, table
+
+          !tbl.nil?
+        end
+
+        # Creates a table with the given name in the given dataset
+        def create_table(dataset, table, schema)
+          api_debug('Creating table', dataset, table)
+          table_id = com.google.cloud.bigquery.TableId.of dataset, table
+
+          table_defn = com.google.cloud.bigquery.StandardTableDefinition.of schema
+          table_info = com.google.cloud.bigquery.TableInfo.newBuilder(table_id, table_defn).build()
+
+          @bigquery.create table_info
+        end
+
+        def append(dataset, table, rows, ignore_unknown)
+          api_debug("Appending #{rows.length} rows", dataset, table)
+
+          request = build_append_request dataset, table, rows, ignore_unknown
+
+          response = @bigquery.insertAll request
+          return true unless response.hasErrors
+
+          response.getInsertErrors().entrySet().each{ |entry|
+            key = entry.getKey
+            errors = entry.getValue
+
+            errors.each{|bqError|
+              @logger.warn('Error while inserting',
+                           key: key,
+                           location: bqError.getLocation,
+                           message: bqError.getMessage,
+                           reason: bqError.getReason)
+            }
+          }
+
+          false
+        end
+
+        def build_append_request(dataset, table, rows, ignore_unknown)
+          request = com.google.cloud.bigquery.InsertAllRequest.newBuilder dataset, table
+          request.setIgnoreUnknownValues ignore_unknown
+
+          rows.each { |serialized_row|
+            # deserialize rows into Java maps
+            deserialized = LogStash::Json.load serialized_row
+            request.addRow deserialized
+          }
+
+          request.build
+        end
+
+        # raises an exception if the key file is invalid
+        def get_key_file_error(json_key_file)
+          return nil if json_key_file.nil? || json_key_file == ''
+
+          abs = ::File.absolute_path json_key_file
+          unless abs == json_key_file
+            return "json_key_file must be an absolute path: #{json_key_file}"
+          end
+
+          unless ::File.exist? json_key_file
+            return "json_key_file does not exist: #{json_key_file}"
+          end
+
+          nil
+        end
+
+        def initialize_google_client(json_key_file, project_id)
+          @logger.info("Initializing Google API client #{project_id} key: #{json_key_file}")
+          err = get_key_file_error json_key_file
+          raise err unless err.nil?
+
+          if json_key_file.nil? || json_key_file.empty?
+            return com.google.cloud.bigquery.BigQueryOptions.getDefaultInstance().getService()
+          end
+
+          # TODO: set User-Agent
+
+          key_file = java.io.FileInputStream.new json_key_file
+          credentials = com.google.auth.oauth2.ServiceAccountCredentials.fromStream key_file
+          return com.google.cloud.bigquery.BigQueryOptions.newBuilder()
+                     .setCredentials(credentials)
+                     .setProjectId(project_id)
+                     .build()
+                     .getService()
+        end
+
+        private
+
+        def api_debug(message, dataset, table)
+          @logger.debug(message, dataset: dataset, table: table)
+        end
+      end
+    end
+  end
+end
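
A rough end-to-end sketch of the StreamingClient above (not part of the diff). It assumes a JRuby runtime with the gem installed, a BigQuery dataset named "logs" in a hypothetical project "my-project", and a service account key; the logger class here is a hypothetical stand-in for the structured Logstash logger the plugin normally passes in:

require 'logstash/outputs/bigquery/streamclient'
require 'logstash/outputs/bigquery/schema'

# Stand-in logger: must accept a message plus key/value pairs, like the Logstash logger.
class StdoutLogger
  [:debug, :info, :warn, :error].each do |level|
    define_method(level) { |msg, *fields| puts "#{level}: #{msg} #{fields.inspect}" }
  end
end

schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json('path:STRING,status:INTEGER', nil)

# A nil or empty key file falls back to Application Default Credentials.
client = LogStash::Outputs::BigQuery::StreamingClient.new('/path/to/key.json', 'my-project', StdoutLogger.new)

table = 'logstash_2018_01_01T00_00'
client.create_table('logs', table, schema) unless client.table_exists?('logs', table)

# Rows are JSON strings; append returns false if any row was rejected.
ok = client.append('logs', table, ['{"path":"/","status":200}'], false)
puts 'some rows failed, see warnings above' unless ok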

data/lib/logstash/outputs/google_bigquery.rb
@@ -0,0 +1,280 @@
+require 'logstash/outputs/base'
+require 'logstash/namespace'
+require 'logstash/json'
+require 'logstash/outputs/bigquery/streamclient'
+require 'logstash/outputs/bigquery/batcher'
+require 'logstash/outputs/bigquery/schema'
+
+require 'time'
+require 'fileutils'
+
+#
+# === Summary
+#
+# This plugin uploads events to Google BigQuery using the streaming API
+# so data can become available nearly immediately.
+#
+# You can configure it to flush periodically, after N events or after
+# a certain amount of data is ingested.
+#
+# === Environment Configuration
+#
+# You must enable BigQuery on your GCS account and create a dataset to
+# hold the tables this plugin generates.
+#
+# You must also grant the service account this plugin uses access to
+# the dataset.
+#
+# You can use https://www.elastic.co/guide/en/logstash/current/event-dependent-configuration.html[Logstash conditionals]
+# and multiple configuration blocks to upload events with different structures.
+#
+# === Usage
+# This is an example of logstash config:
+#
+# [source,ruby]
+# --------------------------
+# output {
+#   google_bigquery {
+#     project_id => "folkloric-guru-278" (required)
+#     dataset => "logs" (required)
+#     csv_schema => "path:STRING,status:INTEGER,score:FLOAT" (required) <1>
+#     json_key_file => "/path/to/key.json" (optional) <2>
+#     error_directory => "/tmp/bigquery-errors" (required)
+#     date_pattern => "%Y-%m-%dT%H:00" (optional)
+#     flush_interval_secs => 30 (optional)
+#   }
+# }
+# --------------------------
+#
+# <1> Specify either a csv_schema or a json_schema.
+#
+# <2> If the key is not used, then the plugin tries to find
+#     https://cloud.google.com/docs/authentication/production[Application Default Credentials]
+#
+# === Considerations
+#
+# * There is a small fee to insert data into BigQuery using the streaming API
+# * This plugin buffers events in-memory, so make sure the flush configurations are appropriate
+#   for your use-case and consider using
+#   https://www.elastic.co/guide/en/logstash/current/persistent-queues.html[Logstash Persistent Queues]
+#
+# === Additional Resources
+#
+# * https://cloud.google.com/bigquery/[BigQuery Introduction]
+# * https://cloud.google.com/bigquery/docs/schemas[BigQuery Schema Formats and Types]
+# * https://cloud.google.com/bigquery/pricing[Pricing Information]
+#
+class LogStash::Outputs::GoogleBigQuery < LogStash::Outputs::Base
+  config_name 'google_bigquery'
+
+  concurrency :single
+
+  # Google Cloud Project ID (number, not Project Name!).
+  config :project_id, validate: :string, required: true
+
+  # The BigQuery dataset the tables for the events will be added to.
+  config :dataset, validate: :string, required: true
+
+  # BigQuery table ID prefix to be used when creating new tables for log data.
+  # Table name will be `<table_prefix><table_separator><date>`
+  config :table_prefix, validate: :string, default: 'logstash'
+
+  # BigQuery table separator to be added between the table_prefix and the
+  # date suffix.
+  config :table_separator, validate: :string, default: '_'
+
+  # Schema for log data. It must follow the format `name1:type1(,name2:type2)*`.
+  # For example, `path:STRING,status:INTEGER,score:FLOAT`.
+  config :csv_schema, validate: :string, required: false, default: nil
+
+  # Schema for log data as a hash.
+  # These can include nested records, descriptions, and modes.
+  #
+  # Example:
+  # [source,ruby]
+  # --------------------------
+  # json_schema => {
+  #   fields => [{
+  #     name => "endpoint"
+  #     type => "STRING"
+  #     description => "Request route"
+  #   }, {
+  #     name => "status"
+  #     type => "INTEGER"
+  #     mode => "NULLABLE"
+  #   }, {
+  #     name => "params"
+  #     type => "RECORD"
+  #     mode => "REPEATED"
+  #     fields => [{
+  #       name => "key"
+  #       type => "STRING"
+  #     }, {
+  #       name => "value"
+  #       type => "STRING"
+  #     }]
+  #   }]
+  # }
+  # --------------------------
+  config :json_schema, validate: :hash, required: false, default: nil
+
+  # Indicates if BigQuery should ignore values that are not represented in the table schema.
+  # If true, the extra values are discarded.
+  # If false, BigQuery will reject the records with extra fields and the job will fail.
+  # The default value is false.
+  #
+  # NOTE: You may want to add a Logstash filter like the following to remove common fields it adds:
+  # [source,ruby]
+  # ----------------------------------
+  # mutate {
+  #   remove_field => ["@version","@timestamp","path","host","type", "message"]
+  # }
+  # ----------------------------------
+  config :ignore_unknown_values, validate: :boolean, default: false
+
+  # Time pattern for BigQuery table, defaults to hourly tables.
+  # Must Time.strftime patterns: www.ruby-doc.org/core-2.0/Time.html#method-i-strftime
+  config :date_pattern, validate: :string, default: '%Y-%m-%dT%H:00'
+
+  # If logstash is running within Google Compute Engine, the plugin will use
+  # GCE's Application Default Credentials. Outside of GCE, you will need to
+  # specify a Service Account JSON key file.
+  config :json_key_file, validate: :string, required: false
+
+  # The number of messages to upload at a single time. (< 1000, default: 128)
+  config :batch_size, validate: :number, required: true, default: 128
+
+  # An approximate number of bytes to upload as part of a batch. Default: 1MB
+  config :batch_size_bytes, validate: :number, required: true, default: 1_000_000
+
+  # Uploads all data this often even if other upload criteria aren't met. Default: 5s
+  config :flush_interval_secs, validate: :number, required: true, default: 5
+
+  # The location to store events that could not be uploaded due to errors.
+  # Consider using an additional Logstash input to pipe the contents of
+  # these to an alert platform so you can manually fix the events.
+  #
+  # Or use https://cloud.google.com/storage/docs/gcs-fuse[GCS FUSE] to
+  # transparently upload to a GCS bucket.
+  #
+  # Files names follow the pattern `[table name]-[UNIX timestamp].log`
+  config :error_directory, validate: :string, required: true, default: '/tmp/bigquery_errors'
+
+  # The following configuration options still exist to alert users that are using them
+  config :uploader_interval_secs, validate: :number, deprecated: 'No longer used.'
+  config :deleter_interval_secs, validate: :number, deprecated: 'No longer used.'
+  config :key_path, validate: :string, obsolete: 'Use json_key_file or ADC instead.'
+  config :key_password, validate: :string, deprecated: 'No longer needed with json_key_file or ADC.'
+  config :service_account, validate: :string, deprecated: 'No longer needed with json_key_file or ADC.'
+  config :temp_file_prefix, validate: :string, deprecated: 'No longer used.'
+  config :temp_directory, validate: :string, deprecated: 'No longer used.'
+
+  public
+
+  def register
+    @logger.debug('Registering plugin')
+
+    @schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json @csv_schema, @json_schema
+    @bq_client = LogStash::Outputs::BigQuery::StreamingClient.new @json_key_file, @project_id, @logger
+    @batcher = LogStash::Outputs::BigQuery::Batcher.new @batch_size, @batch_size_bytes
+
+    init_batcher_flush_thread
+  end
+
+  # Method called for each log event. It writes the event to the current output
+  # file, flushing depending on flush interval configuration.
+  def receive(event)
+    @logger.debug('BQ: receive method called', event: event)
+
+    # Property names MUST NOT have @ in them
+    message = replace_at_keys event.to_hash
+
+    # Message must be written as json
+    encoded_message = LogStash::Json.dump message
+
+    @batcher.enqueue(encoded_message) { |batch| publish(batch) }
+  end
+
+  def get_table_name(time=nil)
+    time ||= Time.now
+
+    str_time = time.strftime(@date_pattern)
+    table_id = @table_prefix + @table_separator + str_time
+
+    # BQ does not accept anything other than alphanumeric and _
+    # Ref: https://developers.google.com/bigquery/browser-tool-quickstart?hl=en
+    table_id.tr!(':-', '_')
+
+    table_id
+  end
+
+  # Remove @ symbols in hash keys
+  def replace_at_keys(event)
+    return event unless event.is_a? Hash
+
+    out = {}
+
+    event.each do |key, value|
+      new_key = key.to_s.delete '@'
+      out[new_key] = replace_at_keys value
+    end
+
+    out
+  end
+
+  # publish sends messages to a BigQuery table immediately
+  def publish(messages)
+    begin
+      return if messages.nil? || messages.empty?
+
+      table = get_table_name
+      @logger.info("Publishing #{messages.length} messages to #{table}")
+
+      create_table_if_not_exists table
+
+      successful = @bq_client.append @dataset, table, messages, @ignore_unknown_values
+      write_to_errors_file(messages, table) unless successful
+    rescue StandardError => e
+      @logger.error 'Error uploading data.', :exception => e
+
+      write_to_errors_file(messages, table)
+    end
+  end
+
+  def create_table_if_not_exists table
+    begin
+      return nil if @bq_client.table_exists? @dataset, table
+      @bq_client.create_table(@dataset, table, @schema)
+
+    rescue StandardError => e
+      @logger.error 'Error creating table.', :exception => e
+    end
+  end
+
+  def write_to_errors_file(messages, table)
+    begin
+      FileUtils.mkdir_p @error_directory
+
+      t = Time.new
+      error_file_name = "#{table}-#{t.to_i}.log"
+      error_file_path = ::File.join(@error_directory, error_file_name)
+      @logger.info "Problem data is being stored in: #{error_file_path}"
+
+      File.open(error_file_path, 'w') do |f|
+        messages.each { |message| f.puts message }
+      end
+    rescue StandardError => e
+      @logger.error 'Error creating error file.', :exception => e, :messages => messages, :table => table
+    end
+  end
+
+  def init_batcher_flush_thread
+    @flush_thread = Thread.new do
+      loop do
+        sleep @flush_interval_secs
+
+        @batcher.enqueue(nil) { |batch| publish(batch) }
+      end
+    end
+  end
+end
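
As a plain-Ruby illustration of the table-naming logic in get_table_name above (not part of the diff; runnable on its own, with values mirroring the plugin defaults and an arbitrary example timestamp):

require 'time'

date_pattern    = '%Y-%m-%dT%H:00'   # plugin default
table_prefix    = 'logstash'         # plugin default
table_separator = '_'                # plugin default

table_id = table_prefix + table_separator + Time.utc(2018, 5, 7, 14, 30).strftime(date_pattern)
table_id.tr!(':-', '_')              # BigQuery table IDs allow only alphanumerics and '_'

puts table_id  # => logstash_2018_05_07T14_00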