logstash-output-google_bigquery 4.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +71 -0
  3. data/CONTRIBUTORS +15 -0
  4. data/Gemfile +11 -0
  5. data/LICENSE +13 -0
  6. data/NOTICE.TXT +5 -0
  7. data/README.md +100 -0
  8. data/docs/index.asciidoc +348 -0
  9. data/lib/logstash-output-google_bigquery_jars.rb +38 -0
  10. data/lib/logstash/outputs/bigquery/batcher.rb +82 -0
  11. data/lib/logstash/outputs/bigquery/schema.rb +93 -0
  12. data/lib/logstash/outputs/bigquery/streamclient.rb +120 -0
  13. data/lib/logstash/outputs/google_bigquery.rb +280 -0
  14. data/logstash-output-google_bigquery.gemspec +31 -0
  15. data/spec/outputs/bigquery/batcher_spec.rb +110 -0
  16. data/spec/outputs/bigquery/schema_spec.rb +101 -0
  17. data/spec/outputs/google_bigquery_spec.rb +154 -0
  18. data/vendor/jar-dependencies/com/fasterxml/jackson/core/jackson-core/2.1.3/jackson-core-2.1.3.jar +0 -0
  19. data/vendor/jar-dependencies/com/google/api-client/google-api-client/1.23.0/google-api-client-1.23.0.jar +0 -0
  20. data/vendor/jar-dependencies/com/google/api/api-common/1.5.0/api-common-1.5.0.jar +0 -0
  21. data/vendor/jar-dependencies/com/google/api/gax-httpjson/0.40.0/gax-httpjson-0.40.0.jar +0 -0
  22. data/vendor/jar-dependencies/com/google/api/gax/1.23.0/gax-1.23.0.jar +0 -0
  23. data/vendor/jar-dependencies/com/google/api/grpc/proto-google-common-protos/1.7.0/proto-google-common-protos-1.7.0.jar +0 -0
  24. data/vendor/jar-dependencies/com/google/api/grpc/proto-google-iam-v1/0.8.0/proto-google-iam-v1-0.8.0.jar +0 -0
  25. data/vendor/jar-dependencies/com/google/apis/google-api-services-bigquery/v2-rev377-1.23.0/google-api-services-bigquery-v2-rev377-1.23.0.jar +0 -0
  26. data/vendor/jar-dependencies/com/google/auth/google-auth-library-credentials/0.9.0/google-auth-library-credentials-0.9.0.jar +0 -0
  27. data/vendor/jar-dependencies/com/google/auth/google-auth-library-oauth2-http/0.9.0/google-auth-library-oauth2-http-0.9.0.jar +0 -0
  28. data/vendor/jar-dependencies/com/google/auto/value/auto-value/1.4/auto-value-1.4.jar +0 -0
  29. data/vendor/jar-dependencies/com/google/cloud/google-cloud-bigquery/1.24.1/google-cloud-bigquery-1.24.1.jar +0 -0
  30. data/vendor/jar-dependencies/com/google/cloud/google-cloud-core-http/1.24.1/google-cloud-core-http-1.24.1.jar +0 -0
  31. data/vendor/jar-dependencies/com/google/cloud/google-cloud-core/1.24.1/google-cloud-core-1.24.1.jar +0 -0
  32. data/vendor/jar-dependencies/com/google/code/findbugs/jsr305/3.0.1/jsr305-3.0.1.jar +0 -0
  33. data/vendor/jar-dependencies/com/google/code/gson/gson/2.7/gson-2.7.jar +0 -0
  34. data/vendor/jar-dependencies/com/google/errorprone/error_prone_annotations/2.2.0/error_prone_annotations-2.2.0.jar +0 -0
  35. data/vendor/jar-dependencies/com/google/guava/guava/20.0/guava-20.0.jar +0 -0
  36. data/vendor/jar-dependencies/com/google/http-client/google-http-client-appengine/1.23.0/google-http-client-appengine-1.23.0.jar +0 -0
  37. data/vendor/jar-dependencies/com/google/http-client/google-http-client-jackson/1.23.0/google-http-client-jackson-1.23.0.jar +0 -0
  38. data/vendor/jar-dependencies/com/google/http-client/google-http-client-jackson2/1.23.0/google-http-client-jackson2-1.23.0.jar +0 -0
  39. data/vendor/jar-dependencies/com/google/http-client/google-http-client/1.23.0/google-http-client-1.23.0.jar +0 -0
  40. data/vendor/jar-dependencies/com/google/oauth-client/google-oauth-client/1.23.0/google-oauth-client-1.23.0.jar +0 -0
  41. data/vendor/jar-dependencies/com/google/protobuf/protobuf-java-util/3.5.1/protobuf-java-util-3.5.1.jar +0 -0
  42. data/vendor/jar-dependencies/com/google/protobuf/protobuf-java/3.5.1/protobuf-java-3.5.1.jar +0 -0
  43. data/vendor/jar-dependencies/commons-codec/commons-codec/1.9/commons-codec-1.9.jar +0 -0
  44. data/vendor/jar-dependencies/commons-logging/commons-logging/1.2/commons-logging-1.2.jar +0 -0
  45. data/vendor/jar-dependencies/io/grpc/grpc-context/1.9.0/grpc-context-1.9.0.jar +0 -0
  46. data/vendor/jar-dependencies/io/opencensus/opencensus-api/0.11.1/opencensus-api-0.11.1.jar +0 -0
  47. data/vendor/jar-dependencies/io/opencensus/opencensus-contrib-http-util/0.11.1/opencensus-contrib-http-util-0.11.1.jar +0 -0
  48. data/vendor/jar-dependencies/joda-time/joda-time/2.9.2/joda-time-2.9.2.jar +0 -0
  49. data/vendor/jar-dependencies/org/apache/httpcomponents/httpclient/4.5.2/httpclient-4.5.2.jar +0 -0
  50. data/vendor/jar-dependencies/org/apache/httpcomponents/httpcore/4.4.4/httpcore-4.4.4.jar +0 -0
  51. data/vendor/jar-dependencies/org/codehaus/jackson/jackson-core-asl/1.9.11/jackson-core-asl-1.9.11.jar +0 -0
  52. data/vendor/jar-dependencies/org/threeten/threetenbp/1.3.3/threetenbp-1.3.3.jar +0 -0
  53. metadata +178 -0
data/lib/logstash-output-google_bigquery_jars.rb
@@ -0,0 +1,38 @@
+ # AUTOGENERATED BY THE GRADLE SCRIPT. DO NOT EDIT.
+
+ require 'jar_dependencies'
+ require_jar('com.google.cloud', 'google-cloud-bigquery', '1.24.1')
+ require_jar('com.fasterxml.jackson.core', 'jackson-core', '2.1.3')
+ require_jar('com.google.api', 'api-common', '1.5.0')
+ require_jar('com.google.api-client', 'google-api-client', '1.23.0')
+ require_jar('com.google.api', 'gax', '1.23.0')
+ require_jar('com.google.api', 'gax-httpjson', '0.40.0')
+ require_jar('com.google.api.grpc', 'proto-google-common-protos', '1.7.0')
+ require_jar('com.google.api.grpc', 'proto-google-iam-v1', '0.8.0')
+ require_jar('com.google.apis', 'google-api-services-bigquery', 'v2-rev377-1.23.0')
+ require_jar('com.google.auth', 'google-auth-library-credentials', '0.9.0')
+ require_jar('com.google.auth', 'google-auth-library-oauth2-http', '0.9.0')
+ require_jar('com.google.auto.value', 'auto-value', '1.4')
+ require_jar('com.google.cloud', 'google-cloud-core', '1.24.1')
+ require_jar('com.google.cloud', 'google-cloud-core-http', '1.24.1')
+ require_jar('com.google.code.findbugs', 'jsr305', '3.0.1')
+ require_jar('com.google.code.gson', 'gson', '2.7')
+ require_jar('com.google.errorprone', 'error_prone_annotations', '2.2.0')
+ require_jar('com.google.guava', 'guava', '20.0')
+ require_jar('com.google.http-client', 'google-http-client', '1.23.0')
+ require_jar('com.google.http-client', 'google-http-client-appengine', '1.23.0')
+ require_jar('com.google.http-client', 'google-http-client-jackson', '1.23.0')
+ require_jar('com.google.http-client', 'google-http-client-jackson2', '1.23.0')
+ require_jar('com.google.oauth-client', 'google-oauth-client', '1.23.0')
+ require_jar('com.google.protobuf', 'protobuf-java', '3.5.1')
+ require_jar('com.google.protobuf', 'protobuf-java-util', '3.5.1')
+ require_jar('commons-codec', 'commons-codec', '1.9')
+ require_jar('commons-logging', 'commons-logging', '1.2')
+ require_jar('io.grpc', 'grpc-context', '1.9.0')
+ require_jar('io.opencensus', 'opencensus-api', '0.11.1')
+ require_jar('io.opencensus', 'opencensus-contrib-http-util', '0.11.1')
+ require_jar('joda-time', 'joda-time', '2.9.2')
+ require_jar('org.apache.httpcomponents', 'httpclient', '4.5.2')
+ require_jar('org.apache.httpcomponents', 'httpcore', '4.4.4')
+ require_jar('org.codehaus.jackson', 'jackson-core-asl', '1.9.11')
+ require_jar('org.threeten', 'threetenbp', '1.3.3')
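
This generated file registers every vendored jar (listed under data/vendor/jar-dependencies above) with jar_dependencies. As a rough, illustrative sketch that is not part of the package, requiring it from JRuby is what makes the Java BigQuery classes used in the files below resolvable:

    # Illustrative only: load the vendored jars onto the JRuby classpath,
    # after which the Java classes can be imported by name.
    require 'logstash-output-google_bigquery_jars'
    java_import 'com.google.cloud.bigquery.BigQueryOptions'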
data/lib/logstash/outputs/bigquery/batcher.rb
@@ -0,0 +1,82 @@
+ require 'thread'
+ require 'java'
+ require 'logstash-output-google_bigquery_jars.rb'
+
+ module LogStash
+   module Outputs
+     module BigQuery
+       # Batcher is a queue that bundles messages in batches based on their
+       # size in bytes or count. It's used to provide guarantees around
+       # maximum data loss due to a fault while maintaining good upload
+       # throughput.
+       class Batcher
+         include_package 'java.util.concurrent.locks'
+
+         def initialize(max_length, max_bytes)
+           @lock = ReentrantReadWriteLock.new
+           @max_length = max_length
+           @max_bytes = max_bytes
+
+           clear
+         end
+
+         # enqueue_push calls enqueue and if a batch is ready to go pushes it to
+         # the provided queue.
+         def enqueue_push(message, queue)
+           batch = enqueue message
+
+           queue << batch unless batch.nil?
+         end
+
+         # enqueue adds a message to the batch. If the batch is ready to be sent
+         # out the internal state is reset and the array of messages is both
+         # yielded and returned.
+         # Otherwise nil is returned.
+         def enqueue(message)
+           @lock.write_lock.lock
+
+           begin
+             is_flush_request = message.nil?
+
+             unless is_flush_request
+               @batch_size_bytes += message.length
+               @batch << message
+             end
+
+             length_met = @batch.length >= @max_length
+             size_met = @batch_size_bytes >= @max_bytes
+
+             if is_flush_request || length_met || size_met
+               orig = @batch
+               clear
+
+               yield(orig) if block_given?
+               return orig
+             end
+
+             nil
+           ensure
+             @lock.write_lock.unlock
+           end
+         end
+
+         # removes all elements from the batch
+         def clear
+           @lock.write_lock.lock
+           @batch = []
+           @batch_size_bytes = 0
+           @lock.write_lock.unlock
+         end
+
+         def empty?
+           @lock.read_lock.lock
+           begin
+             @batch.empty? && @batch_size_bytes.zero?
+           ensure
+             @lock.read_lock.unlock
+           end
+         end
+       end
+     end
+   end
+ end
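
For orientation, a minimal usage sketch of Batcher (illustrative only, not part of the gem): a batch is handed back once the count or byte threshold is crossed, and enqueuing nil acts as a flush request.

    require 'logstash/outputs/bigquery/batcher'

    batcher = LogStash::Outputs::BigQuery::Batcher.new(2, 1_000_000)
    ready   = Queue.new

    batcher.enqueue_push('{"status":200}', ready)   # below both limits: nothing pushed
    batcher.enqueue_push('{"status":500}', ready)   # length limit reached: batch pushed
    ready.pop                                       # => ['{"status":200}', '{"status":500}']

    batcher.enqueue(nil) { |batch| ready << batch } # nil flushes whatever remains (here an empty batch)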
data/lib/logstash/outputs/bigquery/schema.rb
@@ -0,0 +1,93 @@
+ require 'java'
+ require 'logstash-output-google_bigquery_jars.rb'
+ require 'logstash/json'
+
+ module LogStash
+   module Outputs
+     module BigQuery
+       class Schema
+         include_package 'com.google.cloud.bigquery'
+
+         # Converts a CSV schema or JSON schema into a BigQuery Java Schema.
+         def self.parse_csv_or_json(csv_schema, json_schema)
+           csv_blank = csv_schema.nil? || csv_schema.empty?
+           json_blank = json_schema.nil? || json_schema.empty?
+
+           unless csv_blank ^ json_blank
+             raise ArgumentError.new("You must provide either json_schema OR csv_schema. csv: #{csv_schema}, json: #{json_schema}")
+           end
+
+           if csv_blank
+             schema = json_schema
+           else
+             schema = parse_csv_schema csv_schema
+           end
+
+           self.hash_to_java_schema schema
+         end
+
+         # Converts a CSV of field:type pairs into the JSON style schema.
+         def self.parse_csv_schema(csv_schema)
+           require 'csv'
+
+           fields = []
+
+           CSV.parse(csv_schema.gsub('\"', '""')).flatten.each do |field|
+             raise ArgumentError.new('csv_schema must follow the format <field-name>:<field-type>') if field.nil?
+
+             temp = field.strip.split(':')
+
+             if temp.length != 2
+               raise ArgumentError.new('csv_schema must follow the format <field-name>:<field-type>')
+             end
+
+             fields << { 'name' => temp[0], 'type' => temp[1] }
+           end
+
+           # Check that we have at least one field in the schema
+           raise ArgumentError.new('csv_schema must contain at least one field') if fields.empty?
+
+           { 'fields' => fields }
+         end
+
+         # Converts the Ruby hash style schema into a BigQuery Java schema
+         def self.hash_to_java_schema(schema_hash)
+           field_list = self.parse_field_list schema_hash['fields']
+           com.google.cloud.bigquery.Schema.of field_list
+         end
+
+         # Converts a list of fields into a BigQuery Java FieldList
+         def self.parse_field_list(fields)
+           fieldslist = fields.map {|field| self.parse_field field}
+
+           FieldList.of fieldslist
+         end
+
+         # Converts a single field definition into a BigQuery Java Field object.
+         # This includes any nested fields as well.
+         def self.parse_field(field)
+           type = LegacySQLTypeName.valueOfStrict(field['type'])
+           name = field['name']
+
+           if field.has_key? 'fields'
+             sub_fields = self.parse_field_list field['fields']
+             builder = Field.newBuilder(name, type, sub_fields)
+           else
+             builder = Field.newBuilder(name, type)
+           end
+
+           if field.has_key? 'description'
+             builder = builder.setDescription(field['description'])
+           end
+
+           if field.has_key? 'mode'
+             mode = Field::Mode.valueOf field['mode']
+             builder = builder.setMode(mode)
+           end
+
+           builder.build
+         end
+       end
+     end
+   end
+ end
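
An illustrative sketch (not from the gem) of what the Schema helper above produces: the plugin's csv_schema string is turned into the com.google.cloud.bigquery.Schema object that the streaming client expects.

    require 'logstash/outputs/bigquery/schema'

    # Exactly one of csv_schema / json_schema may be given; here the CSV form.
    schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json(
      'path:STRING,status:INTEGER,score:FLOAT', nil
    )
    schema.getFields.size # => 3 Java Field objects typed via LegacySQLTypeName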
data/lib/logstash/outputs/bigquery/streamclient.rb
@@ -0,0 +1,120 @@
+ require 'java'
+ require 'openssl'
+ require 'logstash-output-google_bigquery_jars.rb'
+
+ module LogStash
+   module Outputs
+     module BigQuery
+       # NOTE: This file uses _a lot_ of Java. Please keep the Java looking
+       # java-y so it's easy to tell the languages apart.
+
+       include_package 'com.google.cloud.bigquery'
+
+       # StreamingClient supports shipping data to BigQuery using streams.
+       class StreamingClient
+         def initialize(json_key_file, project_id, logger)
+           @logger = logger
+
+           @bigquery = initialize_google_client json_key_file, project_id
+         end
+
+         def table_exists?(dataset, table)
+           api_debug('Checking if table exists', dataset, table)
+           tbl = @bigquery.getTable dataset, table
+
+           !tbl.nil?
+         end
+
+         # Creates a table with the given name in the given dataset
+         def create_table(dataset, table, schema)
+           api_debug('Creating table', dataset, table)
+           table_id = com.google.cloud.bigquery.TableId.of dataset, table
+
+           table_defn = com.google.cloud.bigquery.StandardTableDefinition.of schema
+           table_info = com.google.cloud.bigquery.TableInfo.newBuilder(table_id, table_defn).build()
+
+           @bigquery.create table_info
+         end
+
+         def append(dataset, table, rows, ignore_unknown)
+           api_debug("Appending #{rows.length} rows", dataset, table)
+
+           request = build_append_request dataset, table, rows, ignore_unknown
+
+           response = @bigquery.insertAll request
+           return true unless response.hasErrors
+
+           response.getInsertErrors().entrySet().each{ |entry|
+             key = entry.getKey
+             errors = entry.getValue
+
+             errors.each{|bqError|
+               @logger.warn('Error while inserting',
+                            key: key,
+                            location: bqError.getLocation,
+                            message: bqError.getMessage,
+                            reason: bqError.getReason)
+             }
+           }
+
+           false
+         end
+
+         def build_append_request(dataset, table, rows, ignore_unknown)
+           request = com.google.cloud.bigquery.InsertAllRequest.newBuilder dataset, table
+           request.setIgnoreUnknownValues ignore_unknown
+
+           rows.each { |serialized_row|
+             # deserialize rows into Java maps
+             deserialized = LogStash::Json.load serialized_row
+             request.addRow deserialized
+           }
+
+           request.build
+         end
+
+         # Returns an error message if the key file is invalid, nil otherwise.
+         def get_key_file_error(json_key_file)
+           return nil if json_key_file.nil? || json_key_file == ''
+
+           abs = ::File.absolute_path json_key_file
+           unless abs == json_key_file
+             return "json_key_file must be an absolute path: #{json_key_file}"
+           end
+
+           unless ::File.exist? json_key_file
+             return "json_key_file does not exist: #{json_key_file}"
+           end
+
+           nil
+         end
+
+         def initialize_google_client(json_key_file, project_id)
+           @logger.info("Initializing Google API client #{project_id} key: #{json_key_file}")
+           err = get_key_file_error json_key_file
+           raise err unless err.nil?
+
+           if json_key_file.nil? || json_key_file.empty?
+             return com.google.cloud.bigquery.BigQueryOptions.getDefaultInstance().getService()
+           end
+
+           # TODO: set User-Agent
+
+           key_file = java.io.FileInputStream.new json_key_file
+           credentials = com.google.auth.oauth2.ServiceAccountCredentials.fromStream key_file
+           return com.google.cloud.bigquery.BigQueryOptions.newBuilder()
+                     .setCredentials(credentials)
+                     .setProjectId(project_id)
+                     .build()
+                     .getService()
+         end
+
+         private
+
+         def api_debug(message, dataset, table)
+           @logger.debug(message, dataset: dataset, table: table)
+         end
+       end
+     end
+   end
+ end
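
A hedged sketch of driving StreamingClient directly, outside the plugin. Everything here is a placeholder: the key path must point at a real service-account key, 'my-gcp-project' and the 'logs' dataset must exist, and the logger only needs to accept the keyword-style calls the client makes (Logstash's own logger does).

    require 'logstash/json'
    require 'logstash/outputs/bigquery/schema'
    require 'logstash/outputs/bigquery/streamclient'

    # Minimal structured logger standing in for the plugin's @logger.
    log = Class.new do
      def debug(msg, **kv); puts "DEBUG #{msg} #{kv}"; end
      def info(msg, **kv);  puts "INFO  #{msg} #{kv}"; end
      def warn(msg, **kv);  puts "WARN  #{msg} #{kv}"; end
    end.new

    client = LogStash::Outputs::BigQuery::StreamingClient.new('/path/to/key.json', 'my-gcp-project', log)
    schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json('status:INTEGER', nil)

    client.create_table('logs', 'demo_table', schema) unless client.table_exists?('logs', 'demo_table')

    # append returns false if BigQuery rejected any row; rejections are logged as warnings.
    client.append('logs', 'demo_table', ['{"status":200}'], true)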
data/lib/logstash/outputs/google_bigquery.rb
@@ -0,0 +1,280 @@
+ require 'logstash/outputs/base'
+ require 'logstash/namespace'
+ require 'logstash/json'
+ require 'logstash/outputs/bigquery/streamclient'
+ require 'logstash/outputs/bigquery/batcher'
+ require 'logstash/outputs/bigquery/schema'
+
+ require 'time'
+ require 'fileutils'
+
+ #
+ # === Summary
+ #
+ # This plugin uploads events to Google BigQuery using the streaming API
+ # so data can become available nearly immediately.
+ #
+ # You can configure it to flush periodically, after N events or after
+ # a certain amount of data is ingested.
+ #
+ # === Environment Configuration
+ #
+ # You must enable BigQuery on your Google Cloud account and create a dataset to
+ # hold the tables this plugin generates.
+ #
+ # You must also grant the service account this plugin uses access to
+ # the dataset.
+ #
+ # You can use https://www.elastic.co/guide/en/logstash/current/event-dependent-configuration.html[Logstash conditionals]
+ # and multiple configuration blocks to upload events with different structures.
+ #
+ # === Usage
+ # This is an example of a Logstash config:
+ #
+ # [source,ruby]
+ # --------------------------
+ # output {
+ #    google_bigquery {
+ #      project_id => "folkloric-guru-278" (required)
+ #      dataset => "logs" (required)
+ #      csv_schema => "path:STRING,status:INTEGER,score:FLOAT" (required) <1>
+ #      json_key_file => "/path/to/key.json" (optional) <2>
+ #      error_directory => "/tmp/bigquery-errors" (required)
+ #      date_pattern => "%Y-%m-%dT%H:00" (optional)
+ #      flush_interval_secs => 30 (optional)
+ #    }
+ # }
+ # --------------------------
+ #
+ # <1> Specify either a csv_schema or a json_schema.
+ #
+ # <2> If the key is not used, then the plugin tries to find
+ # https://cloud.google.com/docs/authentication/production[Application Default Credentials]
+ #
+ # === Considerations
+ #
+ # * There is a small fee to insert data into BigQuery using the streaming API
+ # * This plugin buffers events in-memory, so make sure the flush configurations are appropriate
+ #   for your use-case and consider using
+ #   https://www.elastic.co/guide/en/logstash/current/persistent-queues.html[Logstash Persistent Queues]
+ #
+ # === Additional Resources
+ #
+ # * https://cloud.google.com/bigquery/[BigQuery Introduction]
+ # * https://cloud.google.com/bigquery/docs/schemas[BigQuery Schema Formats and Types]
+ # * https://cloud.google.com/bigquery/pricing[Pricing Information]
+ #
+ class LogStash::Outputs::GoogleBigQuery < LogStash::Outputs::Base
+   config_name 'google_bigquery'
+
+   concurrency :single
+
+   # Google Cloud Project ID (number, not Project Name!).
+   config :project_id, validate: :string, required: true
+
+   # The BigQuery dataset the tables for the events will be added to.
+   config :dataset, validate: :string, required: true
+
+   # BigQuery table ID prefix to be used when creating new tables for log data.
+   # Table name will be `<table_prefix><table_separator><date>`
+   config :table_prefix, validate: :string, default: 'logstash'
+
+   # BigQuery table separator to be added between the table_prefix and the
+   # date suffix.
+   config :table_separator, validate: :string, default: '_'
+
+   # Schema for log data. It must follow the format `name1:type1(,name2:type2)*`.
+   # For example, `path:STRING,status:INTEGER,score:FLOAT`.
+   config :csv_schema, validate: :string, required: false, default: nil
+
+   # Schema for log data as a hash.
+   # These can include nested records, descriptions, and modes.
+   #
+   # Example:
+   # [source,ruby]
+   # --------------------------
+   # json_schema => {
+   #   fields => [{
+   #     name => "endpoint"
+   #     type => "STRING"
+   #     description => "Request route"
+   #   }, {
+   #     name => "status"
+   #     type => "INTEGER"
+   #     mode => "NULLABLE"
+   #   }, {
+   #     name => "params"
+   #     type => "RECORD"
+   #     mode => "REPEATED"
+   #     fields => [{
+   #       name => "key"
+   #       type => "STRING"
+   #     }, {
+   #       name => "value"
+   #       type => "STRING"
+   #     }]
+   #   }]
+   # }
+   # --------------------------
+   config :json_schema, validate: :hash, required: false, default: nil
+
+   # Indicates if BigQuery should ignore values that are not represented in the table schema.
+   # If true, the extra values are discarded.
+   # If false, BigQuery will reject the records with extra fields and the job will fail.
+   # The default value is false.
+   #
+   # NOTE: You may want to add a Logstash filter like the following to remove common fields it adds:
+   # [source,ruby]
+   # ----------------------------------
+   # mutate {
+   #     remove_field => ["@version","@timestamp","path","host","type", "message"]
+   # }
+   # ----------------------------------
+   config :ignore_unknown_values, validate: :boolean, default: false
+
+   # Time pattern for the BigQuery table, defaults to hourly tables.
+   # Must be a Time.strftime pattern: www.ruby-doc.org/core-2.0/Time.html#method-i-strftime
+   config :date_pattern, validate: :string, default: '%Y-%m-%dT%H:00'
+
+   # If logstash is running within Google Compute Engine, the plugin will use
+   # GCE's Application Default Credentials. Outside of GCE, you will need to
+   # specify a Service Account JSON key file.
+   config :json_key_file, validate: :string, required: false
+
+   # The number of messages to upload at a single time. (< 1000, default: 128)
+   config :batch_size, validate: :number, required: true, default: 128
+
+   # An approximate number of bytes to upload as part of a batch. Default: 1MB
+   config :batch_size_bytes, validate: :number, required: true, default: 1_000_000
+
+   # Uploads all data this often even if other upload criteria aren't met. Default: 5s
+   config :flush_interval_secs, validate: :number, required: true, default: 5
+
+   # The location to store events that could not be uploaded due to errors.
+   # Consider using an additional Logstash input to pipe the contents of
+   # these to an alert platform so you can manually fix the events.
+   #
+   # Or use https://cloud.google.com/storage/docs/gcs-fuse[GCS FUSE] to
+   # transparently upload to a GCS bucket.
+   #
+   # File names follow the pattern `[table name]-[UNIX timestamp].log`
+   config :error_directory, validate: :string, required: true, default: '/tmp/bigquery_errors'
+
+   # The following configuration options still exist only to alert users that are still using them
+   config :uploader_interval_secs, validate: :number, deprecated: 'No longer used.'
+   config :deleter_interval_secs, validate: :number, deprecated: 'No longer used.'
+   config :key_path, validate: :string, obsolete: 'Use json_key_file or ADC instead.'
+   config :key_password, validate: :string, deprecated: 'No longer needed with json_key_file or ADC.'
+   config :service_account, validate: :string, deprecated: 'No longer needed with json_key_file or ADC.'
+   config :temp_file_prefix, validate: :string, deprecated: 'No longer used.'
+   config :temp_directory, validate: :string, deprecated: 'No longer used.'
+
+   public
+
+   def register
+     @logger.debug('Registering plugin')
+
+     @schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json @csv_schema, @json_schema
+     @bq_client = LogStash::Outputs::BigQuery::StreamingClient.new @json_key_file, @project_id, @logger
+     @batcher = LogStash::Outputs::BigQuery::Batcher.new @batch_size, @batch_size_bytes
+
+     init_batcher_flush_thread
+   end
+
+   # Method called for each log event. It adds the event to the in-memory batch,
+   # which is published depending on the flush configuration.
+   def receive(event)
+     @logger.debug('BQ: receive method called', event: event)
+
+     # Property names MUST NOT have @ in them
+     message = replace_at_keys event.to_hash
+
+     # Message must be written as json
+     encoded_message = LogStash::Json.dump message
+
+     @batcher.enqueue(encoded_message) { |batch| publish(batch) }
+   end
+
+   def get_table_name(time=nil)
+     time ||= Time.now
+
+     str_time = time.strftime(@date_pattern)
+     table_id = @table_prefix + @table_separator + str_time
+
+     # BQ does not accept anything other than alphanumeric and _
+     # Ref: https://developers.google.com/bigquery/browser-tool-quickstart?hl=en
+     table_id.tr!(':-', '_')
+
+     table_id
+   end
+
+   # Remove @ symbols in hash keys
+   def replace_at_keys(event)
+     return event unless event.is_a? Hash
+
+     out = {}
+
+     event.each do |key, value|
+       new_key = key.to_s.delete '@'
+       out[new_key] = replace_at_keys value
+     end
+
+     out
+   end
+
+   # publish sends messages to a BigQuery table immediately
+   def publish(messages)
+     begin
+       return if messages.nil? || messages.empty?
+
+       table = get_table_name
+       @logger.info("Publishing #{messages.length} messages to #{table}")
+
+       create_table_if_not_exists table
+
+       successful = @bq_client.append @dataset, table, messages, @ignore_unknown_values
+       write_to_errors_file(messages, table) unless successful
+     rescue StandardError => e
+       @logger.error 'Error uploading data.', :exception => e
+
+       write_to_errors_file(messages, table)
+     end
+   end
+
+   def create_table_if_not_exists table
+     begin
+       return nil if @bq_client.table_exists? @dataset, table
+       @bq_client.create_table(@dataset, table, @schema)
+
+     rescue StandardError => e
+       @logger.error 'Error creating table.', :exception => e
+     end
+   end
+
+   def write_to_errors_file(messages, table)
+     begin
+       FileUtils.mkdir_p @error_directory
+
+       t = Time.new
+       error_file_name = "#{table}-#{t.to_i}.log"
+       error_file_path = ::File.join(@error_directory, error_file_name)
+       @logger.info "Problem data is being stored in: #{error_file_path}"
+
+       File.open(error_file_path, 'w') do |f|
+         messages.each { |message| f.puts message }
+       end
+     rescue StandardError => e
+       @logger.error 'Error creating error file.', :exception => e, :messages => messages, :table => table
+     end
+   end
+
+   def init_batcher_flush_thread
+     @flush_thread = Thread.new do
+       loop do
+         sleep @flush_interval_secs
+
+         @batcher.enqueue(nil) { |batch| publish(batch) }
+       end
+     end
+   end
+ end