logstash-output-google_bigquery 4.0.0-java
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +71 -0
- data/CONTRIBUTORS +15 -0
- data/Gemfile +11 -0
- data/LICENSE +13 -0
- data/NOTICE.TXT +5 -0
- data/README.md +100 -0
- data/docs/index.asciidoc +348 -0
- data/lib/logstash-output-google_bigquery_jars.rb +38 -0
- data/lib/logstash/outputs/bigquery/batcher.rb +82 -0
- data/lib/logstash/outputs/bigquery/schema.rb +93 -0
- data/lib/logstash/outputs/bigquery/streamclient.rb +120 -0
- data/lib/logstash/outputs/google_bigquery.rb +280 -0
- data/logstash-output-google_bigquery.gemspec +31 -0
- data/spec/outputs/bigquery/batcher_spec.rb +110 -0
- data/spec/outputs/bigquery/schema_spec.rb +101 -0
- data/spec/outputs/google_bigquery_spec.rb +154 -0
- data/vendor/jar-dependencies/com/fasterxml/jackson/core/jackson-core/2.1.3/jackson-core-2.1.3.jar +0 -0
- data/vendor/jar-dependencies/com/google/api-client/google-api-client/1.23.0/google-api-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/api-common/1.5.0/api-common-1.5.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/gax-httpjson/0.40.0/gax-httpjson-0.40.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/gax/1.23.0/gax-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/grpc/proto-google-common-protos/1.7.0/proto-google-common-protos-1.7.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/grpc/proto-google-iam-v1/0.8.0/proto-google-iam-v1-0.8.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/apis/google-api-services-bigquery/v2-rev377-1.23.0/google-api-services-bigquery-v2-rev377-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auth/google-auth-library-credentials/0.9.0/google-auth-library-credentials-0.9.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auth/google-auth-library-oauth2-http/0.9.0/google-auth-library-oauth2-http-0.9.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auto/value/auto-value/1.4/auto-value-1.4.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-bigquery/1.24.1/google-cloud-bigquery-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-core-http/1.24.1/google-cloud-core-http-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-core/1.24.1/google-cloud-core-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/code/findbugs/jsr305/3.0.1/jsr305-3.0.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/code/gson/gson/2.7/gson-2.7.jar +0 -0
- data/vendor/jar-dependencies/com/google/errorprone/error_prone_annotations/2.2.0/error_prone_annotations-2.2.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/guava/guava/20.0/guava-20.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-appengine/1.23.0/google-http-client-appengine-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-jackson/1.23.0/google-http-client-jackson-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-jackson2/1.23.0/google-http-client-jackson2-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client/1.23.0/google-http-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/oauth-client/google-oauth-client/1.23.0/google-oauth-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/protobuf/protobuf-java-util/3.5.1/protobuf-java-util-3.5.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/protobuf/protobuf-java/3.5.1/protobuf-java-3.5.1.jar +0 -0
- data/vendor/jar-dependencies/commons-codec/commons-codec/1.9/commons-codec-1.9.jar +0 -0
- data/vendor/jar-dependencies/commons-logging/commons-logging/1.2/commons-logging-1.2.jar +0 -0
- data/vendor/jar-dependencies/io/grpc/grpc-context/1.9.0/grpc-context-1.9.0.jar +0 -0
- data/vendor/jar-dependencies/io/opencensus/opencensus-api/0.11.1/opencensus-api-0.11.1.jar +0 -0
- data/vendor/jar-dependencies/io/opencensus/opencensus-contrib-http-util/0.11.1/opencensus-contrib-http-util-0.11.1.jar +0 -0
- data/vendor/jar-dependencies/joda-time/joda-time/2.9.2/joda-time-2.9.2.jar +0 -0
- data/vendor/jar-dependencies/org/apache/httpcomponents/httpclient/4.5.2/httpclient-4.5.2.jar +0 -0
- data/vendor/jar-dependencies/org/apache/httpcomponents/httpcore/4.4.4/httpcore-4.4.4.jar +0 -0
- data/vendor/jar-dependencies/org/codehaus/jackson/jackson-core-asl/1.9.11/jackson-core-asl-1.9.11.jar +0 -0
- data/vendor/jar-dependencies/org/threeten/threetenbp/1.3.3/threetenbp-1.3.3.jar +0 -0
- metadata +178 -0

data/lib/logstash-output-google_bigquery_jars.rb
@@ -0,0 +1,38 @@
+# AUTOGENERATED BY THE GRADLE SCRIPT. DO NOT EDIT.
+
+require 'jar_dependencies'
+require_jar('com.google.cloud', 'google-cloud-bigquery', '1.24.1')
+require_jar('com.fasterxml.jackson.core', 'jackson-core', '2.1.3')
+require_jar('com.google.api', 'api-common', '1.5.0')
+require_jar('com.google.api-client', 'google-api-client', '1.23.0')
+require_jar('com.google.api', 'gax', '1.23.0')
+require_jar('com.google.api', 'gax-httpjson', '0.40.0')
+require_jar('com.google.api.grpc', 'proto-google-common-protos', '1.7.0')
+require_jar('com.google.api.grpc', 'proto-google-iam-v1', '0.8.0')
+require_jar('com.google.apis', 'google-api-services-bigquery', 'v2-rev377-1.23.0')
+require_jar('com.google.auth', 'google-auth-library-credentials', '0.9.0')
+require_jar('com.google.auth', 'google-auth-library-oauth2-http', '0.9.0')
+require_jar('com.google.auto.value', 'auto-value', '1.4')
+require_jar('com.google.cloud', 'google-cloud-core', '1.24.1')
+require_jar('com.google.cloud', 'google-cloud-core-http', '1.24.1')
+require_jar('com.google.code.findbugs', 'jsr305', '3.0.1')
+require_jar('com.google.code.gson', 'gson', '2.7')
+require_jar('com.google.errorprone', 'error_prone_annotations', '2.2.0')
+require_jar('com.google.guava', 'guava', '20.0')
+require_jar('com.google.http-client', 'google-http-client', '1.23.0')
+require_jar('com.google.http-client', 'google-http-client-appengine', '1.23.0')
+require_jar('com.google.http-client', 'google-http-client-jackson', '1.23.0')
+require_jar('com.google.http-client', 'google-http-client-jackson2', '1.23.0')
+require_jar('com.google.oauth-client', 'google-oauth-client', '1.23.0')
+require_jar('com.google.protobuf', 'protobuf-java', '3.5.1')
+require_jar('com.google.protobuf', 'protobuf-java-util', '3.5.1')
+require_jar('commons-codec', 'commons-codec', '1.9')
+require_jar('commons-logging', 'commons-logging', '1.2')
+require_jar('io.grpc', 'grpc-context', '1.9.0')
+require_jar('io.opencensus', 'opencensus-api', '0.11.1')
+require_jar('io.opencensus', 'opencensus-contrib-http-util', '0.11.1')
+require_jar('joda-time', 'joda-time', '2.9.2')
+require_jar('org.apache.httpcomponents', 'httpclient', '4.5.2')
+require_jar('org.apache.httpcomponents', 'httpcore', '4.4.4')
+require_jar('org.codehaus.jackson', 'jackson-core-asl', '1.9.11')
+require_jar('org.threeten', 'threetenbp', '1.3.3')

data/lib/logstash/outputs/bigquery/batcher.rb
@@ -0,0 +1,82 @@
+require 'thread'
+require 'java'
+require 'logstash-output-google_bigquery_jars.rb'
+
+module LogStash
+  module Outputs
+    module BigQuery
+      # Batcher is a queue that bundles messages in batches based on their
+      # size in bytes or count. It's used to provide guarantees around
+      # maximum data loss due to a fault while maintaining good upload
+      # throughput.
+      class Batcher
+        include_package 'java.util.concurrent.locks'
+
+        def initialize(max_length, max_bytes)
+          @lock = ReentrantReadWriteLock.new
+          @max_length = max_length
+          @max_bytes = max_bytes
+
+          clear
+        end
+
+        # enqueue_push calls enqueue and if a batch is ready to go pushes it to
+        # the provided queue.
+        def enqueue_push(message, queue)
+          batch = enqueue message
+
+          queue << batch unless batch.nil?
+        end
+
+        # enqueue adds a message to the batch. If the batch is ready to be sent
+        # out the internal state is reset and the array of messages is both
+        # yielded and returned.
+        # Otherwise nil is returned.
+        def enqueue(message)
+          @lock.write_lock.lock
+
+          begin
+            is_flush_request = message.nil?
+
+            unless is_flush_request
+              @batch_size_bytes += message.length
+              @batch << message
+            end
+
+            length_met = @batch.length >= @max_length
+            size_met = @batch_size_bytes >= @max_bytes
+
+            if is_flush_request || length_met || size_met
+              orig = @batch
+              clear
+
+              yield(orig) if block_given?
+              return orig
+            end
+
+            nil
+          ensure
+            @lock.write_lock.unlock
+          end
+        end
+
+        # removes all elements from the batch
+        def clear
+          @lock.write_lock.lock
+          @batch = []
+          @batch_size_bytes = 0
+          @lock.write_lock.unlock
+        end
+
+        def empty?
+          @lock.read_lock.lock
+          begin
+            @batch.empty? && @batch_size_bytes.zero?
+          ensure
+            @lock.read_lock.unlock
+          end
+        end
+      end
+    end
+  end
+end
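
For reference, a rough usage sketch of the Batcher above (not part of the diff; it assumes a JRuby runtime with this gem and its vendored jars on the load path, since batcher.rb loads Java lock classes):

require 'logstash/outputs/bigquery/batcher'

# Flush after 2 messages or 1 KiB of serialized data, whichever comes first.
batcher = LogStash::Outputs::BigQuery::Batcher.new(2, 1024)

batcher.enqueue('{"status":200}')          # => nil, batch not full yet
batch = batcher.enqueue('{"status":500}')  # => ['{"status":200}', '{"status":500}']

# A nil message is a flush request: whatever is buffered is yielded and returned.
batcher.enqueue('{"status":404}')
batcher.enqueue(nil) { |pending| puts pending.length }  # prints 1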

data/lib/logstash/outputs/bigquery/schema.rb
@@ -0,0 +1,93 @@
+require 'java'
+require 'logstash-output-google_bigquery_jars.rb'
+require 'logstash/json'
+
+module LogStash
+  module Outputs
+    module BigQuery
+      class Schema
+        include_package 'com.google.cloud.bigquery'
+
+        # Converts a CSV schema or JSON schema into a BigQuery Java Schema.
+        def self.parse_csv_or_json(csv_schema, json_schema)
+          csv_blank = csv_schema.nil? || csv_schema.empty?
+          json_blank = json_schema.nil? || json_schema.empty?
+
+          unless csv_blank ^ json_blank
+            raise ArgumentError.new("You must provide either json_schema OR csv_schema. csv: #{csv_schema}, json: #{json_schema}")
+          end
+
+          if csv_blank
+            schema = json_schema
+          else
+            schema = parse_csv_schema csv_schema
+          end
+
+          self.hash_to_java_schema schema
+        end
+
+        # Converts a CSV of field:type pairs into the JSON style schema.
+        def self.parse_csv_schema(csv_schema)
+          require 'csv'
+
+          fields = []
+
+          CSV.parse(csv_schema.gsub('\"', '""')).flatten.each do |field|
+            raise ArgumentError.new('csv_schema must follow the format <field-name>:<field-type>') if field.nil?
+
+            temp = field.strip.split(':')
+
+            if temp.length != 2
+              raise ArgumentError.new('csv_schema must follow the format <field-name>:<field-type>')
+            end
+
+            fields << { 'name' => temp[0], 'type' => temp[1] }
+          end
+
+          # Check that we have at least one field in the schema
+          raise ArgumentError.new('csv_schema must contain at least one field') if fields.empty?
+
+          { 'fields' => fields }
+        end
+
+        # Converts the Ruby hash style schema into a BigQuery Java schema
+        def self.hash_to_java_schema(schema_hash)
+          field_list = self.parse_field_list schema_hash['fields']
+          com.google.cloud.bigquery.Schema.of field_list
+        end
+
+        # Converts a list of fields into a BigQuery Java FieldList
+        def self.parse_field_list(fields)
+          fieldslist = fields.map {|field| self.parse_field field}
+
+          FieldList.of fieldslist
+        end
+
+        # Converts a single field definition into a BigQuery Java Field object.
+        # This includes any nested fields as well.
+        def self.parse_field(field)
+          type = LegacySQLTypeName.valueOfStrict(field['type'])
+          name = field['name']
+
+          if field.has_key? 'fields'
+            sub_fields = self.parse_field_list field['fields']
+            builder = Field.newBuilder(name, type, sub_fields)
+          else
+            builder = Field.newBuilder(name, type)
+          end
+
+          if field.has_key? 'description'
+            builder = builder.setDescription(field['description'])
+          end
+
+          if field.has_key? 'mode'
+            mode = Field::Mode.valueOf field['mode']
+            builder = builder.setMode(mode)
+          end
+
+          builder.build
+        end
+      end
+    end
+  end
+end
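
A rough sketch of how this Schema helper is called (not part of the diff; assumes a JRuby runtime with the gem installed). Exactly one of the two arguments may be set, and either form returns a com.google.cloud.bigquery.Schema Java object:

require 'logstash/outputs/bigquery/schema'

# CSV form: comma-separated field:type pairs.
csv_based = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json('path:STRING,status:INTEGER', nil)

# Hash form: supports nested RECORD fields, descriptions and modes.
hash_based = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json(nil, {
  'fields' => [
    { 'name' => 'path', 'type' => 'STRING', 'description' => 'Request route' },
    { 'name' => 'status', 'type' => 'INTEGER', 'mode' => 'NULLABLE' }
  ]
})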

data/lib/logstash/outputs/bigquery/streamclient.rb
@@ -0,0 +1,120 @@
+require 'java'
+require 'openssl'
+require 'logstash-output-google_bigquery_jars.rb'
+
+module LogStash
+  module Outputs
+    module BigQuery
+      # NOTE: This file uses _a lot_ of Java. Please keep the Java looking
+      # java-y so it's easy to tell the languages apart.
+
+      include_package 'com.google.cloud.bigquery'
+
+      # StreamingClient supports shipping data to BigQuery using streams.
+      class StreamingClient
+        def initialize(json_key_file, project_id, logger)
+          @logger = logger
+
+          @bigquery = initialize_google_client json_key_file, project_id
+        end
+
+        def table_exists?(dataset, table)
+          api_debug('Checking if table exists', dataset, table)
+          tbl = @bigquery.getTable dataset, table
+
+          !tbl.nil?
+        end
+
+        # Creates a table with the given name in the given dataset
+        def create_table(dataset, table, schema)
+          api_debug('Creating table', dataset, table)
+          table_id = com.google.cloud.bigquery.TableId.of dataset, table
+
+          table_defn = com.google.cloud.bigquery.StandardTableDefinition.of schema
+          table_info = com.google.cloud.bigquery.TableInfo.newBuilder(table_id, table_defn).build()
+
+          @bigquery.create table_info
+        end
+
+        def append(dataset, table, rows, ignore_unknown)
+          api_debug("Appending #{rows.length} rows", dataset, table)
+
+          request = build_append_request dataset, table, rows, ignore_unknown
+
+          response = @bigquery.insertAll request
+          return true unless response.hasErrors
+
+          response.getInsertErrors().entrySet().each{ |entry|
+            key = entry.getKey
+            errors = entry.getValue
+
+            errors.each{|bqError|
+              @logger.warn('Error while inserting',
+                           key: key,
+                           location: bqError.getLocation,
+                           message: bqError.getMessage,
+                           reason: bqError.getReason)
+            }
+          }
+
+          false
+        end
+
+        def build_append_request(dataset, table, rows, ignore_unknown)
+          request = com.google.cloud.bigquery.InsertAllRequest.newBuilder dataset, table
+          request.setIgnoreUnknownValues ignore_unknown
+
+          rows.each { |serialized_row|
+            # deserialize rows into Java maps
+            deserialized = LogStash::Json.load serialized_row
+            request.addRow deserialized
+          }
+
+          request.build
+        end
+
+        # raises an exception if the key file is invalid
+        def get_key_file_error(json_key_file)
+          return nil if json_key_file.nil? || json_key_file == ''
+
+          abs = ::File.absolute_path json_key_file
+          unless abs == json_key_file
+            return "json_key_file must be an absolute path: #{json_key_file}"
+          end
+
+          unless ::File.exist? json_key_file
+            return "json_key_file does not exist: #{json_key_file}"
+          end
+
+          nil
+        end
+
+        def initialize_google_client(json_key_file, project_id)
+          @logger.info("Initializing Google API client #{project_id} key: #{json_key_file}")
+          err = get_key_file_error json_key_file
+          raise err unless err.nil?
+
+          if json_key_file.nil? || json_key_file.empty?
+            return com.google.cloud.bigquery.BigQueryOptions.getDefaultInstance().getService()
+          end
+
+          # TODO: set User-Agent
+
+          key_file = java.io.FileInputStream.new json_key_file
+          credentials = com.google.auth.oauth2.ServiceAccountCredentials.fromStream key_file
+          return com.google.cloud.bigquery.BigQueryOptions.newBuilder()
+                     .setCredentials(credentials)
+                     .setProjectId(project_id)
+                     .build()
+                     .getService()
+        end
+
+        private
+
+        def api_debug(message, dataset, table)
+          @logger.debug(message, dataset: dataset, table: table)
+        end
+      end
+    end
+  end
+end
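
A rough end-to-end sketch of the StreamingClient above (not part of the diff). It assumes a JRuby runtime with the gem installed, a BigQuery dataset named "logs" in a hypothetical project "my-project", and a service account key; the logger class here is a hypothetical stand-in for the structured Logstash logger the plugin normally passes in:

require 'logstash/outputs/bigquery/streamclient'
require 'logstash/outputs/bigquery/schema'

# Stand-in logger: must accept a message plus key/value pairs, like the Logstash logger.
class StdoutLogger
  [:debug, :info, :warn, :error].each do |level|
    define_method(level) { |msg, *fields| puts "#{level}: #{msg} #{fields.inspect}" }
  end
end

schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json('path:STRING,status:INTEGER', nil)

# A nil or empty key file falls back to Application Default Credentials.
client = LogStash::Outputs::BigQuery::StreamingClient.new('/path/to/key.json', 'my-project', StdoutLogger.new)

table = 'logstash_2018_01_01T00_00'
client.create_table('logs', table, schema) unless client.table_exists?('logs', table)

# Rows are JSON strings; append returns false if any row was rejected.
ok = client.append('logs', table, ['{"path":"/","status":200}'], false)
puts 'some rows failed, see warnings above' unless ok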

data/lib/logstash/outputs/google_bigquery.rb
@@ -0,0 +1,280 @@
+require 'logstash/outputs/base'
+require 'logstash/namespace'
+require 'logstash/json'
+require 'logstash/outputs/bigquery/streamclient'
+require 'logstash/outputs/bigquery/batcher'
+require 'logstash/outputs/bigquery/schema'
+
+require 'time'
+require 'fileutils'
+
+#
+# === Summary
+#
+# This plugin uploads events to Google BigQuery using the streaming API
+# so data can become available nearly immediately.
+#
+# You can configure it to flush periodically, after N events or after
+# a certain amount of data is ingested.
+#
+# === Environment Configuration
+#
+# You must enable BigQuery on your GCS account and create a dataset to
+# hold the tables this plugin generates.
+#
+# You must also grant the service account this plugin uses access to
+# the dataset.
+#
+# You can use https://www.elastic.co/guide/en/logstash/current/event-dependent-configuration.html[Logstash conditionals]
+# and multiple configuration blocks to upload events with different structures.
+#
+# === Usage
+# This is an example of logstash config:
+#
+# [source,ruby]
+# --------------------------
+# output {
+#   google_bigquery {
+#     project_id => "folkloric-guru-278" (required)
+#     dataset => "logs" (required)
+#     csv_schema => "path:STRING,status:INTEGER,score:FLOAT" (required) <1>
+#     json_key_file => "/path/to/key.json" (optional) <2>
+#     error_directory => "/tmp/bigquery-errors" (required)
+#     date_pattern => "%Y-%m-%dT%H:00" (optional)
+#     flush_interval_secs => 30 (optional)
+#   }
+# }
+# --------------------------
+#
+# <1> Specify either a csv_schema or a json_schema.
+#
+# <2> If the key is not used, then the plugin tries to find
+#     https://cloud.google.com/docs/authentication/production[Application Default Credentials]
+#
+# === Considerations
+#
+# * There is a small fee to insert data into BigQuery using the streaming API
+# * This plugin buffers events in-memory, so make sure the flush configurations are appropriate
+#   for your use-case and consider using
+#   https://www.elastic.co/guide/en/logstash/current/persistent-queues.html[Logstash Persistent Queues]
+#
+# === Additional Resources
+#
+# * https://cloud.google.com/bigquery/[BigQuery Introduction]
+# * https://cloud.google.com/bigquery/docs/schemas[BigQuery Schema Formats and Types]
+# * https://cloud.google.com/bigquery/pricing[Pricing Information]
+#
+class LogStash::Outputs::GoogleBigQuery < LogStash::Outputs::Base
+  config_name 'google_bigquery'
+
+  concurrency :single
+
+  # Google Cloud Project ID (number, not Project Name!).
+  config :project_id, validate: :string, required: true
+
+  # The BigQuery dataset the tables for the events will be added to.
+  config :dataset, validate: :string, required: true
+
+  # BigQuery table ID prefix to be used when creating new tables for log data.
+  # Table name will be `<table_prefix><table_separator><date>`
+  config :table_prefix, validate: :string, default: 'logstash'
+
+  # BigQuery table separator to be added between the table_prefix and the
+  # date suffix.
+  config :table_separator, validate: :string, default: '_'
+
+  # Schema for log data. It must follow the format `name1:type1(,name2:type2)*`.
+  # For example, `path:STRING,status:INTEGER,score:FLOAT`.
+  config :csv_schema, validate: :string, required: false, default: nil
+
+  # Schema for log data as a hash.
+  # These can include nested records, descriptions, and modes.
+  #
+  # Example:
+  # [source,ruby]
+  # --------------------------
+  # json_schema => {
+  #   fields => [{
+  #     name => "endpoint"
+  #     type => "STRING"
+  #     description => "Request route"
+  #   }, {
+  #     name => "status"
+  #     type => "INTEGER"
+  #     mode => "NULLABLE"
+  #   }, {
+  #     name => "params"
+  #     type => "RECORD"
+  #     mode => "REPEATED"
+  #     fields => [{
+  #       name => "key"
+  #       type => "STRING"
+  #     }, {
+  #       name => "value"
+  #       type => "STRING"
+  #     }]
+  #   }]
+  # }
+  # --------------------------
+  config :json_schema, validate: :hash, required: false, default: nil
+
+  # Indicates if BigQuery should ignore values that are not represented in the table schema.
+  # If true, the extra values are discarded.
+  # If false, BigQuery will reject the records with extra fields and the job will fail.
+  # The default value is false.
+  #
+  # NOTE: You may want to add a Logstash filter like the following to remove common fields it adds:
+  # [source,ruby]
+  # ----------------------------------
+  # mutate {
+  #   remove_field => ["@version","@timestamp","path","host","type", "message"]
+  # }
+  # ----------------------------------
+  config :ignore_unknown_values, validate: :boolean, default: false
+
+  # Time pattern for BigQuery table, defaults to hourly tables.
+  # Must Time.strftime patterns: www.ruby-doc.org/core-2.0/Time.html#method-i-strftime
+  config :date_pattern, validate: :string, default: '%Y-%m-%dT%H:00'
+
+  # If logstash is running within Google Compute Engine, the plugin will use
+  # GCE's Application Default Credentials. Outside of GCE, you will need to
+  # specify a Service Account JSON key file.
+  config :json_key_file, validate: :string, required: false
+
+  # The number of messages to upload at a single time. (< 1000, default: 128)
+  config :batch_size, validate: :number, required: true, default: 128
+
+  # An approximate number of bytes to upload as part of a batch. Default: 1MB
+  config :batch_size_bytes, validate: :number, required: true, default: 1_000_000
+
+  # Uploads all data this often even if other upload criteria aren't met. Default: 5s
+  config :flush_interval_secs, validate: :number, required: true, default: 5
+
+  # The location to store events that could not be uploaded due to errors.
+  # Consider using an additional Logstash input to pipe the contents of
+  # these to an alert platform so you can manually fix the events.
+  #
+  # Or use https://cloud.google.com/storage/docs/gcs-fuse[GCS FUSE] to
+  # transparently upload to a GCS bucket.
+  #
+  # Files names follow the pattern `[table name]-[UNIX timestamp].log`
+  config :error_directory, validate: :string, required: true, default: '/tmp/bigquery_errors'
+
+  # The following configuration options still exist to alert users that are using them
+  config :uploader_interval_secs, validate: :number, deprecated: 'No longer used.'
+  config :deleter_interval_secs, validate: :number, deprecated: 'No longer used.'
+  config :key_path, validate: :string, obsolete: 'Use json_key_file or ADC instead.'
+  config :key_password, validate: :string, deprecated: 'No longer needed with json_key_file or ADC.'
+  config :service_account, validate: :string, deprecated: 'No longer needed with json_key_file or ADC.'
+  config :temp_file_prefix, validate: :string, deprecated: 'No longer used.'
+  config :temp_directory, validate: :string, deprecated: 'No longer used.'
+
+  public
+
+  def register
+    @logger.debug('Registering plugin')
+
+    @schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json @csv_schema, @json_schema
+    @bq_client = LogStash::Outputs::BigQuery::StreamingClient.new @json_key_file, @project_id, @logger
+    @batcher = LogStash::Outputs::BigQuery::Batcher.new @batch_size, @batch_size_bytes
+
+    init_batcher_flush_thread
+  end
+
+  # Method called for each log event. It writes the event to the current output
+  # file, flushing depending on flush interval configuration.
+  def receive(event)
+    @logger.debug('BQ: receive method called', event: event)
+
+    # Property names MUST NOT have @ in them
+    message = replace_at_keys event.to_hash
+
+    # Message must be written as json
+    encoded_message = LogStash::Json.dump message
+
+    @batcher.enqueue(encoded_message) { |batch| publish(batch) }
+  end
+
+  def get_table_name(time=nil)
+    time ||= Time.now
+
+    str_time = time.strftime(@date_pattern)
+    table_id = @table_prefix + @table_separator + str_time
+
+    # BQ does not accept anything other than alphanumeric and _
+    # Ref: https://developers.google.com/bigquery/browser-tool-quickstart?hl=en
+    table_id.tr!(':-', '_')
+
+    table_id
+  end
+
+  # Remove @ symbols in hash keys
+  def replace_at_keys(event)
+    return event unless event.is_a? Hash
+
+    out = {}
+
+    event.each do |key, value|
+      new_key = key.to_s.delete '@'
+      out[new_key] = replace_at_keys value
+    end
+
+    out
+  end
+
+  # publish sends messages to a BigQuery table immediately
+  def publish(messages)
+    begin
+      return if messages.nil? || messages.empty?
+
+      table = get_table_name
+      @logger.info("Publishing #{messages.length} messages to #{table}")
+
+      create_table_if_not_exists table
+
+      successful = @bq_client.append @dataset, table, messages, @ignore_unknown_values
+      write_to_errors_file(messages, table) unless successful
+    rescue StandardError => e
+      @logger.error 'Error uploading data.', :exception => e
+
+      write_to_errors_file(messages, table)
+    end
+  end
+
+  def create_table_if_not_exists table
+    begin
+      return nil if @bq_client.table_exists? @dataset, table
+      @bq_client.create_table(@dataset, table, @schema)
+
+    rescue StandardError => e
+      @logger.error 'Error creating table.', :exception => e
+    end
+  end
+
+  def write_to_errors_file(messages, table)
+    begin
+      FileUtils.mkdir_p @error_directory
+
+      t = Time.new
+      error_file_name = "#{table}-#{t.to_i}.log"
+      error_file_path = ::File.join(@error_directory, error_file_name)
+      @logger.info "Problem data is being stored in: #{error_file_path}"
+
+      File.open(error_file_path, 'w') do |f|
+        messages.each { |message| f.puts message }
+      end
+    rescue StandardError => e
+      @logger.error 'Error creating error file.', :exception => e, :messages => messages, :table => table
+    end
+  end
+
+  def init_batcher_flush_thread
+    @flush_thread = Thread.new do
+      loop do
+        sleep @flush_interval_secs
+
+        @batcher.enqueue(nil) { |batch| publish(batch) }
+      end
+    end
+  end
+end
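
As a plain-Ruby illustration of the table-naming logic in get_table_name above (not part of the diff; runnable on its own, with values mirroring the plugin defaults and an arbitrary example timestamp):

require 'time'

date_pattern    = '%Y-%m-%dT%H:00'   # plugin default
table_prefix    = 'logstash'         # plugin default
table_separator = '_'                # plugin default

table_id = table_prefix + table_separator + Time.utc(2018, 5, 7, 14, 30).strftime(date_pattern)
table_id.tr!(':-', '_')              # BigQuery table IDs allow only alphanumerics and '_'

puts table_id  # => logstash_2018_05_07T14_00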