logstash-output-google_bigquery 4.0.0-java
- checksums.yaml +7 -0
- data/CHANGELOG.md +71 -0
- data/CONTRIBUTORS +15 -0
- data/Gemfile +11 -0
- data/LICENSE +13 -0
- data/NOTICE.TXT +5 -0
- data/README.md +100 -0
- data/docs/index.asciidoc +348 -0
- data/lib/logstash-output-google_bigquery_jars.rb +38 -0
- data/lib/logstash/outputs/bigquery/batcher.rb +82 -0
- data/lib/logstash/outputs/bigquery/schema.rb +93 -0
- data/lib/logstash/outputs/bigquery/streamclient.rb +120 -0
- data/lib/logstash/outputs/google_bigquery.rb +280 -0
- data/logstash-output-google_bigquery.gemspec +31 -0
- data/spec/outputs/bigquery/batcher_spec.rb +110 -0
- data/spec/outputs/bigquery/schema_spec.rb +101 -0
- data/spec/outputs/google_bigquery_spec.rb +154 -0
- data/vendor/jar-dependencies/com/fasterxml/jackson/core/jackson-core/2.1.3/jackson-core-2.1.3.jar +0 -0
- data/vendor/jar-dependencies/com/google/api-client/google-api-client/1.23.0/google-api-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/api-common/1.5.0/api-common-1.5.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/gax-httpjson/0.40.0/gax-httpjson-0.40.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/gax/1.23.0/gax-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/grpc/proto-google-common-protos/1.7.0/proto-google-common-protos-1.7.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/api/grpc/proto-google-iam-v1/0.8.0/proto-google-iam-v1-0.8.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/apis/google-api-services-bigquery/v2-rev377-1.23.0/google-api-services-bigquery-v2-rev377-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auth/google-auth-library-credentials/0.9.0/google-auth-library-credentials-0.9.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auth/google-auth-library-oauth2-http/0.9.0/google-auth-library-oauth2-http-0.9.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/auto/value/auto-value/1.4/auto-value-1.4.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-bigquery/1.24.1/google-cloud-bigquery-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-core-http/1.24.1/google-cloud-core-http-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/cloud/google-cloud-core/1.24.1/google-cloud-core-1.24.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/code/findbugs/jsr305/3.0.1/jsr305-3.0.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/code/gson/gson/2.7/gson-2.7.jar +0 -0
- data/vendor/jar-dependencies/com/google/errorprone/error_prone_annotations/2.2.0/error_prone_annotations-2.2.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/guava/guava/20.0/guava-20.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-appengine/1.23.0/google-http-client-appengine-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-jackson/1.23.0/google-http-client-jackson-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client-jackson2/1.23.0/google-http-client-jackson2-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/http-client/google-http-client/1.23.0/google-http-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/oauth-client/google-oauth-client/1.23.0/google-oauth-client-1.23.0.jar +0 -0
- data/vendor/jar-dependencies/com/google/protobuf/protobuf-java-util/3.5.1/protobuf-java-util-3.5.1.jar +0 -0
- data/vendor/jar-dependencies/com/google/protobuf/protobuf-java/3.5.1/protobuf-java-3.5.1.jar +0 -0
- data/vendor/jar-dependencies/commons-codec/commons-codec/1.9/commons-codec-1.9.jar +0 -0
- data/vendor/jar-dependencies/commons-logging/commons-logging/1.2/commons-logging-1.2.jar +0 -0
- data/vendor/jar-dependencies/io/grpc/grpc-context/1.9.0/grpc-context-1.9.0.jar +0 -0
- data/vendor/jar-dependencies/io/opencensus/opencensus-api/0.11.1/opencensus-api-0.11.1.jar +0 -0
- data/vendor/jar-dependencies/io/opencensus/opencensus-contrib-http-util/0.11.1/opencensus-contrib-http-util-0.11.1.jar +0 -0
- data/vendor/jar-dependencies/joda-time/joda-time/2.9.2/joda-time-2.9.2.jar +0 -0
- data/vendor/jar-dependencies/org/apache/httpcomponents/httpclient/4.5.2/httpclient-4.5.2.jar +0 -0
- data/vendor/jar-dependencies/org/apache/httpcomponents/httpcore/4.4.4/httpcore-4.4.4.jar +0 -0
- data/vendor/jar-dependencies/org/codehaus/jackson/jackson-core-asl/1.9.11/jackson-core-asl-1.9.11.jar +0 -0
- data/vendor/jar-dependencies/org/threeten/threetenbp/1.3.3/threetenbp-1.3.3.jar +0 -0
- metadata +178 -0
data/lib/logstash-output-google_bigquery_jars.rb
@@ -0,0 +1,38 @@
# AUTOGENERATED BY THE GRADLE SCRIPT. DO NOT EDIT.

require 'jar_dependencies'
require_jar('com.google.cloud', 'google-cloud-bigquery', '1.24.1')
require_jar('com.fasterxml.jackson.core', 'jackson-core', '2.1.3')
require_jar('com.google.api', 'api-common', '1.5.0')
require_jar('com.google.api-client', 'google-api-client', '1.23.0')
require_jar('com.google.api', 'gax', '1.23.0')
require_jar('com.google.api', 'gax-httpjson', '0.40.0')
require_jar('com.google.api.grpc', 'proto-google-common-protos', '1.7.0')
require_jar('com.google.api.grpc', 'proto-google-iam-v1', '0.8.0')
require_jar('com.google.apis', 'google-api-services-bigquery', 'v2-rev377-1.23.0')
require_jar('com.google.auth', 'google-auth-library-credentials', '0.9.0')
require_jar('com.google.auth', 'google-auth-library-oauth2-http', '0.9.0')
require_jar('com.google.auto.value', 'auto-value', '1.4')
require_jar('com.google.cloud', 'google-cloud-core', '1.24.1')
require_jar('com.google.cloud', 'google-cloud-core-http', '1.24.1')
require_jar('com.google.code.findbugs', 'jsr305', '3.0.1')
require_jar('com.google.code.gson', 'gson', '2.7')
require_jar('com.google.errorprone', 'error_prone_annotations', '2.2.0')
require_jar('com.google.guava', 'guava', '20.0')
require_jar('com.google.http-client', 'google-http-client', '1.23.0')
require_jar('com.google.http-client', 'google-http-client-appengine', '1.23.0')
require_jar('com.google.http-client', 'google-http-client-jackson', '1.23.0')
require_jar('com.google.http-client', 'google-http-client-jackson2', '1.23.0')
require_jar('com.google.oauth-client', 'google-oauth-client', '1.23.0')
require_jar('com.google.protobuf', 'protobuf-java', '3.5.1')
require_jar('com.google.protobuf', 'protobuf-java-util', '3.5.1')
require_jar('commons-codec', 'commons-codec', '1.9')
require_jar('commons-logging', 'commons-logging', '1.2')
require_jar('io.grpc', 'grpc-context', '1.9.0')
require_jar('io.opencensus', 'opencensus-api', '0.11.1')
require_jar('io.opencensus', 'opencensus-contrib-http-util', '0.11.1')
require_jar('joda-time', 'joda-time', '2.9.2')
require_jar('org.apache.httpcomponents', 'httpclient', '4.5.2')
require_jar('org.apache.httpcomponents', 'httpcore', '4.4.4')
require_jar('org.codehaus.jackson', 'jackson-core-asl', '1.9.11')
require_jar('org.threeten', 'threetenbp', '1.3.3')
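A brief aside on what this autogenerated file does: `require_jar` (from the `jar_dependencies` gem) resolves each vendored jar under `data/vendor/jar-dependencies/` and puts it on the JRuby classpath. A minimal sketch, assuming a JRuby session with this gem installed (this snippet is not part of the gem), of how the Java classes then become reachable from Ruby:

[source,ruby]
--------------------------
# Sketch only: requires JRuby and this gem's vendored jars.
require 'java'
require 'logstash-output-google_bigquery_jars.rb'

# With the jars loaded, BigQuery's Java API is reachable by its fully
# qualified names, exactly as the plugin code below does.
string_type = com.google.cloud.bigquery.LegacySQLTypeName::STRING
field       = com.google.cloud.bigquery.Field.of('path', string_type)
schema      = com.google.cloud.bigquery.Schema.of(field)

puts schema.getFields.size # => 1
--------------------------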
data/lib/logstash/outputs/bigquery/batcher.rb
@@ -0,0 +1,82 @@
require 'thread'
require 'java'
require 'logstash-output-google_bigquery_jars.rb'

module LogStash
  module Outputs
    module BigQuery
      # Batcher is a queue that bundles messages in batches based on their
      # size in bytes or count. It's used to provide guarantees around
      # maximum data loss due to a fault while maintaining good upload
      # throughput.
      class Batcher
        include_package 'java.util.concurrent.locks'

        def initialize(max_length, max_bytes)
          @lock = ReentrantReadWriteLock.new
          @max_length = max_length
          @max_bytes = max_bytes

          clear
        end

        # enqueue_push calls enqueue and if a batch is ready to go pushes it to
        # the provided queue.
        def enqueue_push(message, queue)
          batch = enqueue message

          queue << batch unless batch.nil?
        end

        # enqueue adds a message to the batch. If the batch is ready to be sent
        # out the internal state is reset and the array of messages is both
        # yielded and returned.
        # Otherwise nil is returned.
        def enqueue(message)
          @lock.write_lock.lock

          begin
            is_flush_request = message.nil?

            unless is_flush_request
              @batch_size_bytes += message.length
              @batch << message
            end

            length_met = @batch.length >= @max_length
            size_met = @batch_size_bytes >= @max_bytes

            if is_flush_request || length_met || size_met
              orig = @batch
              clear

              yield(orig) if block_given?
              return orig
            end

            nil
          ensure
            @lock.write_lock.unlock
          end
        end

        # removes all elements from the batch
        def clear
          @lock.write_lock.lock
          @batch = []
          @batch_size_bytes = 0
          @lock.write_lock.unlock
        end

        def empty?
          @lock.read_lock.lock
          begin
            @batch.empty? && @batch_size_bytes.zero?
          ensure
            @lock.read_lock.unlock
          end
        end
      end
    end
  end
end
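A minimal usage sketch (not part of the gem; it assumes a JRuby/Logstash environment where `logstash/outputs/bigquery/batcher` loads) illustrating the contract described in the comments above: `enqueue` returns `nil` until a count or byte threshold is met, and a `nil` message acts as a flush request:

[source,ruby]
--------------------------
require 'logstash/outputs/bigquery/batcher'

# Hypothetical thresholds: batch after 3 messages or ~1 KiB, whichever comes first.
batcher = LogStash::Outputs::BigQuery::Batcher.new(3, 1024)

batcher.enqueue('{"status":200}') # => nil, neither threshold met yet
batcher.enqueue('{"status":404}') # => nil

# The third message trips the length threshold; the batch is yielded and returned.
batcher.enqueue('{"status":500}') { |batch| puts "publishing #{batch.length} messages" }

# A nil message is a flush request: whatever is buffered is handed back,
# which is how the plugin's flush thread drains the batcher periodically.
leftovers = batcher.enqueue(nil)
--------------------------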
data/lib/logstash/outputs/bigquery/schema.rb
@@ -0,0 +1,93 @@
require 'java'
require 'logstash-output-google_bigquery_jars.rb'
require 'logstash/json'

module LogStash
  module Outputs
    module BigQuery
      class Schema
        include_package 'com.google.cloud.bigquery'

        # Converts a CSV schema or JSON schema into a BigQuery Java Schema.
        def self.parse_csv_or_json(csv_schema, json_schema)
          csv_blank = csv_schema.nil? || csv_schema.empty?
          json_blank = json_schema.nil? || json_schema.empty?

          unless csv_blank ^ json_blank
            raise ArgumentError.new("You must provide either json_schema OR csv_schema. csv: #{csv_schema}, json: #{json_schema}")
          end

          if csv_blank
            schema = json_schema
          else
            schema = parse_csv_schema csv_schema
          end

          self.hash_to_java_schema schema
        end

        # Converts a CSV of field:type pairs into the JSON style schema.
        def self.parse_csv_schema(csv_schema)
          require 'csv'

          fields = []

          CSV.parse(csv_schema.gsub('\"', '""')).flatten.each do |field|
            raise ArgumentError.new('csv_schema must follow the format <field-name>:<field-type>') if field.nil?

            temp = field.strip.split(':')

            if temp.length != 2
              raise ArgumentError.new('csv_schema must follow the format <field-name>:<field-type>')
            end

            fields << { 'name' => temp[0], 'type' => temp[1] }
          end

          # Check that we have at least one field in the schema
          raise ArgumentError.new('csv_schema must contain at least one field') if fields.empty?

          { 'fields' => fields }
        end

        # Converts the Ruby hash style schema into a BigQuery Java schema
        def self.hash_to_java_schema(schema_hash)
          field_list = self.parse_field_list schema_hash['fields']
          com.google.cloud.bigquery.Schema.of field_list
        end

        # Converts a list of fields into a BigQuery Java FieldList
        def self.parse_field_list(fields)
          fieldslist = fields.map {|field| self.parse_field field}

          FieldList.of fieldslist
        end

        # Converts a single field definition into a BigQuery Java Field object.
        # This includes any nested fields as well.
        def self.parse_field(field)
          type = LegacySQLTypeName.valueOfStrict(field['type'])
          name = field['name']

          if field.has_key? 'fields'
            sub_fields = self.parse_field_list field['fields']
            builder = Field.newBuilder(name, type, sub_fields)
          else
            builder = Field.newBuilder(name, type)
          end

          if field.has_key? 'description'
            builder = builder.setDescription(field['description'])
          end

          if field.has_key? 'mode'
            mode = Field::Mode.valueOf field['mode']
            builder = builder.setMode(mode)
          end

          builder.build
        end
      end
    end
  end
end
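An illustrative sketch (assuming the same JRuby/Logstash environment; not part of the gem) of the two input styles `parse_csv_or_json` accepts, mirroring the plugin's `csv_schema` and `json_schema` options:

[source,ruby]
--------------------------
require 'logstash/outputs/bigquery/schema'

# CSV style: "field:type" pairs are expanded into the hash form internally.
csv_result = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json(
  'path:STRING,status:INTEGER,score:FLOAT', nil
)

# Hash style: supports nested RECORD fields, descriptions, and modes.
json_result = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json(nil, {
  'fields' => [
    { 'name' => 'params', 'type' => 'RECORD', 'mode' => 'REPEATED',
      'fields' => [{ 'name' => 'key', 'type' => 'STRING' }] }
  ]
})

# Both return a com.google.cloud.bigquery.Schema; passing both (or neither)
# raises ArgumentError because of the XOR check above.
--------------------------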
data/lib/logstash/outputs/bigquery/streamclient.rb
@@ -0,0 +1,120 @@
require 'java'
require 'openssl'
require 'logstash-output-google_bigquery_jars.rb'

module LogStash
  module Outputs
    module BigQuery
      # NOTE: This file uses _a lot_ of Java. Please keep the Java looking
      # java-y so it's easy to tell the languages apart.

      include_package 'com.google.cloud.bigquery'

      # StreamingClient supports shipping data to BigQuery using streams.
      class StreamingClient
        def initialize(json_key_file, project_id, logger)
          @logger = logger

          @bigquery = initialize_google_client json_key_file, project_id
        end

        def table_exists?(dataset, table)
          api_debug('Checking if table exists', dataset, table)
          tbl = @bigquery.getTable dataset, table

          !tbl.nil?
        end

        # Creates a table with the given name in the given dataset
        def create_table(dataset, table, schema)
          api_debug('Creating table', dataset, table)
          table_id = com.google.cloud.bigquery.TableId.of dataset, table

          table_defn = com.google.cloud.bigquery.StandardTableDefinition.of schema
          table_info = com.google.cloud.bigquery.TableInfo.newBuilder(table_id, table_defn).build()

          @bigquery.create table_info
        end

        def append(dataset, table, rows, ignore_unknown)
          api_debug("Appending #{rows.length} rows", dataset, table)

          request = build_append_request dataset, table, rows, ignore_unknown

          response = @bigquery.insertAll request
          return true unless response.hasErrors

          response.getInsertErrors().entrySet().each{ |entry|
            key = entry.getKey
            errors = entry.getValue

            errors.each{|bqError|
              @logger.warn('Error while inserting',
                           key: key,
                           location: bqError.getLocation,
                           message: bqError.getMessage,
                           reason: bqError.getReason)
            }
          }

          false
        end

        def build_append_request(dataset, table, rows, ignore_unknown)
          request = com.google.cloud.bigquery.InsertAllRequest.newBuilder dataset, table
          request.setIgnoreUnknownValues ignore_unknown

          rows.each { |serialized_row|
            # deserialize rows into Java maps
            deserialized = LogStash::Json.load serialized_row
            request.addRow deserialized
          }

          request.build
        end

        # returns an error message if the key file is invalid, otherwise nil
        def get_key_file_error(json_key_file)
          return nil if json_key_file.nil? || json_key_file == ''

          abs = ::File.absolute_path json_key_file
          unless abs == json_key_file
            return "json_key_file must be an absolute path: #{json_key_file}"
          end

          unless ::File.exist? json_key_file
            return "json_key_file does not exist: #{json_key_file}"
          end

          nil
        end

        def initialize_google_client(json_key_file, project_id)
          @logger.info("Initializing Google API client #{project_id} key: #{json_key_file}")
          err = get_key_file_error json_key_file
          raise err unless err.nil?

          if json_key_file.nil? || json_key_file.empty?
            return com.google.cloud.bigquery.BigQueryOptions.getDefaultInstance().getService()
          end

          # TODO: set User-Agent

          key_file = java.io.FileInputStream.new json_key_file
          credentials = com.google.auth.oauth2.ServiceAccountCredentials.fromStream key_file
          return com.google.cloud.bigquery.BigQueryOptions.newBuilder()
                    .setCredentials(credentials)
                    .setProjectId(project_id)
                    .build()
                    .getService()
        end

        private

        def api_debug(message, dataset, table)
          @logger.debug(message, dataset: dataset, table: table)
        end
      end
    end
  end
end
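A hedged sketch of how this client is driven (not part of the gem; the project ID, key path, dataset, and table names are placeholders). Note that `append` expects rows that are already JSON-encoded strings; `build_append_request` deserializes them into Java maps:

[source,ruby]
--------------------------
require 'logstash/json'
require 'logstash/outputs/bigquery/schema'
require 'logstash/outputs/bigquery/streamclient'

# Stand-in for the structured logger Logstash normally injects.
logger = Object.new
def logger.debug(*); end
def logger.info(*); end
def logger.warn(*); end

client = LogStash::Outputs::BigQuery::StreamingClient.new(
  '/path/to/key.json', # absolute path, or nil/'' to fall back to Application Default Credentials
  'my-project-id',     # placeholder project
  logger
)

schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json('path:STRING,status:INTEGER', nil)
table  = 'logstash_2018_05_01T00_00' # placeholder, shaped like get_table_name output
client.create_table('logs', table, schema) unless client.table_exists?('logs', table)

rows = [LogStash::Json.dump('path' => '/index', 'status' => 200)]
client.append('logs', table, rows, false) # => true when no insert errors occurred
--------------------------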
data/lib/logstash/outputs/google_bigquery.rb
@@ -0,0 +1,280 @@
require 'logstash/outputs/base'
require 'logstash/namespace'
require 'logstash/json'
require 'logstash/outputs/bigquery/streamclient'
require 'logstash/outputs/bigquery/batcher'
require 'logstash/outputs/bigquery/schema'

require 'time'
require 'fileutils'

#
# === Summary
#
# This plugin uploads events to Google BigQuery using the streaming API
# so data can become available nearly immediately.
#
# You can configure it to flush periodically, after N events or after
# a certain amount of data is ingested.
#
# === Environment Configuration
#
# You must enable BigQuery on your GCP account and create a dataset to
# hold the tables this plugin generates.
#
# You must also grant the service account this plugin uses access to
# the dataset.
#
# You can use https://www.elastic.co/guide/en/logstash/current/event-dependent-configuration.html[Logstash conditionals]
# and multiple configuration blocks to upload events with different structures.
#
# === Usage
# This is an example of logstash config:
#
# [source,ruby]
# --------------------------
# output {
#    google_bigquery {
#      project_id => "folkloric-guru-278"                        (required)
#      dataset => "logs"                                         (required)
#      csv_schema => "path:STRING,status:INTEGER,score:FLOAT"    (required) <1>
#      json_key_file => "/path/to/key.json"                      (optional) <2>
#      error_directory => "/tmp/bigquery-errors"                 (required)
#      date_pattern => "%Y-%m-%dT%H:00"                          (optional)
#      flush_interval_secs => 30                                 (optional)
#    }
# }
# --------------------------
#
# <1> Specify either a csv_schema or a json_schema.
#
# <2> If the key is not used, then the plugin tries to find
# https://cloud.google.com/docs/authentication/production[Application Default Credentials]
#
# === Considerations
#
# * There is a small fee to insert data into BigQuery using the streaming API
# * This plugin buffers events in-memory, so make sure the flush configurations are appropriate
#   for your use-case and consider using
#   https://www.elastic.co/guide/en/logstash/current/persistent-queues.html[Logstash Persistent Queues]
#
# === Additional Resources
#
# * https://cloud.google.com/bigquery/[BigQuery Introduction]
# * https://cloud.google.com/bigquery/docs/schemas[BigQuery Schema Formats and Types]
# * https://cloud.google.com/bigquery/pricing[Pricing Information]
#
class LogStash::Outputs::GoogleBigQuery < LogStash::Outputs::Base
  config_name 'google_bigquery'

  concurrency :single

  # Google Cloud Project ID (number, not Project Name!).
  config :project_id, validate: :string, required: true

  # The BigQuery dataset the tables for the events will be added to.
  config :dataset, validate: :string, required: true

  # BigQuery table ID prefix to be used when creating new tables for log data.
  # Table name will be `<table_prefix><table_separator><date>`
  config :table_prefix, validate: :string, default: 'logstash'

  # BigQuery table separator to be added between the table_prefix and the
  # date suffix.
  config :table_separator, validate: :string, default: '_'

  # Schema for log data. It must follow the format `name1:type1(,name2:type2)*`.
  # For example, `path:STRING,status:INTEGER,score:FLOAT`.
  config :csv_schema, validate: :string, required: false, default: nil

  # Schema for log data as a hash.
  # These can include nested records, descriptions, and modes.
  #
  # Example:
  # [source,ruby]
  # --------------------------
  # json_schema => {
  #   fields => [{
  #     name => "endpoint"
  #     type => "STRING"
  #     description => "Request route"
  #   }, {
  #     name => "status"
  #     type => "INTEGER"
  #     mode => "NULLABLE"
  #   }, {
  #     name => "params"
  #     type => "RECORD"
  #     mode => "REPEATED"
  #     fields => [{
  #       name => "key"
  #       type => "STRING"
  #     }, {
  #       name => "value"
  #       type => "STRING"
  #     }]
  #   }]
  # }
  # --------------------------
  config :json_schema, validate: :hash, required: false, default: nil

  # Indicates if BigQuery should ignore values that are not represented in the table schema.
  # If true, the extra values are discarded.
  # If false, BigQuery will reject the records with extra fields and the job will fail.
  # The default value is false.
  #
  # NOTE: You may want to add a Logstash filter like the following to remove common fields it adds:
  # [source,ruby]
  # ----------------------------------
  # mutate {
  #     remove_field => ["@version","@timestamp","path","host","type", "message"]
  # }
  # ----------------------------------
  config :ignore_unknown_values, validate: :boolean, default: false

  # Time pattern for BigQuery table, defaults to hourly tables.
  # Must be a Time.strftime pattern: www.ruby-doc.org/core-2.0/Time.html#method-i-strftime
  config :date_pattern, validate: :string, default: '%Y-%m-%dT%H:00'

  # If Logstash is running within Google Compute Engine, the plugin will use
  # GCE's Application Default Credentials. Outside of GCE, you will need to
  # specify a Service Account JSON key file.
  config :json_key_file, validate: :string, required: false

  # The number of messages to upload at a single time. (< 1000, default: 128)
  config :batch_size, validate: :number, required: true, default: 128

  # An approximate number of bytes to upload as part of a batch. Default: 1MB
  config :batch_size_bytes, validate: :number, required: true, default: 1_000_000

  # Uploads all data this often even if other upload criteria aren't met. Default: 5s
  config :flush_interval_secs, validate: :number, required: true, default: 5

  # The location to store events that could not be uploaded due to errors.
  # Consider using an additional Logstash input to pipe the contents of
  # these to an alert platform so you can manually fix the events.
  #
  # Or use https://cloud.google.com/storage/docs/gcs-fuse[GCS FUSE] to
  # transparently upload to a GCS bucket.
  #
  # File names follow the pattern `[table name]-[UNIX timestamp].log`
  config :error_directory, validate: :string, required: true, default: '/tmp/bigquery_errors'

  # The following configuration options still exist to alert users that are using them
  config :uploader_interval_secs, validate: :number, deprecated: 'No longer used.'
  config :deleter_interval_secs, validate: :number, deprecated: 'No longer used.'
  config :key_path, validate: :string, obsolete: 'Use json_key_file or ADC instead.'
  config :key_password, validate: :string, deprecated: 'No longer needed with json_key_file or ADC.'
  config :service_account, validate: :string, deprecated: 'No longer needed with json_key_file or ADC.'
  config :temp_file_prefix, validate: :string, deprecated: 'No longer used.'
  config :temp_directory, validate: :string, deprecated: 'No longer used.'

  public

  def register
    @logger.debug('Registering plugin')

    @schema = LogStash::Outputs::BigQuery::Schema.parse_csv_or_json @csv_schema, @json_schema
    @bq_client = LogStash::Outputs::BigQuery::StreamingClient.new @json_key_file, @project_id, @logger
    @batcher = LogStash::Outputs::BigQuery::Batcher.new @batch_size, @batch_size_bytes

    init_batcher_flush_thread
  end

  # Method called for each log event. It adds the event to the in-memory batch,
  # which is published once the batch size, byte, or flush-interval criteria are met.
  def receive(event)
    @logger.debug('BQ: receive method called', event: event)

    # Property names MUST NOT have @ in them
    message = replace_at_keys event.to_hash

    # Message must be written as json
    encoded_message = LogStash::Json.dump message

    @batcher.enqueue(encoded_message) { |batch| publish(batch) }
  end

  def get_table_name(time=nil)
    time ||= Time.now

    str_time = time.strftime(@date_pattern)
    table_id = @table_prefix + @table_separator + str_time

    # BQ does not accept anything other than alphanumeric and _
    # Ref: https://developers.google.com/bigquery/browser-tool-quickstart?hl=en
    table_id.tr!(':-', '_')

    table_id
  end

  # Remove @ symbols in hash keys
  def replace_at_keys(event)
    return event unless event.is_a? Hash

    out = {}

    event.each do |key, value|
      new_key = key.to_s.delete '@'
      out[new_key] = replace_at_keys value
    end

    out
  end

  # publish sends messages to a BigQuery table immediately
  def publish(messages)
    begin
      return if messages.nil? || messages.empty?

      table = get_table_name
      @logger.info("Publishing #{messages.length} messages to #{table}")

      create_table_if_not_exists table

      successful = @bq_client.append @dataset, table, messages, @ignore_unknown_values
      write_to_errors_file(messages, table) unless successful
    rescue StandardError => e
      @logger.error 'Error uploading data.', :exception => e

      write_to_errors_file(messages, table)
    end
  end

  def create_table_if_not_exists table
    begin
      return nil if @bq_client.table_exists? @dataset, table
      @bq_client.create_table(@dataset, table, @schema)
    rescue StandardError => e
      @logger.error 'Error creating table.', :exception => e
    end
  end

  def write_to_errors_file(messages, table)
    begin
      FileUtils.mkdir_p @error_directory

      t = Time.new
      error_file_name = "#{table}-#{t.to_i}.log"
      error_file_path = ::File.join(@error_directory, error_file_name)
      @logger.info "Problem data is being stored in: #{error_file_path}"

      File.open(error_file_path, 'w') do |f|
        messages.each { |message| f.puts message }
      end
    rescue StandardError => e
      @logger.error 'Error creating error file.', :exception => e, :messages => messages, :table => table
    end
  end

  def init_batcher_flush_thread
    @flush_thread = Thread.new do
      loop do
        sleep @flush_interval_secs

        @batcher.enqueue(nil) { |batch| publish(batch) }
      end
    end
  end
end
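For reference, a hedged pipeline sketch tying the options above together, this time with the hash-style `json_schema` (the project ID, key path, and numeric values are placeholders, not recommendations):

[source,ruby]
--------------------------
output {
  google_bigquery {
    project_id => "my-gcp-project"            # placeholder
    dataset => "logs"
    json_schema => {
      fields => [{
        name => "path"
        type => "STRING"
      }, {
        name => "status"
        type => "INTEGER"
        mode => "NULLABLE"
      }]
    }
    json_key_file => "/path/to/key.json"      # omit on GCE to use Application Default Credentials
    error_directory => "/tmp/bigquery-errors"
    batch_size => 512
    flush_interval_secs => 10
  }
}
--------------------------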