logstash-input-dynamodb 2.0.0-java → 2.0.1-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,68 @@
1
+ # encoding: utf-8
2
+ #
3
+ #Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ #
5
+ #Licensed under the Apache License, Version 2.0 (the "License");
6
+ #you may not use this file except in compliance with the License.
7
+ #You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ #Unless required by applicable law or agreed to in writing, software
12
+ #distributed under the License is distributed on an "AS IS" BASIS,
13
+ #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ #See the License for the specific language governing permissions and
15
+ #limitations under the License.
16
+ #
17
+ require "java"
18
+
19
+ require "logstash-input-dynamodb_jars"
20
+ java_import "com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason"
21
+ java_import "java.lang.IllegalStateException"
22
+ java_import "org.apache.log4j.LogManager"
23
+
24
module Logstash
  module Inputs
    module DynamoDB
      # KCL record processor: forwards each DynamoDB Streams record from its
      # shard into the in-memory queue consumed by the plugin's main loop.
      class LogStashRecordProcessor
        include com.amazonaws.services.kinesis.clientlibrary.interfaces::IRecordProcessor

        attr_accessor :queue, :shard_id

        # JRuby cannot overload methods, so this constructor doubles as the
        # IRecordProcessor#initialize(String shardId) callback: a String
        # argument is interpreted as the shard id being assigned by the KCL,
        # anything else as the shared record queue.
        def initialize(queue)
          if queue.is_a?(String)
            @shard_id = queue
          else
            @queue ||= queue
            @logger ||= LogStash::Inputs::DynamoDB.logger
          end
        end

        # Hands the whole batch off to the shared queue, then checkpoints so
        # the KCL never re-delivers records that were already enqueued.
        def process_records(records, checkpointer)
          @logger.debug("Processing batch of " + records.size().to_s + " records")
          records.each { |record| @queue.push(record) }
          #checkpoint once all of the records have been consumed
          checkpointer.checkpoint()
        end

        # TERMINATE => shard is closed; checkpoint so a successor can resume.
        # ZOMBIE    => lease was lost; checkpointing is not permitted.
        def shutdown(checkpointer, reason)
          case reason
          when ShutdownReason::TERMINATE
            checkpointer.checkpoint()
          when ShutdownReason::ZOMBIE
            # Intentionally empty: another worker already owns this lease.
          else
            raise RuntimeError, "Invalid shutdown reason."
          end
          unless @shard_id.nil?
            @logger.info("shutting down record processor with shardId: " + @shard_id + " with reason " + reason.to_s)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+ #
3
+ #Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ #
5
+ #Licensed under the Apache License, Version 2.0 (the "License");
6
+ #you may not use this file except in compliance with the License.
7
+ #You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ #Unless required by applicable law or agreed to in writing, software
12
+ #distributed under the License is distributed on an "AS IS" BASIS,
13
+ #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ #See the License for the specific language governing permissions and
15
+ #limitations under the License.
16
+ #
17
+ require 'java'
18
+ require_relative "LogStashRecordProcessor"
19
+
20
+ require "logstash-input-dynamodb_jars"
21
+
22
# Ruby namespace for the KCL interfaces Java package bundled with the plugin jars.
module KCL
  include_package "com.amazonaws.services.kinesis.clientlibrary.interfaces"
end

module Logstash
  module Inputs
    module DynamoDB
      # Factory handed to the KCL Worker. Every shard lease the worker
      # acquires gets its own LogStashRecordProcessor, all sharing the
      # single plugin-owned queue passed in here.
      class LogStashRecordProcessorFactory
        include KCL::IRecordProcessorFactory

        def initialize(queue)
          @queue ||= queue
        end

        # Invoked by the KCL once per acquired shard.
        def create_processor
          Logstash::Inputs::DynamoDB::LogStashRecordProcessor.new(@queue)
        end
      end
    end
  end
end
@@ -0,0 +1,341 @@
1
+ # encoding: utf-8
2
+ #
3
+ #Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4
+ #
5
+ #Licensed under the Apache License, Version 2.0 (the "License");
6
+ #you may not use this file except in compliance with the License.
7
+ #You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ #Unless required by applicable law or agreed to in writing, software
12
+ #distributed under the License is distributed on an "AS IS" BASIS,
13
+ #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ #See the License for the specific language governing permissions and
15
+ #limitations under the License.
16
+ #
17
+ require "logstash/inputs/base"
18
+ require "logstash/namespace"
19
+ require "securerandom"
20
+ require "thread"
21
+ require "socket"
22
+ require_relative "LogStashRecordProcessorFactory"
23
+ require_relative "DynamoDBLogParser"
24
+
25
+ require "logstash-input-dynamodb_jars"
26
+
27
+ require 'java'
28
+ java_import "com.amazonaws.AmazonClientException"
29
+ java_import "org.apache.log4j.LogManager"
30
+ java_import "org.apache.log4j.Level"
31
+ java_import "com.fasterxml.jackson.annotation.JsonInclude"
32
+ java_import "com.amazonaws.regions.RegionUtils"
33
+
34
# Ruby namespaces mapped onto the Java packages shipped in this plugin's jars.
# include_package resolves constants lazily, so these wrappers are cheap.

# Core DynamoDB client/model classes plus the Streams->Kinesis adapter.
module AmazonDynamoDB
  include_package "com.amazonaws"
  include_package "com.amazonaws.services.dynamodbv2"
  include_package "com.amazonaws.services.dynamodbv2.streamsadapter"
  include_package "com.amazonaws.services.dynamodbv2.model"
end

# AWS credential providers (static keys and the default provider chain).
module AmazonCredentials
  include_package "com.amazonaws.auth"
  include_package "com.amazonaws.internal"
end

# Parallel-scan "bootstrap" connector used for the initial table copy.
module DynamoDBBootstrap
  include_package "com.amazonaws.dynamodb.bootstrap"
end

# CloudWatch client for optional KCL metrics publishing.
module CloudWatch
  include_package "com.amazonaws.services.cloudwatch"
end

# Kinesis Client Library worker and configuration classes.
module KCL
  include_package "com.amazonaws.services.kinesis.clientlibrary.lib.worker"
end
56
+
57
+ #DynamoDBStreams plugin that will first scan the DynamoDB table
58
+ #and then consume streams and push those records into Logstash
59
# DynamoDB input plugin: optionally scans the whole table first (streams only
# retain ~24h of changes), then consumes the table's DynamoDB Stream via the
# KCL and pushes every record into the Logstash pipeline.
class LogStash::Inputs::DynamoDB < LogStash::Inputs::Base
  config_name "dynamodb"

  # Appended to the AWS SDK user agent so requests are attributable to this plugin.
  USER_AGENT = " logstash-input-dynamodb/1.0.0".freeze

  # Supported log_format values.
  # NOTE(review): "dymamodb" looks like a typo for "dynamodb", but the string is
  # part of the public configuration surface (and is presumably matched by
  # DynamoDBLogParser, which is defined elsewhere), so it is deliberately left
  # unchanged for backward compatibility — confirm before renaming.
  LF_DYNAMODB = "dymamodb".freeze
  LF_JSON_NO_BIN = "json_drop_binary".freeze
  LF_PLAIN = "plain".freeze
  LF_JSON_BIN_AS_TEXT = "json_binary_as_text".freeze
  # Supported view_type values (mirror the DynamoDB stream view types).
  VT_KEYS_ONLY = "keys_only".freeze
  VT_OLD_IMAGE = "old_image".freeze
  VT_NEW_IMAGE = "new_image".freeze
  VT_ALL_IMAGES = "new_and_old_images".freeze

  default :codec, 'json'

  # The name of the table to copy and stream through Logstash
  config :table_name, :validate => :string, :required => true

  # Configuration for what information from the scan and streams to include in the log.
  # keys_only will return the hash and range keys along with the values for each entry
  # new_image will return the entire new entry and keys
  # old_image will return the entire entry before modification and keys (NOTE: Cannot perform scan when using this option)
  # new_and_old_images will return the old entry before modification along with the new entry and keys
  config :view_type, :validate => [VT_KEYS_ONLY, VT_OLD_IMAGE, VT_NEW_IMAGE, VT_ALL_IMAGES], :required => true

  # Endpoint from which the table is located. Example: dynamodb.us-east-1.amazonaws.com
  config :endpoint, :validate => :string, :required => true

  # Endpoint from which streams should read. Example: streams.dynamodb.us-east-1.amazonaws.com
  config :streams_endpoint, :validate => :string

  # AWS credentials access key.
  config :aws_access_key_id, :validate => :string, :default => ""

  # AWS credentials secret access key.
  config :aws_secret_access_key, :validate => :string, :default => ""

  # A flag to indicate whether or not the plugin should scan the entire table before streaming new records.
  # Streams will only push records that are less than 24 hours old, so in order to get the entire table
  # an initial scan must be done.
  config :perform_scan, :validate => :boolean, :default => true

  # A string that uniquely identifies the KCL checkpointer name and cloudwatch metrics name.
  # This is used when one worker leaves a shard so that another worker knows where to start again.
  config :checkpointer, :validate => :string, :default => "logstash_input_dynamodb_cptr"

  # Option to publish metrics to Cloudwatch using the checkpointer name.
  config :publish_metrics, :validate => :boolean, :default => false

  # Option to not automatically stream new data into logstash from DynamoDB streams.
  config :perform_stream, :validate => :boolean, :default => true

  # Number of read operations per second to perform when scanning the specified table.
  config :read_ops, :validate => :number, :default => 1

  # Number of threads to use when scanning the specified table
  config :number_of_scan_threads, :validate => :number, :default => 1

  # Number of threads to write to the logstash queue when scanning the table
  config :number_of_write_threads, :validate => :number, :default => 1

  # Configuration for how the logs will be transferred.
  # plain is simply pass the message along without editing it.
  # dynamodb will return just the data specified in the view_format in dynamodb format.
  # For more information see: docs.aws.amazon.com/amazondynamodb/latest/developerguide/DataFormat.html
  # json_drop_binary will return just the data specified in the view_format in JSON while not including any binary values that were present.
  # json_binary_as_text will return just the data specified in the view_format in JSON while including binary values as base64-encoded text.
  config :log_format, :validate => [LF_PLAIN, LF_DYNAMODB, LF_JSON_NO_BIN, LF_JSON_BIN_AS_TEXT], :default => "plain"

  public
  # Builds an AWS credentials provider: static credentials when both keys are
  # configured, otherwise the SDK's default provider chain (environment,
  # profile, instance role, ...).
  def build_credentials
    if !@aws_access_key_id.to_s.empty? && !@aws_secret_access_key.to_s.empty?
      # SECURITY FIX: the original logged the secret access key at info level.
      # Only the (non-secret) access key id is logged to identify the credentials.
      @logger.info("Using static credentials for access key id: " + @aws_access_key_id)
      basic = AmazonCredentials::BasicAWSCredentials.new(@aws_access_key_id, @aws_secret_access_key)
      return AmazonCredentials::StaticCredentialsProvider.new(basic)
    else
      @logger.info("Using default provider chain")
      return AmazonCredentials::DefaultAWSCredentialsProviderChain.new()
    end # if neither aws access keys
  end # def build_credentials

  public
  # Validates the configuration, builds the DynamoDB client, caches the table
  # description and key schema, constructs the log parser and, when streaming
  # is enabled, prepares (but does not start) the KCL worker.
  def register
    LogStash::Logger.setup_log4j(@logger)

    @host = Socket.gethostname
    @logger.info("Tablename: " + @table_name)
    # Bounded hand-off queue between the KCL processor threads and run().
    @queue = SizedQueue.new(20)
    @credentials = build_credentials()
    @logger.info("Checkpointer: " + @checkpointer)

    # A scan can never produce old images, so the combination is invalid.
    if @perform_scan && @view_type == VT_OLD_IMAGE
      raise(LogStash::ConfigurationError, "Cannot perform scan with view type: " + @view_type + " configuration")
    end
    # new_and_old_images can only be passed through verbatim (plain).
    if @view_type == VT_ALL_IMAGES && !(@log_format == LF_PLAIN)
      raise(LogStash::ConfigurationError, "Cannot show view_type: " + @view_type + ", with log_format: " + @log_format)
    end

    #Create DynamoDB Client
    @client_configuration = AmazonDynamoDB::ClientConfiguration.new()
    @client_configuration.setUserAgent(@client_configuration.getUserAgent() + USER_AGENT)
    @dynamodb_client = AmazonDynamoDB::AmazonDynamoDBClient.new(@credentials, @client_configuration)

    @logger.info(@dynamodb_client.to_s)

    @dynamodb_client.setEndpoint(@endpoint)
    @logger.info("DynamoDB endpoint: " + @endpoint)

    # Cache the table's key attribute names; the parser uses them to extract keys.
    @key_schema = Array.new
    @table_description = @dynamodb_client.describeTable(@table_name).getTable()
    key_iterator = @table_description.getKeySchema().iterator()
    while(key_iterator.hasNext())
      @key_schema.push(key_iterator.next().getAttributeName().to_s)
    end
    region = RegionUtils.getRegionByEndpoint(@endpoint)

    @parser ||= Logstash::Inputs::DynamoDB::DynamoDBLogParser.new(@view_type, @log_format, @key_schema, region)

    if @perform_stream
      setup_stream
    end # if @perform_stream
  end # def register

  public
  # Main plugin loop. On shutdown, stops the background threads and drains any
  # records still sitting in the internal queue into the Logstash pipeline.
  def run(logstash_queue)
    begin
      run_with_catch(logstash_queue)
    rescue LogStash::ShutdownSignal
      exit_threads
      # FIX: log the flush once, not once per remaining event as before.
      @logger.info("Flushing rest of events in logstash queue") unless @queue.empty?
      until @queue.empty?
        event = @queue.pop()
        queue_event(@parser.parse_stream(event), logstash_queue, @host)
      end # until !@queue.empty?
    end # begin
  end # def run(logstash_queue)

  # Runs the (blocking) scan first, if configured, then the (blocking) stream
  # consumer. Each starts its own background worker thread internally.
  private
  def run_with_catch(logstash_queue)
    if @perform_scan
      scan(logstash_queue)
    end # if @perform_scan

    # Once scan is finished, start kcl thread to read from streams
    if @perform_stream
      stream(logstash_queue)
    end # if @perform_stream
  end # def run

  private
  # Creates the streams clients, verifies the table's stream is enabled and
  # compatible with view_type, and builds the KCL worker (stored in @worker).
  # Raises ConfigurationError / PluginLoadingError on invalid setups.
  def setup_stream
    worker_id = SecureRandom.uuid()
    @logger.info("WorkerId: " + worker_id)

    dynamodb_streams_client = AmazonDynamoDB::AmazonDynamoDBStreamsClient.new(@credentials, @client_configuration)
    adapter = Java::ComAmazonawsServicesDynamodbv2Streamsadapter::AmazonDynamoDBStreamsAdapterClient.new(@credentials)
    if !@streams_endpoint.nil?
      adapter.setEndpoint(@streams_endpoint)
      dynamodb_streams_client.setEndpoint(@streams_endpoint)
      @logger.info("DynamoDB Streams endpoint: " + @streams_endpoint)
    else
      raise(LogStash::ConfigurationError, "Cannot stream without a configured streams endpoint")
    end # if not @streams_endpoint.nil?

    stream_arn = nil
    begin
      stream_arn = @table_description.getLatestStreamArn()
      stream_description = dynamodb_streams_client.describeStream(AmazonDynamoDB::DescribeStreamRequest.new() \
        .withStreamArn(stream_arn)).getStreamDescription()

      stream_status = stream_description.getStreamStatus()

      stream_view_type = stream_description.getStreamViewType().to_s.downcase
      # keys_only is a subset of every view type, and a new_and_old_images
      # stream carries enough data to satisfy any requested view.
      unless (stream_view_type == @view_type or @view_type == VT_KEYS_ONLY or stream_view_type == VT_ALL_IMAGES)
        raise(LogStash::ConfigurationError, "Cannot stream " + @view_type + " when stream is setup for " + stream_view_type)
      end

      # Poll once per second until the stream finishes enabling.
      # (FIX: removed the redundant inner status re-check the original had.)
      while stream_status == "ENABLING"
        @logger.info("Sleeping until stream is enabled")
        sleep(1)
        stream_description = dynamodb_streams_client.describeStream(AmazonDynamoDB::DescribeStreamRequest.new() \
          .withStreamArn(stream_arn)).getStreamDescription()
        stream_status = stream_description.getStreamStatus()
      end # while enabling

      if !(stream_status == "ENABLED")
        raise(LogStash::PluginLoadingError, "No streams are enabled")
      end # if not active
      @logger.info("Stream Id: " + stream_arn)
    rescue AmazonDynamoDB::ResourceNotFoundException => rnfe
      raise(LogStash::PluginLoadingError, rnfe.message)
    rescue AmazonClientException => ace
      raise(LogStash::ConfigurationError, "AWS credentials invalid or not found in the provider chain\n" + ace.message)
    end # begin

    kcl_config = KCL::KinesisClientLibConfiguration.new(@checkpointer, stream_arn, @credentials, worker_id) \
      .withInitialPositionInStream(KCL::InitialPositionInStream::TRIM_HORIZON)
    cloudwatch_client = nil
    if @publish_metrics
      cloudwatch_client = CloudWatch::AmazonCloudWatchClient.new(@credentials)
    else
      # Metrics publishing is off: silence the KCL metrics logger so it does
      # not warn about the absent CloudWatch client.
      kcl_metrics_logger = LogManager.getLogger("com.amazonaws.services.kinesis.metrics")
      kcl_metrics_logger.setAdditivity(false)
      kcl_metrics_logger.setLevel(Level::OFF)
    end # if @publish_metrics
    @worker = KCL::Worker.new(Logstash::Inputs::DynamoDB::LogStashRecordProcessorFactory.new(@queue), kcl_config, adapter, @dynamodb_client, cloudwatch_client)
  end # def setup_stream

  private
  # Runs the parallel table copy in a background thread and consumes its
  # output queue on this thread until the sentinel entry (nil entry, size -1)
  # signals that the scan is complete.
  def scan(logstash_queue)
    @logger.info("Starting scan...")
    @logstash_writer = DynamoDBBootstrap::BlockingQueueConsumer.new(@number_of_write_threads)

    @connector = DynamoDBBootstrap::DynamoDBBootstrapWorker.new(@dynamodb_client, @read_ops, @table_name, @number_of_scan_threads)
    start_table_copy_thread

    scan_queue = @logstash_writer.getQueue()
    loop do
      event = scan_queue.take()
      # Sentinel marking the end of the scan.
      break if event.getEntry().nil? && event.getSize() == -1
      queue_event(@parser.parse_scan(event.getEntry(), event.getSize()), logstash_queue, @host)
    end # loop
  end

  private
  # Starts the KCL worker in a background thread and forwards streamed records
  # from the internal queue into the Logstash pipeline forever (until shutdown
  # interrupts run()).
  def stream(logstash_queue)
    @logger.info("Starting stream...")
    start_kcl_thread

    loop do
      event = @queue.pop()
      queue_event(@parser.parse_stream(event), logstash_queue, @host)
    end # loop
  end

  private
  # Stops whichever background threads were started.
  def exit_threads
    unless @dynamodb_scan_thread.nil?
      @dynamodb_scan_thread.exit
    end # unless @dynamodb_scan_thread.nil?

    unless @kcl_thread.nil?
      @kcl_thread.exit
    end # unless @kcl_thread.nil?
  end # def exit_threads

  public
  # Wraps a parsed record in a LogStash::Event, applies the configured
  # decorations, and pushes it onto the pipeline queue.
  def queue_event(event, logstash_queue, event_host)
    logstash_event = LogStash::Event.new("message" => event, "host" => event_host)
    decorate(logstash_event)
    logstash_queue << logstash_event
  end # def queue_event

  private
  # Pipes the bootstrap connector into the blocking-queue consumer on a
  # background thread; any failure aborts the whole process.
  def start_table_copy_thread
    @dynamodb_scan_thread = Thread.new(@connector, @logstash_writer) {
      begin
        @connector.pipe(@logstash_writer)
      # rescue Exception is deliberate here: JRuby surfaces Java errors that
      # do not descend from StandardError, and a dead scan must abort the run.
      rescue Exception => e
        abort("Scanning the table caused an error.\n" + e.message)
      end # begin
    }
  end # def start_table_copy_thread()

  private
  # Runs the KCL worker on a background thread; any failure aborts the process.
  def start_kcl_thread
    @kcl_thread = Thread.new(@worker) {
      begin
        @worker.run()
      # rescue Exception is deliberate (see start_table_copy_thread).
      rescue Exception => e
        abort("KCL worker encountered an error.\n" + e.message)
      end # begin
    }
  end # def start_kcl_thread

end # class Logstash::Inputs::DynamoDB