logstash-input-s3-sns-sqs 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only; it reflects the changes between the two package versions as they appear in their public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8b7f924e911ea9f2379c0b8f2415c4dbd8aa6f16
-  data.tar.gz: c9c44b0742020e1e911e01a9f5bad9bca55857ee
+  metadata.gz: fa00cb9383c9782647f404283f10281febb31622
+  data.tar.gz: a8fe32f0d3668c893ff5ef5cd253e02dee8f4fd5
 SHA512:
-  metadata.gz: 4b2f60bbe55ff50d580f7524faa78a30235ace54f8770a54baa97009aad0041d334edd5bc87822c84714142a734e86cbf859947aa6f7200f4a1c0992521a3cf3
-  data.tar.gz: d635defff1a48274967afdcd358fff1da8f9a91cde99729558f07c1b238cff866e179ab2a7122f24dc12a547d1d784d5c5736460b5afa9ed4cb76c4c8f4c3bcd
+  metadata.gz: a602a1a99073817666f83f0f0194e953c3ee6a75efd5e4e2faf4a44a1d19323133ea0c96e6fab7d306ec75f2f6384c936f2f1a134b6639e4ffe645c76af727f4
+  data.tar.gz: 8652e9ecc5c3b9342dabb842c8258adcf52f28d5184dae8f3e1bd27ffed0bac0333a79f2055be4a55bbb2e0849db8ac5426d6936fafe1c96ac25a1701038d58f
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+## 1.4.0
+- File handling rewritten (thanks to logstash-input-s3 for the inspiration)
+- Improve gzip decoding performance by 10x by using Java's zlib (java.util.zip)
+- Added multithreading; enable it via the consumer_threads config option
 ## 1.2.0
 - Add codec suggestion by content-type
 - enrich metadata
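
The new 1.4.0 options can be combined in a pipeline definition. The following is a minimal sketch, not taken from the package itself: the input block name `s3snssqs` and the `queue` setting are assumed from typical usage of this plugin, while `consumer_threads`, `temporary_directory`, `from_sns`, and `delete_on_success` are the options visible in the diff below.

    input {
      s3snssqs {
        queue               => "my-s3-notification-queue"   # assumed option name; SQS queue receiving the S3/SNS notifications
        from_sns            => true                          # S3 -> SNS -> SQS (set false for S3 -> SQS)
        consumer_threads    => 4                             # new in 1.4.0: number of parallel SQS polling threads
        temporary_directory => "/tmp/logstash"               # new in 1.4.0: where objects are downloaded before processing
        delete_on_success   => false
      }
    }

Each consumer thread polls SQS with its own clone of the configured codec, which is why handle_message gains an instance_codec argument in this release.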
data/Gemfile CHANGED
@@ -1,2 +1,11 @@
 source 'https://rubygems.org'
+
 gemspec
+
+logstash_path = ENV["LOGSTASH_PATH"] || "../../logstash"
+use_logstash_source = ENV["LOGSTASH_SOURCE"] && ENV["LOGSTASH_SOURCE"].to_s == "1"
+
+if Dir.exist?(logstash_path) && use_logstash_source
+  gem 'logstash-core', :path => "#{logstash_path}/logstash-core"
+  gem 'logstash-core-plugin-api', :path => "#{logstash_path}/logstash-core-plugin-api"
+end
@@ -6,6 +6,16 @@ require "logstash/timestamp"
 require "logstash/plugin_mixins/aws_config"
 require "logstash/errors"
 require 'logstash/inputs/s3sqs/patch'
+require "aws-sdk"
+require 'cgi'
+
+require 'java'
+java_import java.io.InputStream
+java_import java.io.InputStreamReader
+java_import java.io.FileInputStream
+java_import java.io.BufferedReader
+java_import java.util.zip.GZIPInputStream
+java_import java.util.zip.ZipException

 Aws.eager_autoload!

@@ -96,6 +106,10 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
   config :delete_on_success, :validate => :boolean, :default => false
   # Whether the event is processed though an SNS to SQS. (S3>SNS>SQS = true |S3>SQS=false)
   config :from_sns, :validate => :boolean, :default => true
+  # To run in multiple threads use this
+  config :consumer_threads, :validate => :number, :default => 1
+  config :temporary_directory, :validate => :string, :default => File.join(Dir.tmpdir, "logstash")
+

   attr_reader :poller
   attr_reader :s3
@@ -106,26 +120,33 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
     require "logstash/codecs/json"
     require "logstash/codecs/json_lines"
     if content_type == "application/json_lines" then
-      @logger.info("Automatically switching from #{@codec.class.config_name} to json_lines codec", :plugin => self.class.config_name)
-      @codec = LogStash::Codecs::JSONLines.new("charset" => @codec.charset)
+      @logger.info("Automatically switching from #{@codec.class.config_name} to json_lines codec", :plugin => self.class.config_name)
+      @codec = LogStash::Codecs::JSONLines.new("charset" => @codec.charset)
     elsif content_type == "application/json" or key.end_with?(".json") then
-      @logger.info("Automatically switching from #{@codec.class.config_name} to json codec", :plugin => self.class.config_name)
-      @codec = LogStash::Codecs::JSON.new("charset" => @codec.charset)
+      @logger.info("Automatically switching from #{@codec.class.config_name} to json codec", :plugin => self.class.config_name)
+      @codec = LogStash::Codecs::JSON.new("charset" => @codec.charset)
     end
   end

+  public
   def register
-    require "aws-sdk"
-    require 'cgi'
+    require "fileutils"
+    require "digest/md5"
+    require "aws-sdk-resources"
+
+    @runner_threads = []
     @logger.info("Registering SQS input", :queue => @queue)
     setup_queue
+
+    FileUtils.mkdir_p(@temporary_directory) unless Dir.exist?(@temporary_directory)
   end

   def setup_queue
     aws_sqs_client = Aws::SQS::Client.new(aws_options_hash)
     queue_url = aws_sqs_client.get_queue_url({ queue_name: @queue, queue_owner_aws_account_id: @queue_owner_aws_account_id})[:queue_url]
     @poller = Aws::SQS::QueuePoller.new(queue_url, :client => aws_sqs_client)
-    @s3 = Aws::S3::Client.new(aws_options_hash)
+    @s3_client = Aws::S3::Client.new(aws_options_hash)
+    @s3_resource = get_s3object
   rescue Aws::SQS::Errors::ServiceError => e
     @logger.error("Cannot establish connection to Amazon SQS", :error => e)
     raise LogStash::ConfigurationError, "Verify the SQS queue name and your credentials"
@@ -133,17 +154,17 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable

   def polling_options
     {
-      # we will query 1 message at a time, so we can ensure correct error handling if we can't download a single file correctly
-      # (we will throw :skip_delete if download size isn't correct to process the event again later
-      # -> set a reasonable "Default Visibility Timeout" for your queue, so that there's enough time to process the log files)
-      :max_number_of_messages => 1,
-      # we will use the queue's setting, a good value is 10 seconds
-      # (to ensure fast logstash shutdown on the one hand and few api calls on the other hand)
-      :wait_time_seconds => nil,
+      # we will query 1 message at a time, so we can ensure correct error handling if we can't download a single file correctly
+      # (we will throw :skip_delete if download size isn't correct to process the event again later
+      # -> set a reasonable "Default Visibility Timeout" for your queue, so that there's enough time to process the log files)
+      :max_number_of_messages => 1,
+      # we will use the queue's setting, a good value is 10 seconds
+      # (to ensure fast logstash shutdown on the one hand and few api calls on the other hand)
+      :wait_time_seconds => nil,
     }
   end

-  def handle_message(message, queue)
+  def handle_message(message, queue, instance_codec)
     hash = JSON.parse message.body
     @logger.debug("handle_message", :hash => hash, :message => message)
     #If send via sns there is an additional JSON layer
@@ -152,97 +173,239 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
     end
     # there may be test events sent from the s3 bucket which won't contain a Records array,
     # we will skip those events and remove them from queue
-    if hash['Records'] then
-      # typically there will be only 1 record per event, but since it is an array we will
-      # treat it as if there could be more records
-      hash['Records'].each do |record|
-        @logger.debug("We found a record", :record => record)
-        # in case there are any events with Records that aren't s3 object-created events and can't therefore be
-        # processed by this plugin, we will skip them and remove them from queue
-        if record['eventSource'] == EVENT_SOURCE and record['eventName'].start_with?(EVENT_TYPE) then
-          @logger.debug("It is a valid record")
-          bucket = CGI.unescape(record['s3']['bucket']['name'])
-          key = CGI.unescape(record['s3']['object']['key'])
-
-
-          # try download and :skip_delete if it fails
-          begin
-            response = @s3.get_object(
-              bucket: bucket,
-              key: key,
-            )
-          rescue => e
-            @logger.warn("issuing :skip_delete on failed download", :bucket => bucket, :object => key, :error => e)
-            throw :skip_delete
-          end
-
-          # verify downloaded content size
-          if response.content_length == record['s3']['object']['size'] then
-            body = response.body
-            # if necessary unzip
-            if response.content_encoding == "gzip" or record['s3']['object']['key'].end_with?(".gz") then
-              @logger.debug("Ohhh i´ll try to unzip")
-              begin
-                temp = Zlib::GzipReader.new(body)
-              rescue => e
-                @logger.warn("content is marked to be gzipped but can't unzip it, assuming plain text", :bucket => bucket, :object => key, :error => e)
-                temp = body
-              end
-              body = temp
-            end
-            # Make a suggestion for a good codec
-            suggest_codec(response.content_type,record['s3']['object']['key'])
-            # process the plain text content
-            begin
-              lines = body.read.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: "\u2370").split(/\n/)
-              lines.each do |line|
-                @logger.debug("Decorating the event")
-                @codec.decode(line) do |event|
-                  decorate(event)
-
-                  event.set('[@metadata][s3_bucket_name]', record['s3']['bucket']['name'])
-                  event.set('[@metadata][s3_object_key]', record['s3']['object']['key'])
-                  if match=/#{s3_key_prefix}\/?(?<type_folder>.*?)\/.*/.match(key)
-                    event.set('[@metadata][s3_object_folder]', match['type_folder'])
-                  end
-                  queue << event
-                end
-              end
-            rescue => e
-              @logger.warn("issuing :skip_delete on failed plain text processing", :bucket => bucket, :object => key, :error => e)
-              throw :skip_delete
-            end
-
-            # Delete the files from S3
-            begin
-              @s3.delete_object(bucket: bucket, key: key) if @delete_on_success
-            rescue => e
-              @logger.warn("Failed to delete S3 object", :bucket => bucket, :object => key, :error => e)
-            end
-          # otherwise try again later
-          else
-            @logger.warn("issuing :skip_delete on wrong download content size", :bucket => bucket, :object => key,
-              :download_size => response.content_length, :expected => record['s3']['object']['size'])
-            throw :skip_delete
-          end
+    if hash['Records'] then
+      # typically there will be only 1 record per event, but since it is an array we will
+      # treat it as if there could be more records
+      hash['Records'].each do |record|
+        @logger.debug("We found a record", :record => record)
+        # in case there are any events with Records that aren't s3 object-created events and can't therefore be
+        # processed by this plugin, we will skip them and remove them from queue
+        if record['eventSource'] == EVENT_SOURCE and record['eventName'].start_with?(EVENT_TYPE) then
+          @logger.debug("It is a valid record")
+          bucket = CGI.unescape(record['s3']['bucket']['name'])
+          key = CGI.unescape(record['s3']['object']['key'])
+
+          # try download and :skip_delete if it fails
+          #if record['s3']['object']['size'] < 10000000 then
+          process_log(bucket, key, instance_codec, queue)
+          #else
+          #  @logger.info("Your file is too big")
+          #end
+        end
+      end
+    end
+  end
+
+  private
+  def process_log(bucket , key, instance_codec, queue)
+    s3bucket = @s3_resource.bucket(bucket)
+    @logger.debug("Lets go reading file", :bucket => bucket, :key => key)
+    object = s3bucket.object(key)
+    filename = File.join(temporary_directory, File.basename(key))
+    if download_remote_file(object, filename)
+      if process_local_log( filename, key, instance_codec, queue)
+        delete_file_from_bucket(object)
+        FileUtils.remove_entry_secure(filename, true)
+      end
+    else
+      FileUtils.remove_entry_secure(filename, true)
+    end
+  end
+
+  private
+  # Stream the remove file to the local disk
+  #
+  # @param [S3Object] Reference to the remove S3 objec to download
+  # @param [String] The Temporary filename to stream to.
+  # @return [Boolean] True if the file was completely downloaded
+  def download_remote_file(remote_object, local_filename)
+    completed = false
+    @logger.debug("S3 input: Download remote file", :remote_key => remote_object.key, :local_filename => local_filename)
+    File.open(local_filename, 'wb') do |s3file|
+      return completed if stop?
+      remote_object.get(:response_target => s3file)
+    end
+    completed = true
+
+    return completed
+  end
+
+  private
+
+  # Read the content of the local file
+  #
+  # @param [Queue] Where to push the event
+  # @param [String] Which file to read from
+  # @return [Boolean] True if the file was completely read, false otherwise.
+  def process_local_log(filename, key, instance_codec, queue)
+    @logger.debug('Processing file', :filename => filename)
+    metadata = {}
+    i=1
+    # Currently codecs operates on bytes instead of stream.
+    # So all IO stuff: decompression, reading need to be done in the actual
+    # input and send as bytes to the codecs.
+    read_file(filename) do |line|
+      if stop?
+        @logger.warn("Logstash S3 input, stop reading in the middle of the file, we will read it again when logstash is started")
+        return false
+      end
+      #@logger.info("read line #{i}", :line => line)
+      #line = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: "\u2370")
+      instance_codec.decode(line) do |event|
+        #@logger.info("decorate event")
+        # We are making an assumption concerning cloudfront
+        # log format, the user will use the plain or the line codec
+        # and the message key will represent the actual line content.
+        # If the event is only metadata the event will be drop.
+        # This was the behavior of the pre 1.5 plugin.
+        #
+        # The line need to go through the codecs to replace
+        # unknown bytes in the log stream before doing a regexp match or
+        # you will get a `Error: invalid byte sequence in UTF-8'
+        #event = LogStash::Event.new("message" => @message)
+        if event_is_metadata?(event)
+          @logger.debug('Event is metadata, updating the current cloudfront metadata', :event => event)
+          update_metadata(metadata, event)
+        else
+
+          decorate(event)
+
+          event.set("cloudfront_version", metadata[:cloudfront_version]) unless metadata[:cloudfront_version].nil?
+          event.set("cloudfront_fields", metadata[:cloudfront_fields]) unless metadata[:cloudfront_fields].nil?
+
+          event.set("[@metadata][s3]", { "key" => key })
+
+          if match=/#{s3_key_prefix}\/?(?<type_folder>.*?)\/.*/.match(key)
+            event.set('[@metadata][s3_object_folder]', match['type_folder'])
          end
+          #@logger.info("queue event #{i}")
+          #i += 1
+          queue << event
        end
      end
+    end
+    #@logger.info("event pre flush", :event => event)
+    # #ensure any stateful codecs (such as multi-line ) are flushed to the queue
+    instance_codec.flush do |event|
+      queue << event
+    end
+
+    return true
+  end # def process_local_log
+
+  private
+  def read_file(filename, &block)
+    if gzip?(filename)
+      read_gzip_file(filename, block)
+    else
+      read_plain_file(filename, block)
+    end
+  end
+
+  def read_plain_file(filename, block)
+    File.open(filename, 'rb') do |file|
+      file.each(&block)
+    end
+  end
+
+  private
+  def read_gzip_file(filename, block)
+    file_stream = FileInputStream.new(filename)
+    gzip_stream = GZIPInputStream.new(file_stream)
+    decoder = InputStreamReader.new(gzip_stream, "UTF-8")
+    buffered = BufferedReader.new(decoder)
+
+    while (line = buffered.readLine())
+      block.call(line)
+    end
+  rescue ZipException => e
+    @logger.error("Gzip codec: We cannot uncompress the gzip file", :filename => filename)
+    raise e
+  ensure
+    buffered.close unless buffered.nil?
+    decoder.close unless decoder.nil?
+    gzip_stream.close unless gzip_stream.nil?
+    file_stream.close unless file_stream.nil?
+  end
+
+  private
+  def gzip?(filename)
+    filename.end_with?('.gz','.gzip')
  end


+
+  private
+  def delete_file_from_bucket(object)
+    if @delete_on_success
+      object.delete()
+    end
+  end
+
+  private
+  def get_s3object
+    s3 = Aws::S3::Resource.new(client: @s3_client)
+  end
+
+  private
+  def event_is_metadata?(event)
+    return false unless event.get("message").class == String
+    line = event.get("message")
+    version_metadata?(line) || fields_metadata?(line)
+  end
+
+  private
+  def version_metadata?(line)
+    line.start_with?('#Version: ')
+  end
+
+  private
+  def fields_metadata?(line)
+    line.start_with?('#Fields: ')
+  end
+
+  private
+  def update_metadata(metadata, event)
+    line = event.get('message').strip
+
+    if version_metadata?(line)
+      metadata[:cloudfront_version] = line.split(/#Version: (.+)/).last
+    end
+
+    if fields_metadata?(line)
+      metadata[:cloudfront_fields] = line.split(/#Fields: (.+)/).last
+    end
+  end
+
+  public
   def run(queue)
     # ensure we can stop logstash correctly
-    poller.before_request do |stats|
-      if stop? then
-        @logger.warn("issuing :stop_polling on stop?", :queue => @queue)
-        # this can take up to "Receive Message Wait Time" (of the sqs queue) seconds to be recognized
-        throw :stop_polling
-      end
-    end
-    # poll a message and process it
-    run_with_backoff do
-      poller.poll(polling_options) do |message|
-        handle_message(message, queue)
+    @runner_threads = consumer_threads.times.map { |consumer| thread_runner(queue) }
+    @runner_threads.each { |t| t.join }
+  end
+
+  public
+  def stop
+    @runner_threads.each { |c| c.wakeup }
+  end
+
+  private
+  def thread_runner(queue)
+    Thread.new do
+      @logger.info("Starting new thread")
+      begin
+        poller.before_request do |stats|
+          if stop? then
+            @logger.warn("issuing :stop_polling on stop?", :queue => @queue)
+            # this can take up to "Receive Message Wait Time" (of the sqs queue) seconds to be recognized
+            throw :stop_polling
+          end
+        end
+        # poll a message and process it
+        run_with_backoff do
+          poller.poll(polling_options) do |message|
+            handle_message(message, queue, @codec.clone)
+          end
+        end
      end
    end
  end
@@ -266,5 +429,4 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
       retry
     end
   end
-
- end # class
+end # class
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name = 'logstash-input-s3-sns-sqs'
-  s.version = '1.2.0'
+  s.version = '1.4.0'
   s.licenses = ['Apache License (2.0)']
   s.summary = "Get logs from AWS s3 buckets as issued by an object-created event via sns -> sqs."
   s.description = "This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program"
@@ -19,11 +19,12 @@ Gem::Specification.new do |s|
   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "input" }

   # Gem dependencies
-  s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
+  s.add_runtime_dependency "logstash-core-plugin-api", ">= 2.1.12", "<= 2.99"

   s.add_runtime_dependency 'logstash-codec-json'
-  s.add_runtime_dependency "logstash-mixin-aws", ">= 1.0.0"
+  s.add_runtime_dependency "logstash-mixin-aws"

   s.add_development_dependency 'logstash-devutils'
+
 end

metadata CHANGED
@@ -1,21 +1,21 @@
 --- !ruby/object:Gem::Specification
 name: logstash-input-s3-sns-sqs
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 1.4.0
 platform: ruby
 authors:
 - Christian Herweg
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-02-19 00:00:00.000000000 Z
+date: 2018-03-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - '>='
       - !ruby/object:Gem::Version
-        version: '1.60'
+        version: 2.1.12
     - - <=
       - !ruby/object:Gem::Version
         version: '2.99'
@@ -26,7 +26,7 @@ dependencies:
     requirements:
     - - '>='
       - !ruby/object:Gem::Version
-        version: '1.60'
+        version: 2.1.12
     - - <=
       - !ruby/object:Gem::Version
         version: '2.99'
@@ -49,7 +49,7 @@ dependencies:
     requirements:
     - - '>='
       - !ruby/object:Gem::Version
-        version: 1.0.0
+        version: '0'
   name: logstash-mixin-aws
   prerelease: false
   type: :runtime
@@ -57,7 +57,7 @@ dependencies:
     requirements:
     - - '>='
       - !ruby/object:Gem::Version
-        version: 1.0.0
+        version: '0'
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
     requirements: