logstash-input-s3-sns-sqs 1.2.0 → 1.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 8b7f924e911ea9f2379c0b8f2415c4dbd8aa6f16
- data.tar.gz: c9c44b0742020e1e911e01a9f5bad9bca55857ee
+ metadata.gz: fa00cb9383c9782647f404283f10281febb31622
+ data.tar.gz: a8fe32f0d3668c893ff5ef5cd253e02dee8f4fd5
  SHA512:
- metadata.gz: 4b2f60bbe55ff50d580f7524faa78a30235ace54f8770a54baa97009aad0041d334edd5bc87822c84714142a734e86cbf859947aa6f7200f4a1c0992521a3cf3
- data.tar.gz: d635defff1a48274967afdcd358fff1da8f9a91cde99729558f07c1b238cff866e179ab2a7122f24dc12a547d1d784d5c5736460b5afa9ed4cb76c4c8f4c3bcd
+ metadata.gz: a602a1a99073817666f83f0f0194e953c3ee6a75efd5e4e2faf4a44a1d19323133ea0c96e6fab7d306ec75f2f6384c936f2f1a134b6639e4ffe645c76af727f4
+ data.tar.gz: 8652e9ecc5c3b9342dabb842c8258adcf52f28d5184dae8f3e1bd27ffed0bac0333a79f2055be4a55bbb2e0849db8ac5426d6936fafe1c96ac25a1701038d58f
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+ ## 1.4.0
+ - File handling rewritten; thanks to logstash-input-s3 for the inspiration
+ - Improve gzip decoding performance by roughly 10x by using Java's zlib (GZIPInputStream)
+ - Add multithreading; set `consumer_threads` in the config
  ## 1.2.0
  - Add codec suggestion by content-type
  - enrich metadata
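The ~10x gzip claim above comes from swapping Ruby's Zlib::GzipReader for java.util.zip.GZIPInputStream (see read_gzip_file in the plugin diff below). A minimal JRuby micro-benchmark sketch for checking the claim on your own data; it assumes Logstash's JRuby runtime, and sample.log.gz is a placeholder:

    require 'benchmark'
    require 'zlib'
    require 'java'

    java_import java.io.FileInputStream
    java_import java.io.InputStreamReader
    java_import java.io.BufferedReader
    java_import java.util.zip.GZIPInputStream

    file = 'sample.log.gz' # placeholder: any sizable gzipped log file

    Benchmark.bm(10) do |bm|
      # pure-Ruby decompression, as used before 1.4.0
      bm.report('ruby zlib') do
        Zlib::GzipReader.open(file) { |gz| gz.each_line { |_l| } }
      end
      # Java streams, as used by read_gzip_file in 1.4.0
      bm.report('java zlib') do
        reader = BufferedReader.new(
          InputStreamReader.new(GZIPInputStream.new(FileInputStream.new(file)), 'UTF-8'))
        while reader.readLine; end
        reader.close
      end
    end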
data/Gemfile CHANGED
@@ -1,2 +1,11 @@
  source 'https://rubygems.org'
+
  gemspec
+
+ logstash_path = ENV["LOGSTASH_PATH"] || "../../logstash"
+ use_logstash_source = ENV["LOGSTASH_SOURCE"] && ENV["LOGSTASH_SOURCE"].to_s == "1"
+
+ if Dir.exist?(logstash_path) && use_logstash_source
+   gem 'logstash-core', :path => "#{logstash_path}/logstash-core"
+   gem 'logstash-core-plugin-api', :path => "#{logstash_path}/logstash-core-plugin-api"
+ end
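A note on the new Gemfile block: it lets Bundler resolve logstash-core from a local Logstash source tree instead of rubygems.org. With a checkout at the default location above, running bundle install with LOGSTASH_SOURCE=1 in the environment activates the two :path gems; set LOGSTASH_PATH when the checkout lives elsewhere.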
data/lib/logstash/inputs/s3snssqs.rb CHANGED
@@ -6,6 +6,16 @@ require "logstash/timestamp"
  require "logstash/plugin_mixins/aws_config"
  require "logstash/errors"
  require 'logstash/inputs/s3sqs/patch'
+ require "aws-sdk"
+ require 'cgi'
+
+ require 'java'
+ java_import java.io.InputStream
+ java_import java.io.InputStreamReader
+ java_import java.io.FileInputStream
+ java_import java.io.BufferedReader
+ java_import java.util.zip.GZIPInputStream
+ java_import java.util.zip.ZipException
 
  Aws.eager_autoload!
 
@@ -96,6 +106,10 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
  config :delete_on_success, :validate => :boolean, :default => false
  # Whether the event is processed through SNS to SQS. (S3>SNS>SQS = true | S3>SQS = false)
  config :from_sns, :validate => :boolean, :default => true
+ # To run in multiple threads, use this
+ config :consumer_threads, :validate => :number, :default => 1
+ config :temporary_directory, :validate => :string, :default => File.join(Dir.tmpdir, "logstash")
+
 
  attr_reader :poller
  attr_reader :s3
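consumer_threads and temporary_directory are the only new options; the rest of the configuration is unchanged. A minimal sketch of wiring them up from Ruby (for example in a spec) — the queue name and thread count are hypothetical, and register needs valid AWS credentials:

    input = LogStash::Inputs::S3SNSSQS.new(
      "queue"               => "s3-object-created",  # hypothetical SQS queue name
      "from_sns"            => true,                 # S3 -> SNS -> SQS chain
      "consumer_threads"    => 4,                    # new in 1.4.0: number of polling threads
      "temporary_directory" => "/tmp/logstash"       # new in 1.4.0: where S3 objects are staged
    )
    input.register

The same keys go inside the input's block in a pipeline configuration file.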
@@ -106,26 +120,33 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
  require "logstash/codecs/json"
  require "logstash/codecs/json_lines"
  if content_type == "application/json_lines" then
-   @logger.info("Automatically switching from #{@codec.class.config_name} to json_lines codec", :plugin => self.class.config_name)
-   @codec = LogStash::Codecs::JSONLines.new("charset" => @codec.charset)
+   @logger.info("Automatically switching from #{@codec.class.config_name} to json_lines codec", :plugin => self.class.config_name)
+   @codec = LogStash::Codecs::JSONLines.new("charset" => @codec.charset)
  elsif content_type == "application/json" or key.end_with?(".json") then
-   @logger.info("Automatically switching from #{@codec.class.config_name} to json codec", :plugin => self.class.config_name)
-   @codec = LogStash::Codecs::JSON.new("charset" => @codec.charset)
+   @logger.info("Automatically switching from #{@codec.class.config_name} to json codec", :plugin => self.class.config_name)
+   @codec = LogStash::Codecs::JSON.new("charset" => @codec.charset)
  end
  end
 
+ public
  def register
-   require "aws-sdk"
-   require 'cgi'
+   require "fileutils"
+   require "digest/md5"
+   require "aws-sdk-resources"
+
+   @runner_threads = []
    @logger.info("Registering SQS input", :queue => @queue)
    setup_queue
+
+   FileUtils.mkdir_p(@temporary_directory) unless Dir.exist?(@temporary_directory)
  end
 
  def setup_queue
    aws_sqs_client = Aws::SQS::Client.new(aws_options_hash)
    queue_url = aws_sqs_client.get_queue_url({ queue_name: @queue, queue_owner_aws_account_id: @queue_owner_aws_account_id })[:queue_url]
    @poller = Aws::SQS::QueuePoller.new(queue_url, :client => aws_sqs_client)
-   @s3 = Aws::S3::Client.new(aws_options_hash)
+   @s3_client = Aws::S3::Client.new(aws_options_hash)
+   @s3_resource = get_s3object
  rescue Aws::SQS::Errors::ServiceError => e
    @logger.error("Cannot establish connection to Amazon SQS", :error => e)
    raise LogStash::ConfigurationError, "Verify the SQS queue name and your credentials"
@@ -133,17 +154,17 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
 
  def polling_options
    {
-     # we will query 1 message at a time, so we can ensure correct error handling if we can't download a single file correctly
-     # (we will throw :skip_delete if download size isn't correct to process the event again later
-     # -> set a reasonable "Default Visibility Timeout" for your queue, so that there's enough time to process the log files)
-     :max_number_of_messages => 1,
-     # we will use the queue's setting, a good value is 10 seconds
-     # (to ensure fast logstash shutdown on the one hand and few api calls on the other hand)
-     :wait_time_seconds => nil,
+     # we will query 1 message at a time, so we can ensure correct error handling if we can't download a single file correctly
+     # (we will throw :skip_delete if the download size isn't correct, to process the event again later
+     # -> set a reasonable "Default Visibility Timeout" for your queue, so that there's enough time to process the log files)
+     :max_number_of_messages => 1,
+     # we will use the queue's setting, a good value is 10 seconds
+     # (to ensure fast logstash shutdown on the one hand and few api calls on the other hand)
+     :wait_time_seconds => nil,
    }
  end
 
- def handle_message(message, queue)
+ def handle_message(message, queue, instance_codec)
    hash = JSON.parse message.body
    @logger.debug("handle_message", :hash => hash, :message => message)
    # If sent via SNS there is an additional JSON layer
@@ -152,97 +173,239 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
    end
    # there may be test events sent from the s3 bucket which won't contain a Records array,
    # we will skip those events and remove them from queue
-   if hash['Records'] then
-     # typically there will be only 1 record per event, but since it is an array we will
-     # treat it as if there could be more records
-     hash['Records'].each do |record|
-       @logger.debug("We found a record", :record => record)
-       # in case there are any events with Records that aren't s3 object-created events and can't therefore be
-       # processed by this plugin, we will skip them and remove them from queue
-       if record['eventSource'] == EVENT_SOURCE and record['eventName'].start_with?(EVENT_TYPE) then
-         @logger.debug("It is a valid record")
-         bucket = CGI.unescape(record['s3']['bucket']['name'])
-         key = CGI.unescape(record['s3']['object']['key'])
-
-
-         # try download and :skip_delete if it fails
-         begin
-           response = @s3.get_object(
-             bucket: bucket,
-             key: key,
-           )
-         rescue => e
-           @logger.warn("issuing :skip_delete on failed download", :bucket => bucket, :object => key, :error => e)
-           throw :skip_delete
-         end
-
-         # verify downloaded content size
-         if response.content_length == record['s3']['object']['size'] then
-           body = response.body
-           # if necessary unzip
-           if response.content_encoding == "gzip" or record['s3']['object']['key'].end_with?(".gz") then
-             @logger.debug("Ohhh i´ll try to unzip")
-             begin
-               temp = Zlib::GzipReader.new(body)
-             rescue => e
-               @logger.warn("content is marked to be gzipped but can't unzip it, assuming plain text", :bucket => bucket, :object => key, :error => e)
-               temp = body
-             end
-             body = temp
-           end
-           # Make a suggestion for a good codec
-           suggest_codec(response.content_type,record['s3']['object']['key'])
-           # process the plain text content
-           begin
-             lines = body.read.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: "\u2370").split(/\n/)
-             lines.each do |line|
-               @logger.debug("Decorating the event")
-               @codec.decode(line) do |event|
-                 decorate(event)
-
-                 event.set('[@metadata][s3_bucket_name]', record['s3']['bucket']['name'])
-                 event.set('[@metadata][s3_object_key]', record['s3']['object']['key'])
-                 if match=/#{s3_key_prefix}\/?(?<type_folder>.*?)\/.*/.match(key)
-                   event.set('[@metadata][s3_object_folder]', match['type_folder'])
-                 end
-                 queue << event
-               end
-             end
-           rescue => e
-             @logger.warn("issuing :skip_delete on failed plain text processing", :bucket => bucket, :object => key, :error => e)
-             throw :skip_delete
-           end
-
-           # Delete the files from S3
-           begin
-             @s3.delete_object(bucket: bucket, key: key) if @delete_on_success
-           rescue => e
-             @logger.warn("Failed to delete S3 object", :bucket => bucket, :object => key, :error => e)
-           end
-         # otherwise try again later
-         else
-           @logger.warn("issuing :skip_delete on wrong download content size", :bucket => bucket, :object => key,
-             :download_size => response.content_length, :expected => record['s3']['object']['size'])
-           throw :skip_delete
-         end
+   if hash['Records'] then
+     # typically there will be only 1 record per event, but since it is an array we will
+     # treat it as if there could be more records
+     hash['Records'].each do |record|
+       @logger.debug("We found a record", :record => record)
+       # in case there are any events with Records that aren't s3 object-created events and can't therefore be
+       # processed by this plugin, we will skip them and remove them from queue
+       if record['eventSource'] == EVENT_SOURCE and record['eventName'].start_with?(EVENT_TYPE) then
+         @logger.debug("It is a valid record")
+         bucket = CGI.unescape(record['s3']['bucket']['name'])
+         key = CGI.unescape(record['s3']['object']['key'])
+
+         # try download and :skip_delete if it fails
+         #if record['s3']['object']['size'] < 10000000 then
+         process_log(bucket, key, instance_codec, queue)
+         #else
+         #  @logger.info("Your file is too big")
+         #end
+       end
+     end
+   end
+ end
+
+ private
+ def process_log(bucket, key, instance_codec, queue)
+   s3bucket = @s3_resource.bucket(bucket)
+   @logger.debug("Lets go reading file", :bucket => bucket, :key => key)
+   object = s3bucket.object(key)
+   filename = File.join(temporary_directory, File.basename(key))
+   if download_remote_file(object, filename)
+     if process_local_log(filename, key, instance_codec, queue)
+       delete_file_from_bucket(object)
+       FileUtils.remove_entry_secure(filename, true)
+     end
+   else
+     FileUtils.remove_entry_secure(filename, true)
+   end
+ end
+
+ private
+ # Stream the remote file to the local disk
+ #
+ # @param [S3Object] Reference to the remote S3 object to download
+ # @param [String] The temporary filename to stream to.
+ # @return [Boolean] True if the file was completely downloaded
+ def download_remote_file(remote_object, local_filename)
+   completed = false
+   @logger.debug("S3 input: Download remote file", :remote_key => remote_object.key, :local_filename => local_filename)
+   File.open(local_filename, 'wb') do |s3file|
+     return completed if stop?
+     remote_object.get(:response_target => s3file)
+   end
+   completed = true
+
+   return completed
+ end
+
+ private
+
+ # Read the content of the local file
+ #
+ # @param [Queue] Where to push the events
+ # @param [String] Which file to read from
+ # @return [Boolean] True if the file was completely read, false otherwise.
+ def process_local_log(filename, key, instance_codec, queue)
+   @logger.debug('Processing file', :filename => filename)
+   metadata = {}
+   i = 1
+   # Currently codecs operate on bytes instead of streams,
+   # so all the IO work (decompression, reading) needs to be done in the
+   # actual input and sent as bytes to the codecs.
+   read_file(filename) do |line|
+     if stop?
+       @logger.warn("Logstash S3 input, stop reading in the middle of the file, we will read it again when logstash is started")
+       return false
+     end
+     #@logger.info("read line #{i}", :line => line)
+     #line = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: "\u2370")
+     instance_codec.decode(line) do |event|
+       #@logger.info("decorate event")
+       # We are making an assumption concerning the cloudfront
+       # log format: the user will use the plain or the line codec,
+       # and the message key will represent the actual line content.
+       # If the event is only metadata, the event will be dropped.
+       # This was the behavior of the pre-1.5 plugin.
+       #
+       # The line needs to go through the codecs to replace
+       # unknown bytes in the log stream before doing a regexp match, or
+       # you will get an `Error: invalid byte sequence in UTF-8'
+       #event = LogStash::Event.new("message" => @message)
+       if event_is_metadata?(event)
+         @logger.debug('Event is metadata, updating the current cloudfront metadata', :event => event)
+         update_metadata(metadata, event)
+       else
+
+         decorate(event)
+
+         event.set("cloudfront_version", metadata[:cloudfront_version]) unless metadata[:cloudfront_version].nil?
+         event.set("cloudfront_fields", metadata[:cloudfront_fields]) unless metadata[:cloudfront_fields].nil?
+
+         event.set("[@metadata][s3]", { "key" => key })
+
+         if match=/#{s3_key_prefix}\/?(?<type_folder>.*?)\/.*/.match(key)
+           event.set('[@metadata][s3_object_folder]', match['type_folder'])
          end
+         #@logger.info("queue event #{i}")
+         #i += 1
+         queue << event
        end
      end
+   end
+   #@logger.info("event pre flush", :event => event)
+   # ensure any stateful codecs (such as multiline) are flushed to the queue
+   instance_codec.flush do |event|
+     queue << event
+   end
+
+   return true
+ end # def process_local_log
+
+ private
+ def read_file(filename, &block)
+   if gzip?(filename)
+     read_gzip_file(filename, block)
+   else
+     read_plain_file(filename, block)
+   end
+ end
+
+ def read_plain_file(filename, block)
+   File.open(filename, 'rb') do |file|
+     file.each(&block)
+   end
+ end
+
+ private
+ def read_gzip_file(filename, block)
+   file_stream = FileInputStream.new(filename)
+   gzip_stream = GZIPInputStream.new(file_stream)
+   decoder = InputStreamReader.new(gzip_stream, "UTF-8")
+   buffered = BufferedReader.new(decoder)
+
+   while (line = buffered.readLine())
+     block.call(line)
+   end
+ rescue ZipException => e
+   @logger.error("Gzip codec: We cannot uncompress the gzip file", :filename => filename)
+   raise e
+ ensure
+   buffered.close unless buffered.nil?
+   decoder.close unless decoder.nil?
+   gzip_stream.close unless gzip_stream.nil?
+   file_stream.close unless file_stream.nil?
+ end
+
+ private
+ def gzip?(filename)
+   filename.end_with?('.gz', '.gzip')
  end
 
+
+ private
+ def delete_file_from_bucket(object)
+   if @delete_on_success
+     object.delete()
+   end
+ end
+
+ private
+ def get_s3object
+   s3 = Aws::S3::Resource.new(client: @s3_client)
+ end
+
+ private
+ def event_is_metadata?(event)
+   return false unless event.get("message").class == String
+   line = event.get("message")
+   version_metadata?(line) || fields_metadata?(line)
+ end
+
+ private
+ def version_metadata?(line)
+   line.start_with?('#Version: ')
+ end
+
+ private
+ def fields_metadata?(line)
+   line.start_with?('#Fields: ')
+ end
+
+ private
+ def update_metadata(metadata, event)
+   line = event.get('message').strip
+
+   if version_metadata?(line)
+     metadata[:cloudfront_version] = line.split(/#Version: (.+)/).last
+   end
+
+   if fields_metadata?(line)
+     metadata[:cloudfront_fields] = line.split(/#Fields: (.+)/).last
+   end
+ end
+
+ public
  def run(queue)
    # ensure we can stop logstash correctly
-   poller.before_request do |stats|
-     if stop? then
-       @logger.warn("issuing :stop_polling on stop?", :queue => @queue)
-       # this can take up to "Receive Message Wait Time" (of the sqs queue) seconds to be recognized
-       throw :stop_polling
-     end
-   end
-   # poll a message and process it
-   run_with_backoff do
-     poller.poll(polling_options) do |message|
-       handle_message(message, queue)
+   @runner_threads = consumer_threads.times.map { |consumer| thread_runner(queue) }
+   @runner_threads.each { |t| t.join }
+ end
+
+ public
+ def stop
+   @runner_threads.each { |c| c.wakeup }
+ end
+
+ private
+ def thread_runner(queue)
+   Thread.new do
+     @logger.info("Starting new thread")
+     begin
+       poller.before_request do |stats|
+         if stop? then
+           @logger.warn("issuing :stop_polling on stop?", :queue => @queue)
+           # this can take up to "Receive Message Wait Time" (of the sqs queue) seconds to be recognized
+           throw :stop_polling
+         end
+       end
+       # poll a message and process it
+       run_with_backoff do
+         poller.poll(polling_options) do |message|
+           handle_message(message, queue, @codec.clone)
+         end
+       end
      end
    end
  end
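The update_metadata helper above parses CloudFront's #Version: and #Fields: header lines with String#split on a regexp containing a capture group; Ruby keeps the captured remainder in the result array, so .last yields the value. A quick sketch of that behavior in plain Ruby:

    '#Version: 1.0'.split(/#Version: (.+)/)
    # => ["", "1.0"]
    '#Version: 1.0'.split(/#Version: (.+)/).last
    # => "1.0"
    '#Fields: date time x-edge-location'.split(/#Fields: (.+)/).last
    # => "date time x-edge-location"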
@@ -266,5 +429,4 @@ class LogStash::Inputs::S3SNSSQS < LogStash::Inputs::Threadable
      retry
    end
  end
-
-   end # class
+ end # class
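run now only fans out: it starts consumer_threads pollers via thread_runner, and each poll hands handle_message a fresh @codec.clone. The clone matters because codecs can hold per-stream state (a multiline codec mid-accumulation, for instance), so a single shared instance could interleave lines from different S3 objects. A self-contained sketch of the clone-per-thread pattern — StatefulCodec is illustrative, not the plugin's code:

    # stand-in for a stateful codec such as multiline: it buffers partial input
    class StatefulCodec
      def initialize
        @buffer = []
      end

      # give clones their own buffer; a bare Object#clone would share the array
      def initialize_copy(source)
        super
        @buffer = []
      end

      def decode(line)
        @buffer << line
      end
    end

    template = StatefulCodec.new
    threads = 4.times.map do |n|       # 4 stands in for consumer_threads
      Thread.new do
        codec = template.clone         # per-thread state, like @codec.clone above
        codec.decode("line from file #{n}")
      end
    end
    threads.each(&:join)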
data/logstash-input-s3-sns-sqs.gemspec CHANGED
@@ -1,6 +1,6 @@
  Gem::Specification.new do |s|
    s.name = 'logstash-input-s3-sns-sqs'
-   s.version = '1.2.0'
+   s.version = '1.4.0'
    s.licenses = ['Apache License (2.0)']
    s.summary = "Get logs from AWS s3 buckets as issued by an object-created event via sns -> sqs."
    s.description = "This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program"
@@ -19,11 +19,12 @@ Gem::Specification.new do |s|
    s.metadata = { "logstash_plugin" => "true", "logstash_group" => "input" }
 
    # Gem dependencies
-   s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
+   s.add_runtime_dependency "logstash-core-plugin-api", ">= 2.1.12", "<= 2.99"
 
    s.add_runtime_dependency 'logstash-codec-json'
-   s.add_runtime_dependency "logstash-mixin-aws", ">= 1.0.0"
+   s.add_runtime_dependency "logstash-mixin-aws"
 
    s.add_development_dependency 'logstash-devutils'
+
  end
 
metadata CHANGED
@@ -1,21 +1,21 @@
  --- !ruby/object:Gem::Specification
  name: logstash-input-s3-sns-sqs
  version: !ruby/object:Gem::Version
-   version: 1.2.0
+   version: 1.4.0
  platform: ruby
  authors:
  - Christian Herweg
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-02-19 00:00:00.000000000 Z
+ date: 2018-03-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    requirement: !ruby/object:Gem::Requirement
      requirements:
      - - '>='
        - !ruby/object:Gem::Version
-         version: '1.60'
+         version: 2.1.12
      - - <=
        - !ruby/object:Gem::Version
          version: '2.99'
@@ -26,7 +26,7 @@ dependencies:
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
-       version: '1.60'
+       version: 2.1.12
    - - <=
      - !ruby/object:Gem::Version
        version: '2.99'
@@ -49,7 +49,7 @@ dependencies:
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
-       version: 1.0.0
+       version: '0'
    name: logstash-mixin-aws
    prerelease: false
    type: :runtime
@@ -57,7 +57,7 @@ dependencies:
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
-       version: 1.0.0
+       version: '0'
  - !ruby/object:Gem::Dependency
    requirement: !ruby/object:Gem::Requirement
    requirements: