logstash-input-s3-cloudian 1.0.0

@@ -0,0 +1,465 @@
+ # encoding: utf-8
+ require "logstash/inputs/base"
+ require "logstash/namespace"
+ require "logstash/plugin_mixins/aws_config"
+ require "time"
+ require "date"
+ require "tmpdir"
+ require "stud/interval"
+ require "stud/temporary"
+ require "aws-sdk"
+ require "logstash/inputs/s3/patch"
+ require "logstash/plugin_mixins/ecs_compatibility_support"
+
+ require 'java'
+
+ Aws.eager_autoload!
+ # Stream events from files in an S3 bucket.
+ #
+ # Each line from each file generates an event.
+ # Files ending in `.gz` are handled as gzip'ed files.
+ class LogStash::Inputs::S3 < LogStash::Inputs::Base
+
+   java_import java.io.InputStream
+   java_import java.io.InputStreamReader
+   java_import java.io.FileInputStream
+   java_import java.io.BufferedReader
+   java_import java.util.zip.GZIPInputStream
+   java_import java.util.zip.ZipException
+
+   include LogStash::PluginMixins::AwsConfig::V2
+   include LogStash::PluginMixins::ECSCompatibilitySupport(:disabled, :v1, :v8 => :v1)
+
+   config_name "s3"
+
+   default :codec, "plain"
+
+   # The name of the S3 bucket.
+   config :bucket, :validate => :string, :required => true
+
+   # If specified, object keys in the bucket must start with this prefix (a literal string, not a regexp).
+   config :prefix, :validate => :string, :default => nil
+
+   config :additional_settings, :validate => :hash, :default => {}
+
+   # The path to use for writing state. The state stored by this plugin is
+   # a memory of files already processed by this plugin.
+   #
+   # If not specified, the default is in `{path.data}/plugins/inputs/s3/...`
+   #
+   # Should be a path including the filename, not just a directory.
+   config :sincedb_path, :validate => :string, :default => nil
+
+   # Name of an S3 bucket to backup processed files to.
+   config :backup_to_bucket, :validate => :string, :default => nil
+
+   # Append a prefix to the key (full path including file name in S3) after processing.
+   # If backing up to another (or the same) bucket, this effectively lets you
+   # choose a new 'folder' to place the files in.
+   config :backup_add_prefix, :validate => :string, :default => nil
+
+   # Path of a local directory to backup processed files to.
+   config :backup_to_dir, :validate => :string, :default => nil
+
+   # Whether to delete processed files from the original bucket.
+   config :delete, :validate => :boolean, :default => false
+
+   # Interval to wait, in seconds, before checking the file list again after a run is finished.
+   config :interval, :validate => :number, :default => 60
+
+   # Whether to keep polling for new files on each interval.
+   # If false, any interval is ignored and the S3 bucket is listed only once.
+   config :watch_for_new_files, :validate => :boolean, :default => true
+
+   # Ruby-style regexp of keys to exclude from the bucket.
+   config :exclude_pattern, :validate => :string, :default => nil
+
+   # Set the directory where Logstash will store the tmp files before processing them.
+   # Defaults to a "logstash" folder under the OS temporary directory (e.g. /tmp/logstash on Linux).
+   config :temporary_directory, :validate => :string, :default => File.join(Dir.tmpdir, "logstash")
+
+   # Whether or not to include the S3 object's properties (last_modified, content_type, metadata)
+   # in each Event at [@metadata][s3]. Regardless of this setting, [@metadata][s3][key] will always
+   # be present.
+   config :include_object_properties, :validate => :boolean, :default => false
+
+   # Regular expression used to determine whether an input file is in gzip format.
+   # Defaults to an expression that matches *.gz and *.gzip file extensions.
+   config :gzip_pattern, :validate => :string, :default => "\.gz(ip)?$"
+
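For reference, a minimal sketch of constructing the input with these options, the way plugin specs typically do. All values below are hypothetical; AWS credentials and region come from the aws_config mixin:

    input = LogStash::Inputs::S3.new(
      "bucket"   => "my-log-bucket",
      "prefix"   => "logs/",
      "interval" => 30,
      "delete"   => false
    )
    input.register   # validates options and creates the temporary directory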
+   # Objects whose last_modified falls within this many seconds of "now" are
+   # deferred to the next polling cycle, to avoid reading objects still being written.
+   CUTOFF_SECOND = 3
+
+   def initialize(*params)
+     super
+     @cloudfront_fields_key = ecs_select[disabled: 'cloudfront_fields', v1: '[@metadata][s3][cloudfront][fields]']
+     @cloudfront_version_key = ecs_select[disabled: 'cloudfront_version', v1: '[@metadata][s3][cloudfront][version]']
+   end
+
+   def register
+     require "fileutils"
+     require "digest/md5"
+     require "aws-sdk-resources"
+
+     @logger.info("Registering", :bucket => @bucket, :region => @region)
+
+     s3 = get_s3object
+
+     @s3bucket = s3.bucket(@bucket)
+
+     unless @backup_to_bucket.nil?
+       @backup_bucket = s3.bucket(@backup_to_bucket)
+       begin
+         s3.client.head_bucket({ :bucket => @backup_to_bucket })
+       rescue Aws::S3::Errors::NoSuchBucket
+         s3.create_bucket({ :bucket => @backup_to_bucket })
+       end
+     end
+
+     unless @backup_to_dir.nil?
+       Dir.mkdir(@backup_to_dir, 0700) unless File.exist?(@backup_to_dir)
+     end
+
+     FileUtils.mkdir_p(@temporary_directory) unless Dir.exist?(@temporary_directory)
+
+     if !@watch_for_new_files && original_params.include?('interval')
+       logger.warn("`watch_for_new_files` has been disabled; `interval` directive will be ignored.")
+     end
+   end
+
+   def run(queue)
+     @current_thread = Thread.current
+     Stud.interval(@interval) do
+       process_files(queue)
+       stop unless @watch_for_new_files
+     end
+   end # def run
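`Stud.interval` drives the polling loop: it runs the block, sleeps out the remainder of the interval, and repeats until `Stud.stop!` is requested on the thread. A minimal sketch of those semantics (block body hypothetical):

    require "stud/interval"

    thread = Thread.new do
      Stud.interval(60) { puts "polling S3..." }  # block runs roughly once per 60s
    end
    sleep 5
    Stud.stop!(thread)  # what #stop does below: wakes the thread and ends the loop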
+
+   def list_new_files
+     objects = []
+     found = false
+     current_time = Time.now
+     sincedb_time = sincedb.read
+     begin
+       @s3bucket.objects(:prefix => @prefix).each do |log|
+         found = true
+         @logger.debug('Found key', :key => log.key)
+         if ignore_filename?(log.key)
+           @logger.debug('Ignoring', :key => log.key)
+         elsif log.content_length <= 0
+           @logger.debug('Object Zero Length', :key => log.key)
+         elsif log.last_modified <= sincedb_time
+           @logger.debug('Object Not Modified', :key => log.key)
+         elsif log.last_modified > (current_time - CUTOFF_SECOND).utc # objects modified within the cutoff window are processed in the next cycle
+           @logger.debug('Object Modified After Cutoff Time', :key => log.key)
+         elsif (log.storage_class == 'GLACIER' || log.storage_class == 'DEEP_ARCHIVE') && !file_restored?(log.object)
+           @logger.debug('Object Archived to Glacier', :key => log.key)
+         else
+           objects << log
+           @logger.debug("Added to objects[]", :key => log.key, :length => objects.length)
+         end
+       end
+       @logger.info('No files found in bucket', :prefix => prefix) unless found
+     rescue Aws::Errors::ServiceError => e
+       @logger.error("Unable to list objects in bucket", :exception => e.class, :message => e.message, :backtrace => e.backtrace, :prefix => prefix)
+     end
+     objects.sort_by { |log| log.last_modified }
+   end # def list_new_files
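A worked example of the filtering above, with hypothetical timestamps (within the class, so `CUTOFF_SECOND` resolves):

    sincedb_time = Time.utc(2023, 1, 1, 10, 0, 0)      # watermark from the previous run
    current_time = Time.utc(2023, 1, 1, 10, 5, 0)      # when the listing happens
    cutoff       = (current_time - CUTOFF_SECOND).utc  # 10:04:57
    # last_modified <= 10:00:00 -> skipped, already processed
    # last_modified >  10:04:57 -> deferred, may still be being written
    # anything in between       -> queued, oldest first (objects.sort_by)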
+
+   def backup_to_bucket(object)
+     unless @backup_to_bucket.nil?
+       backup_key = "#{@backup_add_prefix}#{object.key}"
+       @backup_bucket.object(backup_key).copy_from(:copy_source => "#{object.bucket_name}/#{object.key}")
+       if @delete
+         object.delete()
+       end
+     end
+   end
+
+   def backup_to_dir(filename)
+     unless @backup_to_dir.nil?
+       FileUtils.cp(filename, @backup_to_dir)
+     end
+   end
+
+   def process_files(queue)
+     objects = list_new_files
+
+     objects.each do |log|
+       if stop?
+         break
+       else
+         process_log(queue, log)
+       end
+     end
+   end # def process_files
+
+   def stop
+     # @current_thread is initialized in the `#run` method.
+     # This variable is needed because `#stop` is called from a different thread
+     # than `#run`, requiring us to call `Stud.stop!` with an explicit thread.
+     Stud.stop!(@current_thread)
+   end
+
+   private
+
+   # Read the content of the local file.
+   #
+   # @param [Queue] queue Where to push the events
+   # @param [String] filename Which file to read from
+   # @param [S3Object] object Source S3 object
+   # @return [Boolean] True if the file was completely read, false otherwise.
+   def process_local_log(queue, filename, object)
+     @logger.debug('Processing file', :filename => filename)
+     metadata = {}
+     # Codecs currently operate on bytes instead of streams, so all IO work
+     # (decompression, reading) needs to be done in the input itself and
+     # handed to the codec as bytes.
+     read_file(filename) do |line|
+       if stop?
+         @logger.warn("Logstash S3 input: stopped reading in the middle of the file; it will be read again when Logstash is restarted")
+         return false
+       end
+
+       @codec.decode(line) do |event|
+         # We are making an assumption concerning the CloudFront log format:
+         # the user will use the plain or the line codec, and the message key
+         # will hold the actual line content.
+         # If the event is only metadata, the event is dropped.
+         # This was the behavior of the pre-1.5 plugin.
+         #
+         # The line needs to go through the codec to replace unknown bytes
+         # in the log stream before doing a regexp match, or you will get an
+         # `Error: invalid byte sequence in UTF-8`.
+         if event_is_metadata?(event)
+           @logger.debug('Event is metadata, updating the current cloudfront metadata', :event => event)
+           update_metadata(metadata, event)
+         else
+           push_decoded_event(queue, metadata, object, event)
+         end
+       end
+     end
+     # Ensure any stateful codecs (such as multiline) are flushed to the queue.
+     @codec.flush do |event|
+       push_decoded_event(queue, metadata, object, event)
+     end
+
+     return true
+   end # def process_local_log
+
+   def push_decoded_event(queue, metadata, object, event)
+     decorate(event)
+
+     if @include_object_properties
+       event.set("[@metadata][s3]", object.data.to_h)
+     else
+       event.set("[@metadata][s3]", {})
+     end
+
+     event.set("[@metadata][s3][key]", object.key)
+     event.set(@cloudfront_version_key, metadata[:cloudfront_version]) unless metadata[:cloudfront_version].nil?
+     event.set(@cloudfront_fields_key, metadata[:cloudfront_fields]) unless metadata[:cloudfront_fields].nil?
+
+     queue << event
+   end
+
+   def event_is_metadata?(event)
+     return false unless event.get("message").class == String
+     line = event.get("message")
+     version_metadata?(line) || fields_metadata?(line)
+   end
+
+   def version_metadata?(line)
+     line.start_with?('#Version: ')
+   end
+
+   def fields_metadata?(line)
+     line.start_with?('#Fields: ')
+   end
+
+   def update_metadata(metadata, event)
+     line = event.get('message').strip
+
+     if version_metadata?(line)
+       metadata[:cloudfront_version] = line.split(/#Version: (.+)/).last
+     end
+
+     if fields_metadata?(line)
+       metadata[:cloudfront_fields] = line.split(/#Fields: (.+)/).last
+     end
+   end
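The `split`-with-capture-group trick above returns the directive's value as the last element of the resulting array; for instance:

    "#Version: 1.0".split(/#Version: (.+)/).last
    # => "1.0"
    "#Fields: date time x-edge-location".split(/#Fields: (.+)/).last
    # => "date time x-edge-location"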
+
+   def read_file(filename, &block)
+     if gzip?(filename)
+       read_gzip_file(filename, block)
+     else
+       read_plain_file(filename, block)
+     end
+   rescue => e
+     # Skip any broken file.
+     @logger.error("Failed to read file, processing skipped", :exception => e.class, :message => e.message, :filename => filename)
+   end
+
+   def read_plain_file(filename, block)
+     File.open(filename, 'rb') do |file|
+       file.each(&block)
+     end
+   end
+
+   def read_gzip_file(filename, block)
+     file_stream = FileInputStream.new(filename)
+     gzip_stream = GZIPInputStream.new(file_stream)
+     decoder = InputStreamReader.new(gzip_stream, "UTF-8")
+     buffered = BufferedReader.new(decoder)
+
+     while (line = buffered.readLine())
+       block.call(line)
+     end
+   ensure
+     buffered.close unless buffered.nil?
+     decoder.close unless decoder.nil?
+     gzip_stream.close unless gzip_stream.nil?
+     file_stream.close unless file_stream.nil?
+   end
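The Java stream chain above (FileInputStream -> GZIPInputStream -> InputStreamReader -> BufferedReader) is available because the plugin runs under JRuby. A plain-Ruby sketch of the same read loop using the standard library, for illustration only (hypothetical helper, not used by the plugin):

    require "zlib"

    def read_gzip_file_with_zlib(filename, block)
      Zlib::GzipReader.open(filename) do |gz|
        gz.each_line { |line| block.call(line.chomp) }  # readLine() also strips newlines
      end
    end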
+
+   def gzip?(filename)
+     Regexp.new(@gzip_pattern).match(filename)
+   end
+
+   def sincedb
+     @sincedb ||= if @sincedb_path.nil?
+                    @logger.info("Using default generated file for the sincedb", :filename => sincedb_file)
+                    SinceDB::File.new(sincedb_file)
+                  else
+                    @logger.info("Using the provided sincedb_path", :sincedb_path => @sincedb_path)
+                    SinceDB::File.new(@sincedb_path)
+                  end
+   end
+
+   def sincedb_file
+     digest = Digest::MD5.hexdigest("#{@bucket}+#{@prefix}")
+     dir = File.join(LogStash::SETTINGS.get_value("path.data"), "plugins", "inputs", "s3")
+     FileUtils::mkdir_p(dir)
+     path = File.join(dir, "sincedb_#{digest}")
+
+     # Migrate the old default sincedb path to the new one.
+     if ENV["HOME"]
+       # This is the old file path including the old digest mechanism.
+       # It remains as a way to automatically upgrade users with the old default ($HOME)
+       # to the new default (path.data).
+       old = File.join(ENV["HOME"], ".sincedb_" + Digest::MD5.hexdigest("#{@bucket}+#{@prefix}"))
+       if File.exist?(old)
+         logger.info("Migrating old sincedb in $HOME to {path.data}")
+         FileUtils.mv(old, path)
+       end
+     end
+
+     path
+   end
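Each bucket+prefix combination gets its own sincedb file with a deterministic name. For example (hypothetical values):

    require "digest/md5"

    digest = Digest::MD5.hexdigest("my-log-bucket+logs/")
    # => a 32-character hex string, so the state lands at:
    #    {path.data}/plugins/inputs/s3/sincedb_<digest>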
+
+   def ignore_filename?(filename)
+     if @prefix == filename
+       return true
+     elsif filename.end_with?("/")
+       return true
+     elsif (@backup_add_prefix && @backup_to_bucket == @bucket && filename =~ /^#{backup_add_prefix}/)
+       return true
+     elsif @exclude_pattern.nil?
+       return false
+     elsif filename =~ Regexp.new(@exclude_pattern)
+       return true
+     else
+       return false
+     end
+   end
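Assuming `prefix => "logs/"`, `backup_to_bucket` set to the same bucket with `backup_add_prefix => "done/"`, and `exclude_pattern => "\.tmp$"` (all hypothetical), the checks above behave like this:

    ignore_filename?("logs/")           # => true, the key is the prefix itself
    ignore_filename?("logs/archive/")   # => true, directory placeholder
    ignore_filename?("done/logs/a.log") # => true, already backed up in place
    ignore_filename?("logs/a.tmp")      # => true, matches exclude_pattern
    ignore_filename?("logs/a.log")      # => false, will be processed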
+
+   def process_log(queue, log)
+     @logger.debug("Processing", :bucket => @bucket, :key => log.key)
+     object = @s3bucket.object(log.key)
+
+     filename = File.join(temporary_directory, File.basename(log.key))
+     if download_remote_file(object, filename)
+       if process_local_log(queue, filename, object)
+         backup_to_bucket(object)
+         backup_to_dir(filename)
+         delete_file_from_bucket(object)
+         FileUtils.remove_entry_secure(filename, true)
+         sincedb.write(log.last_modified)
+       end
+     else
+       FileUtils.remove_entry_secure(filename, true)
+     end
+   end
+
+   # Stream the remote file to the local disk.
+   #
+   # @param [S3Object] remote_object Reference to the remote S3 object to download
+   # @param [String] local_filename The temporary filename to stream to
+   # @return [Boolean] True if the file was completely downloaded
+   def download_remote_file(remote_object, local_filename)
+     completed = false
+     @logger.debug("Downloading remote file", :remote_key => remote_object.key, :local_filename => local_filename)
+     File.open(local_filename, 'wb') do |s3file|
+       return completed if stop?
+       begin
+         remote_object.get(:response_target => s3file)
+         completed = true
+       rescue Aws::Errors::ServiceError => e
+         @logger.warn("Unable to download remote file", :exception => e.class, :message => e.message, :remote_key => remote_object.key)
+       end
+     end
+     completed
+   end
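Passing `:response_target` makes the SDK stream the body straight into the open file handle instead of buffering the whole object in memory. A standalone sketch of the same call (bucket, key, and paths hypothetical):

    s3  = Aws::S3::Resource.new(region: "us-east-1")
    obj = s3.bucket("my-log-bucket").object("logs/app.log.gz")
    File.open("/tmp/logstash/app.log.gz", "wb") do |f|
      obj.get(response_target: f)  # body is written to disk chunk by chunk
    end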
+
+   def delete_file_from_bucket(object)
+     if @delete and @backup_to_bucket.nil?
+       object.delete()
+     end
+   end
+
+   def get_s3object
+     Aws::S3::Resource.new(aws_options_hash || {})
+   end
+
+   def file_restored?(object)
+     begin
+       restore = object.data.restore
+       if restore && restore.match(/ongoing-request\s?=\s?["']false["']/)
+         if restore = restore.match(/expiry-date\s?=\s?["'](.*?)["']/)
+           expiry_date = DateTime.parse(restore[1])
+           return true if DateTime.now < expiry_date # restored
+         else
+           @logger.debug("No expiry-date header for restore request: #{object.data.restore}")
+           return nil # no expiry-date found for ongoing request
+         end
+       end
+     rescue => e
+       @logger.debug("Could not determine Glacier restore status", :exception => e.class, :message => e.message)
+     end
+     return false
+   end
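The regexes above parse the `x-amz-restore` header, which the SDK surfaces as `object.data.restore`. Its documented format looks like this (example value from the AWS docs):

    restore = 'ongoing-request="false", expiry-date="Fri, 21 Dec 2012 00:00:00 GMT"'
    restore.match(/ongoing-request\s?=\s?["']false["']/)  # matches: restore has finished
    restore.match(/expiry-date\s?=\s?["'](.*?)["']/)[1]
    # => "Fri, 21 Dec 2012 00:00:00 GMT", i.e. readable until this date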
+
+   module SinceDB
+     class File
+       def initialize(file)
+         @sincedb_path = file
+       end
+
+       # @return [Time]
+       def read
+         if ::File.exist?(@sincedb_path)
+           content = ::File.read(@sincedb_path).chomp.strip
+           # Handle the case where the file was created but nothing has been written to it yet.
+           return content.empty? ? Time.new(0) : Time.parse(content)
+         else
+           return Time.new(0)
+         end
+       end
+
+       def write(since = nil)
+         since = Time.now if since.nil?
+         ::File.open(@sincedb_path, 'w') { |file| file.write(since.to_s) }
+       end
+     end
+   end
+ end # class LogStash::Inputs::S3
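The SinceDB contract in isolation (path hypothetical):

    db = LogStash::Inputs::S3::SinceDB::File.new("/tmp/s3_input_sincedb")
    db.read              # => Time.new(0) when the file is missing or empty
    db.write(Time.now)   # persist the watermark after a successful object
    db.read              # => the Time written above, parsed back from disk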
@@ -0,0 +1,31 @@
+ Gem::Specification.new do |s|
+
+   s.name = 'logstash-input-s3-cloudian'
+   s.version = '1.0.0'
+   s.licenses = ['Apache-2.0']
+   s.summary = "Streams events from files in an S3 bucket"
+   s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program."
+   s.authors = ["dm.belousov"]
+   s.email = 'dmitriy.belousov@gmail.com'
+   s.homepage = "http://www.elastic.co/guide/en/logstash/current/index.html"
+   s.require_paths = ["lib"]
+
+   # Files
+   s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "docs/**/*"]
+
+   # Tests
+   s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+   # Special flag to let us know this is actually a logstash plugin
+   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "input" }
+
+   # Gem dependencies
+   s.add_runtime_dependency "logstash-core-plugin-api", ">= 2.1.12", "<= 2.99"
+   s.add_runtime_dependency 'logstash-mixin-aws', '>= 5.1.0'
+   s.add_runtime_dependency 'logstash-mixin-ecs_compatibility_support', '~> 1.2'
+   s.add_runtime_dependency 'stud', '~> 0.0.18'
+   # s.add_runtime_dependency 'aws-sdk-resources', '>= 2.0.33'
+   s.add_development_dependency 'logstash-devutils'
+   s.add_development_dependency "logstash-codec-json"
+   s.add_development_dependency "logstash-codec-multiline"
+ end
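As the description notes, the gem is installed through the Logstash plugin manager rather than run standalone:

    $LS_HOME/bin/logstash-plugin install logstash-input-s3-cloudian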
@@ -0,0 +1,4 @@
+ #Version: 1.0
+ #Fields: date time x-edge-location c-ip x-event sc-bytes x-cf-status x-cf-client-id cs-uri-stem cs-uri-query c-referrer x-page-url c-user-agent x-sname x-sname-query x-file-ext x-sid
+ 2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+ 2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1
Binary file
Binary file
@@ -0,0 +1,2 @@
+ 2015-01-01T02:52:45.866722Z no "GET http://www.logstash.com:80/utfmadness/��4od HTTP/1.1"
+
@@ -0,0 +1,2 @@
+ { "hello": "world" }
+ { "hello": "awesome world" }
@@ -0,0 +1,2 @@
+ { "message": ["GET", 32, "/health"] }
+ { "message": true }
@@ -0,0 +1,6 @@
+ __SEPARATOR__
+ file:1 record:1 line:1
+ file:1 record:1 line:2
+ __SEPARATOR__
+ file:1 record:2 line:1
+ file:1 record:2 line:2
@@ -0,0 +1,2 @@
+ 2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+ 2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1