logstash-input-s3-cloudian 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,465 @@
+ # encoding: utf-8
+ require "logstash/inputs/base"
+ require "logstash/namespace"
+ require "logstash/plugin_mixins/aws_config"
+ require "time"
+ require "date"
+ require "tmpdir"
+ require "stud/interval"
+ require "stud/temporary"
+ require "aws-sdk"
+ require "logstash/inputs/s3/patch"
+ require "logstash/plugin_mixins/ecs_compatibility_support"
+
+ require 'java'
+
+ Aws.eager_autoload!
+ # Stream events from files in an S3 bucket.
+ #
+ # Each line from each file generates an event.
+ # Files ending in `.gz` are handled as gzip'ed files.
+ class LogStash::Inputs::S3 < LogStash::Inputs::Base
+
+   java_import java.io.InputStream
+   java_import java.io.InputStreamReader
+   java_import java.io.FileInputStream
+   java_import java.io.BufferedReader
+   java_import java.util.zip.GZIPInputStream
+   java_import java.util.zip.ZipException
+
+   include LogStash::PluginMixins::AwsConfig::V2
+   include LogStash::PluginMixins::ECSCompatibilitySupport(:disabled, :v1, :v8 => :v1)
+
+   config_name "s3"
+
+   default :codec, "plain"
+
+   # The name of the S3 bucket.
+   config :bucket, :validate => :string, :required => true
+
+   # If specified, keys in the bucket must start with this prefix (not a regexp).
+   config :prefix, :validate => :string, :default => nil
+
+   config :additional_settings, :validate => :hash, :default => {}
+
+   # The path to use for writing state. The state stored by this plugin is
+   # a memory of files already processed by this plugin.
+   #
+   # If not specified, the default is in `{path.data}/plugins/inputs/s3/...`
+   #
+   # Should be a path with filename, not just a directory.
+   config :sincedb_path, :validate => :string, :default => nil
+
+   # Name of an S3 bucket to back up processed files to.
+   config :backup_to_bucket, :validate => :string, :default => nil
+
+   # Append a prefix to the key (full path including file name in S3) after processing.
+   # If backing up to another (or the same) bucket, this effectively lets you
+   # choose a new 'folder' to place the files in.
+   config :backup_add_prefix, :validate => :string, :default => nil
+
+   # Path of a local directory to back up processed files to.
+   config :backup_to_dir, :validate => :string, :default => nil
+
+   # Whether to delete processed files from the original bucket.
+   config :delete, :validate => :boolean, :default => false
+
+   # Interval to wait before checking the file list again after a run is finished.
+   # Value is in seconds.
+   config :interval, :validate => :number, :default => 60
+
+   # Whether to keep watching for new files on each interval.
+   # If false, any interval is ignored and the S3 bucket is listed only once.
+   config :watch_for_new_files, :validate => :boolean, :default => true
+
+   # Ruby-style regexp of keys to exclude from the bucket.
+   config :exclude_pattern, :validate => :string, :default => nil
+
+   # Set the directory where Logstash will store the tmp files before processing them.
+   # Defaults to the OS temporary directory, e.g. /tmp/logstash on Linux.
+   config :temporary_directory, :validate => :string, :default => File.join(Dir.tmpdir, "logstash")
+
+   # Whether or not to include the S3 object's properties (last_modified, content_type, metadata)
+   # in each event at [@metadata][s3]. Regardless of this setting, [@metadata][s3][key] will always
+   # be present.
+   config :include_object_properties, :validate => :boolean, :default => false
+
+   # Regular expression used to determine whether an input file is in gzip format.
+   # Defaults to an expression that matches *.gz and *.gzip file extensions.
+   config :gzip_pattern, :validate => :string, :default => "\.gz(ip)?$"
+
+   CUTOFF_SECOND = 3
+
+   def initialize(*params)
+     super
+     @cloudfront_fields_key = ecs_select[disabled: 'cloudfront_fields', v1: '[@metadata][s3][cloudfront][fields]']
+     @cloudfront_version_key = ecs_select[disabled: 'cloudfront_version', v1: '[@metadata][s3][cloudfront][version]']
+   end
+
+   def register
+     require "fileutils"
+     require "digest/md5"
+     require "aws-sdk-resources"
+
+     @logger.info("Registering", :bucket => @bucket, :region => @region)
+
+     s3 = get_s3object
+
+     @s3bucket = s3.bucket(@bucket)
+
+     unless @backup_to_bucket.nil?
+       @backup_bucket = s3.bucket(@backup_to_bucket)
+       begin
+         s3.client.head_bucket({ :bucket => @backup_to_bucket})
+       rescue Aws::S3::Errors::NoSuchBucket
+         s3.create_bucket({ :bucket => @backup_to_bucket})
+       end
+     end
+
+     unless @backup_to_dir.nil?
+       Dir.mkdir(@backup_to_dir, 0700) unless File.exists?(@backup_to_dir)
+     end
+
+     FileUtils.mkdir_p(@temporary_directory) unless Dir.exist?(@temporary_directory)
+
+     if !@watch_for_new_files && original_params.include?('interval')
+       logger.warn("`watch_for_new_files` has been disabled; `interval` directive will be ignored.")
+     end
+   end
+
+   def run(queue)
+     @current_thread = Thread.current
+     Stud.interval(@interval) do
+       process_files(queue)
+       stop unless @watch_for_new_files
+     end
+   end # def run
+
+   def list_new_files
+     objects = []
+     found = false
+     current_time = Time.now
+     sincedb_time = sincedb.read
+     begin
+       @s3bucket.objects(:prefix => @prefix).each do |log|
+         found = true
+         @logger.debug('Found key', :key => log.key)
+         if ignore_filename?(log.key)
+           @logger.debug('Ignoring', :key => log.key)
+         elsif log.content_length <= 0
+           @logger.debug('Object Zero Length', :key => log.key)
+         elsif log.last_modified <= sincedb_time
+           @logger.debug('Object Not Modified', :key => log.key)
+         elsif log.last_modified > (current_time - CUTOFF_SECOND).utc # files modified within the last CUTOFF_SECOND seconds are picked up in the next cycle
+           @logger.debug('Object Modified After Cutoff Time', :key => log.key)
+         elsif (log.storage_class == 'GLACIER' || log.storage_class == 'DEEP_ARCHIVE') && !file_restored?(log.object)
+           @logger.debug('Object Archived to Glacier', :key => log.key)
+         else
+           objects << log
+           @logger.debug("Added to objects[]", :key => log.key, :length => objects.length)
+         end
+       end
+       @logger.info('No files found in bucket', :prefix => prefix) unless found
+     rescue Aws::Errors::ServiceError => e
+       @logger.error("Unable to list objects in bucket", :exception => e.class, :message => e.message, :backtrace => e.backtrace, :prefix => prefix)
+     end
+     objects.sort_by { |log| log.last_modified }
+   end # def list_new_files
+
+   def backup_to_bucket(object)
+     unless @backup_to_bucket.nil?
+       backup_key = "#{@backup_add_prefix}#{object.key}"
+       @backup_bucket.object(backup_key).copy_from(:copy_source => "#{object.bucket_name}/#{object.key}")
+       if @delete
+         object.delete()
+       end
+     end
+   end
+
+   def backup_to_dir(filename)
+     unless @backup_to_dir.nil?
+       FileUtils.cp(filename, @backup_to_dir)
+     end
+   end
+
+   def process_files(queue)
+     objects = list_new_files
+
+     objects.each do |log|
+       if stop?
+         break
+       else
+         process_log(queue, log)
+       end
+     end
+   end # def process_files
+
+   def stop
+     # @current_thread is initialized in the `#run` method.
+     # It is needed because `#stop` is called from a different thread than
+     # `#run`, so we have to pass an explicit thread to `Stud.stop!`.
+     Stud.stop!(@current_thread)
+   end
+
+   private
+
+   # Read the content of the local file
+   #
+   # @param [Queue] Where to push the event
+   # @param [String] Which file to read from
+   # @param [S3Object] Source S3 object
+   # @return [Boolean] True if the file was completely read, false otherwise.
+   def process_local_log(queue, filename, object)
+     @logger.debug('Processing file', :filename => filename)
+     metadata = {}
+     # Codecs currently operate on bytes instead of streams, so all IO work
+     # (decompression, reading) has to be done in the input itself and the
+     # result handed to the codecs as bytes.
+     read_file(filename) do |line|
+       if stop?
+         @logger.warn("Logstash S3 input, stop reading in the middle of the file, we will read it again when logstash is started")
+         return false
+       end
+
+       @codec.decode(line) do |event|
+         # We assume that for the CloudFront log format the user will use the
+         # plain or the line codec, and that the message key holds the actual
+         # line content. If the event is only metadata, the event is dropped.
+         # This was the behavior of the pre 1.5 plugin.
+         #
+         # The line needs to go through the codecs to replace unknown bytes in
+         # the log stream before doing a regexp match, or you will get an
+         # `Error: invalid byte sequence in UTF-8`.
+         if event_is_metadata?(event)
+           @logger.debug('Event is metadata, updating the current cloudfront metadata', :event => event)
+           update_metadata(metadata, event)
+         else
+           push_decoded_event(queue, metadata, object, event)
+         end
+       end
+     end
+     # Ensure any stateful codecs (such as multiline) are flushed to the queue.
+     @codec.flush do |event|
+       push_decoded_event(queue, metadata, object, event)
+     end
+
+     return true
+   end # def process_local_log
+
+   def push_decoded_event(queue, metadata, object, event)
+     decorate(event)
+
+     if @include_object_properties
+       event.set("[@metadata][s3]", object.data.to_h)
+     else
+       event.set("[@metadata][s3]", {})
+     end
+
+     event.set("[@metadata][s3][key]", object.key)
+     event.set(@cloudfront_version_key, metadata[:cloudfront_version]) unless metadata[:cloudfront_version].nil?
+     event.set(@cloudfront_fields_key, metadata[:cloudfront_fields]) unless metadata[:cloudfront_fields].nil?
+
+     queue << event
+   end
+
+   def event_is_metadata?(event)
+     return false unless event.get("message").class == String
+     line = event.get("message")
+     version_metadata?(line) || fields_metadata?(line)
+   end
+
+   def version_metadata?(line)
+     line.start_with?('#Version: ')
+   end
+
+   def fields_metadata?(line)
+     line.start_with?('#Fields: ')
+   end
+
+   def update_metadata(metadata, event)
+     line = event.get('message').strip
+
+     if version_metadata?(line)
+       metadata[:cloudfront_version] = line.split(/#Version: (.+)/).last
+     end
+
+     if fields_metadata?(line)
+       metadata[:cloudfront_fields] = line.split(/#Fields: (.+)/).last
+     end
+   end
+
+   def read_file(filename, &block)
+     if gzip?(filename)
+       read_gzip_file(filename, block)
+     else
+       read_plain_file(filename, block)
+     end
+   rescue => e
+     # Skip any broken file.
+     @logger.error("Failed to read file, processing skipped", :exception => e.class, :message => e.message, :filename => filename)
+   end
+
+   def read_plain_file(filename, block)
+     File.open(filename, 'rb') do |file|
+       file.each(&block)
+     end
+   end
+
+   def read_gzip_file(filename, block)
+     file_stream = FileInputStream.new(filename)
+     gzip_stream = GZIPInputStream.new(file_stream)
+     decoder = InputStreamReader.new(gzip_stream, "UTF-8")
+     buffered = BufferedReader.new(decoder)
+
+     while (line = buffered.readLine())
+       block.call(line)
+     end
+   ensure
+     buffered.close unless buffered.nil?
+     decoder.close unless decoder.nil?
+     gzip_stream.close unless gzip_stream.nil?
+     file_stream.close unless file_stream.nil?
+   end
+
+   def gzip?(filename)
+     Regexp.new(@gzip_pattern).match(filename)
+   end
+
+   def sincedb
+     @sincedb ||= if @sincedb_path.nil?
+                    @logger.info("Using default generated file for the sincedb", :filename => sincedb_file)
+                    SinceDB::File.new(sincedb_file)
+                  else
+                    @logger.info("Using the provided sincedb_path", :sincedb_path => @sincedb_path)
+                    SinceDB::File.new(@sincedb_path)
+                  end
+   end
+
+   def sincedb_file
+     digest = Digest::MD5.hexdigest("#{@bucket}+#{@prefix}")
+     dir = File.join(LogStash::SETTINGS.get_value("path.data"), "plugins", "inputs", "s3")
+     FileUtils::mkdir_p(dir)
+     path = File.join(dir, "sincedb_#{digest}")
+
+     # Migrate the old default sincedb path to the new one.
+     if ENV["HOME"]
+       # This is the old file path including the old digest mechanism.
+       # It remains as a way to automatically upgrade users with the old default ($HOME)
+       # to the new default (path.data).
+       old = File.join(ENV["HOME"], ".sincedb_" + Digest::MD5.hexdigest("#{@bucket}+#{@prefix}"))
+       if File.exist?(old)
+         logger.info("Migrating old sincedb in $HOME to {path.data}")
+         FileUtils.mv(old, path)
+       end
+     end
+
+     path
+   end
+
+   def ignore_filename?(filename)
+     if @prefix == filename
+       return true
+     elsif filename.end_with?("/")
+       return true
+     elsif (@backup_add_prefix && @backup_to_bucket == @bucket && filename =~ /^#{backup_add_prefix}/)
+       return true
+     elsif @exclude_pattern.nil?
+       return false
+     elsif filename =~ Regexp.new(@exclude_pattern)
+       return true
+     else
+       return false
+     end
+   end
+
+   def process_log(queue, log)
+     @logger.debug("Processing", :bucket => @bucket, :key => log.key)
+     object = @s3bucket.object(log.key)
+
+     filename = File.join(temporary_directory, File.basename(log.key))
+     if download_remote_file(object, filename)
+       if process_local_log(queue, filename, object)
+         backup_to_bucket(object)
+         backup_to_dir(filename)
+         delete_file_from_bucket(object)
+         FileUtils.remove_entry_secure(filename, true)
+         sincedb.write(log.last_modified)
+       end
+     else
+       FileUtils.remove_entry_secure(filename, true)
+     end
+   end
+
+   # Stream the remote file to the local disk
+   #
+   # @param [S3Object] Reference to the remote S3 object to download
+   # @param [String] The temporary filename to stream to.
+   # @return [Boolean] True if the file was completely downloaded
+   def download_remote_file(remote_object, local_filename)
+     completed = false
+     @logger.debug("Downloading remote file", :remote_key => remote_object.key, :local_filename => local_filename)
+     File.open(local_filename, 'wb') do |s3file|
+       return completed if stop?
+       begin
+         remote_object.get(:response_target => s3file)
+         completed = true
+       rescue Aws::Errors::ServiceError => e
+         @logger.warn("Unable to download remote file", :exception => e.class, :message => e.message, :remote_key => remote_object.key)
+       end
+     end
+     completed
+   end
+
+   def delete_file_from_bucket(object)
+     if @delete and @backup_to_bucket.nil?
+       object.delete()
+     end
+   end
+
+   def get_s3object
+     s3 = Aws::S3::Resource.new(aws_options_hash || {})
+   end
+
+   def file_restored?(object)
+     begin
+       restore = object.data.restore
+       if restore && restore.match(/ongoing-request\s?=\s?["']false["']/)
+         if restore = restore.match(/expiry-date\s?=\s?["'](.*?)["']/)
+           expiry_date = DateTime.parse(restore[1])
+           return true if DateTime.now < expiry_date # restored
+         else
+           @logger.debug("No expiry-date header for restore request: #{object.data.restore}")
+           return nil # no expiry-date found for ongoing request
+         end
+       end
+     rescue => e
+       @logger.debug("Could not determine Glacier restore status", :exception => e.class, :message => e.message)
+     end
+     return false
+   end
+
+   module SinceDB
+     class File
+       def initialize(file)
+         @sincedb_path = file
+       end
+
+       # @return [Time]
+       def read
+         if ::File.exists?(@sincedb_path)
+           content = ::File.read(@sincedb_path).chomp.strip
+           # The file may have been created but not yet written to.
+           return content.empty? ? Time.new(0) : Time.parse(content)
+         else
+           return Time.new(0)
+         end
+       end
+
+       def write(since = nil)
+         since = Time.now if since.nil?
+         ::File.open(@sincedb_path, 'w') { |file| file.write(since.to_s) }
+       end
+     end
+   end
+ end # class LogStash::Inputs::S3
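
For reference, a minimal Logstash pipeline configuration exercising the options defined above might look like the sketch below. This is an illustration, not part of the released package: the bucket name, prefix, endpoint URL, and sincedb path are placeholders, and the `region`/`endpoint` settings are assumed to come from the logstash-mixin-aws options pulled in through `LogStash::PluginMixins::AwsConfig::V2`.

    input {
      s3 {
        bucket                    => "my-example-bucket"                 # placeholder bucket name
        prefix                    => "logs/"                             # only keys starting with this prefix are read
        region                    => "us-east-1"
        endpoint                  => "https://s3.example-cloudian.local" # assumed mixin option; point it at your S3-compatible endpoint
        sincedb_path              => "/var/lib/logstash/sincedb_s3"      # remembers the last processed last_modified time
        interval                  => 60                                  # seconds between bucket listings
        delete                    => false
        include_object_properties => true                                # copies object properties into [@metadata][s3]
      }
    }

Note that the plugin registers itself under the name `s3` (see `config_name "s3"` above), so it is configured as an `s3` input even though the gem is named logstash-input-s3-cloudian.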
@@ -0,0 +1,31 @@
+ Gem::Specification.new do |s|
+
+   s.name = 'logstash-input-s3-cloudian'
+   s.version = '1.0.0'
+   s.licenses = ['Apache-2.0']
+   s.summary = "Streams events from files in a S3 bucket"
+   s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
+   s.authors = ["dm.belousov"]
+   s.email = 'dmitriy.belousov@gmail.com'
+   s.homepage = "http://www.elastic.co/guide/en/logstash/current/index.html"
+   s.require_paths = ["lib"]
+
+   # Files
+   s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "docs/**/*"]
+
+   # Tests
+   s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+   # Special flag to let us know this is actually a logstash plugin
+   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "input" }
+
+   # Gem dependencies
+   s.add_runtime_dependency "logstash-core-plugin-api", ">= 2.1.12", "<= 2.99"
+   s.add_runtime_dependency 'logstash-mixin-aws', '>= 5.1.0'
+   s.add_runtime_dependency 'stud', '~> 0.0.18'
+   # s.add_runtime_dependency 'aws-sdk-resources', '>= 2.0.33'
+   s.add_development_dependency 'logstash-devutils'
+   s.add_development_dependency "logstash-codec-json"
+   s.add_development_dependency "logstash-codec-multiline"
+   s.add_runtime_dependency 'logstash-mixin-ecs_compatibility_support', '~>1.2'
+ end
@@ -0,0 +1,4 @@
+ #Version: 1.0
+ #Fields: date time x-edge-location c-ip x-event sc-bytes x-cf-status x-cf-client-id cs-uri-stem cs-uri-query c-referrer x-page-url c-user-agent x-sname x-sname-query x-file-ext x-sid
+ 2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+ 2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1
Binary file
Binary file
@@ -0,0 +1,2 @@
+ 2015-01-01T02:52:45.866722Z no "GET http://www.logstash.com:80/utfmadness/��4od HTTP/1.1"
+
@@ -0,0 +1,2 @@
+ { "hello": "world" }
+ { "hello": "awesome world" }
@@ -0,0 +1,2 @@
+ { "message": ["GET", 32, "/health"] }
+ { "message": true }
@@ -0,0 +1,6 @@
+ __SEPARATOR__
+ file:1 record:1 line:1
+ file:1 record:1 line:2
+ __SEPARATOR__
+ file:1 record:2 line:1
+ file:1 record:2 line:2
@@ -0,0 +1,2 @@
+ 2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+ 2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1