logstash-input-s3-local 3.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,442 @@
+ # encoding: utf-8
+ require "logstash/inputs/base"
+ require "logstash/namespace"
+ require "logstash/plugin_mixins/aws_config"
+ require "time"
+ require "tmpdir"
+ require "stud/interval"
+ require "stud/temporary"
+ require "aws-sdk"
+ require "logstash/inputs/s3/patch"
+
+ require 'java'
+ java_import java.io.InputStream
+ java_import java.io.InputStreamReader
+ java_import java.io.FileInputStream
+ java_import java.io.BufferedReader
+ java_import java.util.zip.GZIPInputStream
+ java_import java.util.zip.ZipException
+
+ Aws.eager_autoload!
+ # Stream events from files in an S3 bucket.
+ #
+ # Each line from each file generates an event.
+ # Files ending in `.gz` are handled as gzip'ed files.
+ class LogStash::Inputs::S3 < LogStash::Inputs::Base
+   include LogStash::PluginMixins::AwsConfig::V2
+
+   config_name "s3"
+
+   default :codec, "plain"
+
+   # The name of the S3 bucket.
+   config :bucket, :validate => :string, :required => true
+
+   # If specified, only keys whose names begin with this prefix (not a regexp) are processed.
+   config :prefix, :validate => :string, :default => nil
+
+   config :additional_settings, :validate => :hash, :default => {}
+
+   # The path to use for writing state. The state stored by this plugin is
+   # a memory of files already processed by this plugin.
+   #
+   # If not specified, the default is in `{path.data}/plugins/inputs/s3/...`
+   #
+   # Should be a path with a filename, not just a directory.
+   config :sincedb_path, :validate => :string, :default => nil
+
+   # Name of an S3 bucket to backup processed files to.
+   config :backup_to_bucket, :validate => :string, :default => nil
+
+   # Append a prefix to the key (full path including file name in s3) after processing.
+   # If backing up to another (or the same) bucket, this effectively lets you
+   # choose a new 'folder' to place the files in.
+   config :backup_add_prefix, :validate => :string, :default => nil
+
+   # Path of a local directory to backup processed files to.
+   config :backup_to_dir, :validate => :string, :default => nil
+
+   # Whether to delete processed files from the original bucket.
+   config :delete, :validate => :boolean, :default => false
+
+   # Interval to wait before checking the file list again after a run is finished.
+   # Value is in seconds.
+   config :interval, :validate => :number, :default => 60
+
+   # Ruby style regexp of keys to exclude from the bucket.
+   config :exclude_pattern, :validate => :string, :default => nil
+
+   # Set the directory where Logstash will store the tmp files before processing them.
+   # Defaults to the OS temporary directory plus "logstash", e.g. /tmp/logstash on Linux.
+   config :temporary_directory, :validate => :string, :default => File.join(Dir.tmpdir, "logstash")
+
+   public
+   def register
+     require "fileutils"
+     require "digest/md5"
+     require "aws-sdk-resources"
+
+     @logger.info("Registering s3 input", :bucket => @bucket, :region => @region)
+
+     s3 = get_s3object
+
+     @s3bucket = s3.bucket(@bucket)
+
+     unless @backup_to_bucket.nil?
+       @backup_bucket = s3.bucket(@backup_to_bucket)
+       begin
+         s3.client.head_bucket({ :bucket => @backup_to_bucket })
+       rescue Aws::S3::Errors::NoSuchBucket
+         s3.create_bucket({ :bucket => @backup_to_bucket })
+       end
+     end
+
+     unless @backup_to_dir.nil?
+       Dir.mkdir(@backup_to_dir, 0700) unless File.exist?(@backup_to_dir)
+     end
+
+     FileUtils.mkdir_p(@temporary_directory) unless Dir.exist?(@temporary_directory)
+   end
+
+   public
+   def run(queue)
+     @current_thread = Thread.current
+     Stud.interval(@interval) do
+       process_files(queue)
+     end
+   end # def run
+
+   public
+   def list_new_files
+     objects = {}
+     found = false
+     begin
+       @s3bucket.objects(:prefix => @prefix).each do |log|
+         found = true
+         @logger.debug("S3 input: Found key", :key => log.key)
+         if !ignore_filename?(log.key)
+           if sincedb.newer?(log.last_modified) && log.content_length > 0
+             objects[log.key] = log.last_modified
+             @logger.debug("S3 input: Adding to objects[]", :key => log.key)
+             @logger.debug("objects[] length is:", :length => objects.length)
+           end
+         else
+           @logger.debug('S3 input: Ignoring', :key => log.key)
+         end
+       end
+       @logger.info('S3 input: No files found in bucket', :prefix => prefix) unless found
+     rescue Aws::Errors::ServiceError => e
+       @logger.error("S3 input: Unable to list objects in bucket", :prefix => prefix, :message => e.message)
+     end
+     objects.keys.sort { |a, b| objects[a] <=> objects[b] }
+   end # def list_new_files
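Note that `list_new_files` returns the keys sorted by last-modified time, oldest first. Combined with `sincedb.write(lastmod)` at the end of `process_log` below, this keeps the sincedb timestamp advancing monotonically, so an interrupted run resumes from the last fully processed file rather than the newest one seen. A minimal sketch of the ordering, with made-up keys and timestamps:

    require "time"

    # Hypothetical listing: key => last_modified, deliberately out of order.
    objects = {
      "logs/c.log" => Time.parse("2021-01-03"),
      "logs/a.log" => Time.parse("2021-01-01"),
      "logs/b.log" => Time.parse("2021-01-02"),
    }

    # Same comparator as the plugin: order keys by their timestamps.
    objects.keys.sort { |a, b| objects[a] <=> objects[b] }
    # => ["logs/a.log", "logs/b.log", "logs/c.log"]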
+
+   public
+   def backup_to_bucket(object)
+     unless @backup_to_bucket.nil?
+       backup_key = "#{@backup_add_prefix}#{object.key}"
+       @backup_bucket.object(backup_key).copy_from(:copy_source => "#{object.bucket_name}/#{object.key}")
+       if @delete
+         object.delete()
+       end
+     end
+   end
+
+   public
+   def backup_to_dir(filename)
+     unless @backup_to_dir.nil?
+       FileUtils.cp(filename, @backup_to_dir)
+     end
+   end
+
+   public
+   def process_files(queue)
+     objects = list_new_files
+
+     objects.each do |key|
+       if stop?
+         break
+       else
+         @logger.debug("S3 input processing", :bucket => @bucket, :key => key)
+         process_log(queue, key)
+       end
+     end
+   end # def process_files
+
+   public
+   def stop
+     # @current_thread is initialized in the `#run` method. This variable is
+     # needed because `#stop` is called from a different thread than `#run`,
+     # requiring us to call stop! with an explicit thread.
+     Stud.stop!(@current_thread)
+   end
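The `run`/`stop` pair leans on the stud gem: `Stud.interval` keeps re-running the block until `Stud.stop!` is called with the thread executing it, which is why `#run` records `@current_thread`. A small standalone sketch of that mechanism (assuming only the stud gem):

    require "stud/interval"

    t = Thread.new do
      Stud.interval(1) { puts "tick" }  # re-runs roughly once per second
    end

    sleep 2.5
    Stud.stop!(t)  # wakes the sleeping interval and lets it exit, as #stop does above
    t.join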
+
+   private
+
+   # Read the content of the local file
+   #
+   # @param [Queue] queue Where to push the event
+   # @param [String] filename Which file to read from
+   # @return [Boolean] True if the file was completely read, false otherwise.
+   def process_local_log(queue, filename, key)
+     @logger.debug('Processing file', :filename => filename)
+     metadata = {}
+     # Currently codecs operate on bytes instead of streams,
+     # so all IO work (decompression, reading) needs to be done in the actual
+     # input and sent as bytes to the codecs.
+     read_file(filename) do |line|
+       if stop?
+         @logger.warn("Logstash S3 input, stop reading in the middle of the file, we will read it again when logstash is started")
+         return false
+       end
+
+       @codec.decode(line) do |event|
+         # We are making an assumption concerning the cloudfront
+         # log format: the user will use the plain or the line codec,
+         # and the message key will represent the actual line content.
+         # If the event is only metadata, the event will be dropped.
+         # This was the behavior of the pre 1.5 plugin.
+         #
+         # The line needs to go through the codec to replace
+         # unknown bytes in the log stream before doing a regexp match, or
+         # you will get an `invalid byte sequence in UTF-8` error.
+         if event_is_metadata?(event)
+           @logger.debug('Event is metadata, updating the current cloudfront metadata', :event => event)
+           update_metadata(metadata, event)
+         else
+           decorate(event)
+
+           event.set("cloudfront_version", metadata[:cloudfront_version]) unless metadata[:cloudfront_version].nil?
+           event.set("cloudfront_fields", metadata[:cloudfront_fields]) unless metadata[:cloudfront_fields].nil?
+
+           event.set("[@metadata][s3]", { "key" => key })
+
+           queue << event
+         end
+       end
+     end
+     # Ensure any stateful codecs (such as multiline) are flushed to the queue.
+     @codec.flush do |event|
+       queue << event
+     end
+
+     return true
+   end # def process_local_log
+
+   private
+   def event_is_metadata?(event)
+     return false unless event.get("message").class == String
+     line = event.get("message")
+     version_metadata?(line) || fields_metadata?(line)
+   end
+
+   private
+   def version_metadata?(line)
+     line.start_with?('#Version: ')
+   end
+
+   private
+   def fields_metadata?(line)
+     line.start_with?('#Fields: ')
+   end
+
+   private
+   def update_metadata(metadata, event)
+     line = event.get('message').strip
+
+     if version_metadata?(line)
+       metadata[:cloudfront_version] = line.split(/#Version: (.+)/).last
+     end
+
+     if fields_metadata?(line)
+       metadata[:cloudfront_fields] = line.split(/#Fields: (.+)/).last
+     end
+   end
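Taken together, these helpers consume the `#Version:` and `#Fields:` header lines of a CloudFront log (see the cloudfront fixture near the bottom of this diff) as metadata instead of emitting them as events; every subsequent event then carries `cloudfront_version` and `cloudfront_fields`. A standalone sketch of the extraction, using plain strings in place of Logstash events:

    # Shortened header and record lines from a hypothetical CloudFront log.
    lines = [
      "#Version: 1.0",
      "#Fields: date time x-edge-location",
      "2010-03-12 23:51:20 SEA4",
    ]

    metadata = {}
    lines.each do |line|
      if line.start_with?('#Version: ')
        metadata[:cloudfront_version] = line.split(/#Version: (.+)/).last
      elsif line.start_with?('#Fields: ')
        metadata[:cloudfront_fields] = line.split(/#Fields: (.+)/).last
      end
      # A real record line would become an event carrying both metadata values.
    end

    metadata
    # => {:cloudfront_version=>"1.0", :cloudfront_fields=>"date time x-edge-location"}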
+
+   private
+   def read_file(filename, &block)
+     if gzip?(filename)
+       read_gzip_file(filename, block)
+     else
+       read_plain_file(filename, block)
+     end
+   rescue => e
+     # Skip any broken file.
+     @logger.error("Failed to read the file. Skip processing.", :filename => filename, :exception => e.message)
+   end
+
+   def read_plain_file(filename, block)
+     File.open(filename, 'rb') do |file|
+       file.each(&block)
+     end
+   end
+
+   private
+   def read_gzip_file(filename, block)
+     file_stream = FileInputStream.new(filename)
+     gzip_stream = GZIPInputStream.new(file_stream)
+     decoder = InputStreamReader.new(gzip_stream, "UTF-8")
+     buffered = BufferedReader.new(decoder)
+
+     while (line = buffered.readLine())
+       block.call(line)
+     end
+   ensure
+     buffered.close unless buffered.nil?
+     decoder.close unless decoder.nil?
+     gzip_stream.close unless gzip_stream.nil?
+     file_stream.close unless file_stream.nil?
+   end
+
+   private
+   def gzip?(filename)
+     filename.end_with?('.gz', '.gzip')
+   end
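Decompression goes through the `java.util.zip` stream classes imported at the top of the file; since the plugin runs under JRuby, those classes are directly available. For readers more at home in plain Ruby, a roughly equivalent sketch using `Zlib` (not the plugin's actual code path):

    require "zlib"

    # Rough pure-Ruby counterpart of read_gzip_file. Note BufferedReader#readLine
    # strips the line terminator, so we chomp to match that behavior.
    def read_gzip_file_rb(filename, block)
      Zlib::GzipReader.open(filename) do |gz|
        gz.each_line { |line| block.call(line.chomp) }
      end
    end

    # Usage, assuming a local example.log.gz exists:
    read_gzip_file_rb("example.log.gz", ->(line) { puts line })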
+
+   private
+   def sincedb
+     @sincedb ||= if @sincedb_path.nil?
+                    @logger.info("Using default generated file for the sincedb", :filename => sincedb_file)
+                    SinceDB::File.new(sincedb_file)
+                  else
+                    @logger.info("Using the provided sincedb_path",
+                                 :sincedb_path => @sincedb_path)
+                    SinceDB::File.new(@sincedb_path)
+                  end
+   end
+
+   private
+   def sincedb_file
+     digest = Digest::MD5.hexdigest("#{@bucket}+#{@prefix}")
+     dir = File.join(LogStash::SETTINGS.get_value("path.data"), "plugins", "inputs", "s3")
+     FileUtils.mkdir_p(dir)
+     path = File.join(dir, "sincedb_#{digest}")
+
+     # Migrate the old default sincedb path to the new one.
+     if ENV["HOME"]
+       # This is the old file path including the old digest mechanism.
+       # It remains as a way to automatically upgrade users with the old default ($HOME)
+       # to the new default (path.data).
+       old = File.join(ENV["HOME"], ".sincedb_" + Digest::MD5.hexdigest("#{@bucket}+#{@prefix}"))
+       if File.exist?(old)
+         logger.info("Migrating old sincedb in $HOME to {path.data}")
+         FileUtils.mv(old, path)
+       end
+     end
+
+     path
+   end
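The default sincedb filename is derived from the bucket and prefix, so two s3 inputs watching different prefixes keep separate state files. A quick sketch of the derivation (bucket, prefix, and the path.data value are all made up here):

    require "digest/md5"

    bucket = "my-logs"      # hypothetical bucket name
    prefix = "cloudfront/"  # hypothetical key prefix

    digest = Digest::MD5.hexdigest("#{bucket}+#{prefix}")
    File.join("/var/lib/logstash", "plugins", "inputs", "s3", "sincedb_#{digest}")
    # => "/var/lib/logstash/plugins/inputs/s3/sincedb_<32-char hex digest>"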
+
+   def symbolized_settings
+     @symbolized_settings ||= symbolize(@additional_settings)
+   end
+
+   def symbolize(hash)
+     return hash unless hash.is_a?(Hash)
+     symbolized = {}
+     hash.each { |key, value| symbolized[key.to_sym] = symbolize(value) }
+     symbolized
+   end
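`symbolize` recursively converts the string keys that `additional_settings` arrives with from the Logstash config into the symbol keys the AWS SDK expects; non-hash values pass through untouched. For example, given the helper above (the option names are real aws-sdk v2 client options, the values made up):

    settings = { "force_path_style" => true, "retry_limit" => 5 }

    symbolize(settings)
    # => { :force_path_style => true, :retry_limit => 5 }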
+
+   private
+   def old_sincedb_file
+   end
+
+   private
+   def ignore_filename?(filename)
+     if @prefix == filename
+       return true
+     elsif filename.end_with?("/")
+       return true
+     elsif (@backup_add_prefix && @backup_to_bucket == @bucket && filename =~ /^#{backup_add_prefix}/)
+       return true
+     elsif @exclude_pattern.nil?
+       return false
+     elsif filename =~ Regexp.new(@exclude_pattern)
+       return true
+     else
+       return false
+     end
+   end
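So a key is skipped when it equals the prefix itself, names a 'directory' placeholder (trailing slash), would re-ingest this plugin's own backups, or matches `exclude_pattern`. A few hypothetical decisions, assuming `prefix => "logs/"`, `exclude_pattern => "\.tmp$"`, and no backup bucket:

    #   "logs/"            => ignored (equals the prefix)
    #   "logs/archive/"    => ignored (ends with "/")
    #   "logs/app.log.tmp" => ignored (matches exclude_pattern)
    #   "logs/app.log"     => processed
    "logs/app.log.tmp" =~ Regexp.new("\\.tmp$")  # => 12 (truthy, so ignored)
    "logs/app.log"     =~ Regexp.new("\\.tmp$")  # => nil (falsy, so processed)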
+
+   private
+   def process_log(queue, key)
+     object = @s3bucket.object(key)
+
+     filename = File.join(temporary_directory, File.basename(key))
+     if download_remote_file(object, filename)
+       if process_local_log(queue, filename, key)
+         lastmod = object.last_modified
+         backup_to_bucket(object)
+         backup_to_dir(filename)
+         delete_file_from_bucket(object)
+         FileUtils.remove_entry_secure(filename, true)
+         sincedb.write(lastmod)
+       end
+     else
+       FileUtils.remove_entry_secure(filename, true)
+     end
+   end
+
+   private
+   # Stream the remote file to the local disk
+   #
+   # @param [S3Object] remote_object Reference to the remote S3 object to download
+   # @param [String] local_filename The temporary filename to stream to.
+   # @return [Boolean] True if the file was completely downloaded
+   def download_remote_file(remote_object, local_filename)
+     completed = false
+     @logger.debug("S3 input: Download remote file", :remote_key => remote_object.key, :local_filename => local_filename)
+     File.open(local_filename, 'wb') do |s3file|
+       return completed if stop?
+       begin
+         remote_object.get(:response_target => s3file)
+         completed = true
+       rescue Aws::Errors::ServiceError => e
+         @logger.warn("S3 input: Unable to download remote file", :remote_key => remote_object.key, :message => e.message)
+       end
+     end
+     completed
+   end
+
+   private
+   def delete_file_from_bucket(object)
+     if @delete && @backup_to_bucket.nil?
+       object.delete()
+     end
+   end
+
+   private
+   def get_s3object
+     options = symbolized_settings.merge(aws_options_hash || {})
+     Aws::S3::Resource.new(options)
+   end
+
+   private
+   module SinceDB
+     class File
+       def initialize(file)
+         @sincedb_path = file
+       end
+
+       def newer?(date)
+         date > read
+       end
+
+       def read
+         if ::File.exist?(@sincedb_path)
+           content = ::File.read(@sincedb_path).chomp.strip
+           # The file may have been created before we had time to write to it.
+           return content.empty? ? Time.new(0) : Time.parse(content)
+         else
+           return Time.new(0)
+         end
+       end
+
+       def write(since = nil)
+         since = Time.now() if since.nil?
+         ::File.open(@sincedb_path, 'w') { |file| file.write(since.to_s) }
+       end
+     end
+   end
+ end # class LogStash::Inputs::S3
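The sincedb is a single timestamp on disk: an object is picked up only when its last-modified time is strictly newer than the stored value, which `process_log` advances after each fully processed file. A self-contained sketch of that cycle, reproducing the `SinceDB::File` logic above against a throwaway temp file:

    require "time"
    require "tempfile"

    # Standalone copy of the SinceDB::File behavior, for experimentation.
    class SinceDBFile
      def initialize(path)
        @path = path
      end

      def newer?(date)
        date > read
      end

      def read
        content = File.exist?(@path) ? File.read(@path).strip : ""
        content.empty? ? Time.new(0) : Time.parse(content)
      end

      def write(since = Time.now)
        File.write(@path, since.to_s)
      end
    end

    tmp = Tempfile.new("sincedb")
    db = SinceDBFile.new(tmp.path)
    db.newer?(Time.parse("2021-06-01"))  # => true  (empty state reads as Time.new(0))
    db.write(Time.parse("2021-06-01"))
    db.newer?(Time.parse("2021-05-31"))  # => false (older objects are skipped)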
@@ -0,0 +1,30 @@
+ Gem::Specification.new do |s|
+
+   s.name = 'logstash-input-s3-local'
+   s.version = '3.3.5'
+   s.licenses = ['Apache-2.0']
+   s.summary = "Streams events from files in a S3 bucket"
+   s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
+   s.authors = ["Elastic"]
+   s.email = 'info@elastic.co'
+   s.homepage = "http://www.elastic.co/guide/en/logstash/current/index.html"
+   s.require_paths = ["lib"]
+
+   # Files
+   s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "docs/**/*"]
+
+   # Tests
+   s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+   # Special flag to let us know this is actually a logstash plugin
+   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "input" }
+
+   # Gem dependencies
+   s.add_runtime_dependency "logstash-core-plugin-api", ">= 2.1.12", "<= 2.99"
+   s.add_runtime_dependency 'logstash-mixin-aws', '>= 4.3.0'
+   s.add_runtime_dependency 'stud', '~> 0.0.18'
+   # s.add_runtime_dependency 'aws-sdk-resources', '>= 2.0.33'
+   s.add_development_dependency 'logstash-devutils'
+   s.add_development_dependency "logstash-codec-json"
+   s.add_development_dependency "logstash-codec-multiline"
+ end
@@ -0,0 +1,4 @@
+ #Version: 1.0
+ #Fields: date time x-edge-location c-ip x-event sc-bytes x-cf-status x-cf-client-id cs-uri-stem cs-uri-query c-referrer x-page-url c-user-agent x-sname x-sname-query x-file-ext x-sid
+ 2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+ 2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1
Binary file
Binary file
@@ -0,0 +1,2 @@
+ 2015-01-01T02:52:45.866722Z no "GET http://www.logstash.com:80/utfmadness/��4od HTTP/1.1"
+
@@ -0,0 +1,2 @@
+ { "hello": "world" }
+ { "hello": "awesome world" }
@@ -0,0 +1,2 @@
+ { "message": ["GET", 32, "/health"] }
+ { "message": true }
@@ -0,0 +1,6 @@
+ __SEPARATOR__
+ file:1 record:1 line:1
+ file:1 record:1 line:2
+ __SEPARATOR__
+ file:1 record:2 line:1
+ file:1 record:2 line:2
@@ -0,0 +1,2 @@
+ 2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+ 2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1