logstash-input-s3-local 3.3.5

@@ -0,0 +1,442 @@
+ # encoding: utf-8
+ require "logstash/inputs/base"
+ require "logstash/namespace"
+ require "logstash/plugin_mixins/aws_config"
+ require "time"
+ require "tmpdir"
+ require "stud/interval"
+ require "stud/temporary"
+ require "aws-sdk"
+ require "logstash/inputs/s3/patch"
+
+ require 'java'
+ java_import java.io.InputStream
+ java_import java.io.InputStreamReader
+ java_import java.io.FileInputStream
+ java_import java.io.BufferedReader
+ java_import java.util.zip.GZIPInputStream
+ java_import java.util.zip.ZipException
+
+ Aws.eager_autoload!
+ # Stream events from files in an S3 bucket.
+ #
+ # Each line from each file generates an event.
+ # Files ending in `.gz` are handled as gzip'ed files.
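+ #
+ # A minimal pipeline sketch showing how this input is typically wired up.
+ # The bucket name, prefix, and sincedb path below are illustrative values,
+ # not defaults shipped with the plugin:
+ #
+ #     input {
+ #       s3 {
+ #         bucket       => "my-app-logs"
+ #         prefix       => "production/"
+ #         sincedb_path => "/var/lib/logstash/sincedb_s3"
+ #       }
+ #     }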
25
+ class LogStash::Inputs::S3 < LogStash::Inputs::Base
26
+ include LogStash::PluginMixins::AwsConfig::V2
27
+
28
+ config_name "s3"
29
+
30
+ default :codec, "plain"
31
+
32
+ # The name of the S3 bucket.
33
+ config :bucket, :validate => :string, :required => true
34
+
35
+ # If specified, the prefix of filenames in the bucket must match (not a regexp)
36
+ config :prefix, :validate => :string, :default => nil
37
+
38
+ config :additional_settings, :validate => :hash, :default => {}
39
+
40
+ # The path to use for writing state. The state stored by this plugin is
41
+ # a memory of files already processed by this plugin.
42
+ #
43
+ # If not specified, the default is in `{path.data}/plugins/inputs/s3/...`
44
+ #
45
+ # Should be a path with filename not just a directory.
46
+ config :sincedb_path, :validate => :string, :default => nil
47
+
48
+ # Name of a S3 bucket to backup processed files to.
49
+ config :backup_to_bucket, :validate => :string, :default => nil
50
+
51
+ # Append a prefix to the key (full path including file name in s3) after processing.
52
+ # If backing up to another (or the same) bucket, this effectively lets you
53
+ # choose a new 'folder' to place the files in
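+   # For example (illustrative values), `backup_add_prefix => "processed/"`
+   # stores a processed key "logs/app.log" as "processed/logs/app.log".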
+   config :backup_add_prefix, :validate => :string, :default => nil
+
+   # Path of a local directory to back up processed files to.
+   config :backup_to_dir, :validate => :string, :default => nil
+
+   # Whether to delete processed files from the original bucket.
+   config :delete, :validate => :boolean, :default => false
+
+   # Interval, in seconds, to wait before checking the file list again after a run has finished.
+   config :interval, :validate => :number, :default => 60
+
+   # Ruby style regexp of keys to exclude from the bucket.
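+   # For example (illustrative), `exclude_pattern => "\\.(jpe?g|png)$"`
+   # would skip image keys while still processing log files.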
+   config :exclude_pattern, :validate => :string, :default => nil
+
+   # Set the directory where Logstash will store the tmp files before processing them.
+   # Defaults to a "logstash" directory under the OS temporary directory (e.g. /tmp/logstash on Linux).
+   config :temporary_directory, :validate => :string, :default => File.join(Dir.tmpdir, "logstash")
+
+   public
+   def register
+     require "fileutils"
+     require "digest/md5"
+     require "aws-sdk-resources"
+
+     @logger.info("Registering s3 input", :bucket => @bucket, :region => @region)
+
+     s3 = get_s3object
+
+     @s3bucket = s3.bucket(@bucket)
+
+     unless @backup_to_bucket.nil?
+       @backup_bucket = s3.bucket(@backup_to_bucket)
+       begin
+         s3.client.head_bucket({ :bucket => @backup_to_bucket })
+       rescue Aws::S3::Errors::NoSuchBucket
+         s3.create_bucket({ :bucket => @backup_to_bucket })
+       end
+     end
+
+     unless @backup_to_dir.nil?
+       Dir.mkdir(@backup_to_dir, 0700) unless File.exist?(@backup_to_dir)
+     end
+
+     FileUtils.mkdir_p(@temporary_directory) unless Dir.exist?(@temporary_directory)
+   end
+
+   public
+   def run(queue)
+     @current_thread = Thread.current
+     Stud.interval(@interval) do
+       process_files(queue)
+     end
+   end # def run
+
+   public
+   def list_new_files
+     objects = {}
+     found = false
+     begin
+       @s3bucket.objects(:prefix => @prefix).each do |log|
+         found = true
+         @logger.debug("S3 input: Found key", :key => log.key)
+         if !ignore_filename?(log.key)
+           if sincedb.newer?(log.last_modified) && log.content_length > 0
+             objects[log.key] = log.last_modified
+             @logger.debug("S3 input: Adding to objects[]", :key => log.key)
+             @logger.debug("objects[] length is: ", :length => objects.length)
+           end
+         else
+           @logger.debug('S3 input: Ignoring', :key => log.key)
+         end
+       end
+       @logger.info('S3 input: No files found in bucket', :prefix => prefix) unless found
+     rescue Aws::Errors::ServiceError => e
+       @logger.error("S3 input: Unable to list objects in bucket", :prefix => prefix, :message => e.message)
+     end
+     objects.keys.sort { |a, b| objects[a] <=> objects[b] }
+   end # def list_new_files
+
+   public
+   def backup_to_bucket(object)
+     unless @backup_to_bucket.nil?
+       backup_key = "#{@backup_add_prefix}#{object.key}"
+       @backup_bucket.object(backup_key).copy_from(:copy_source => "#{object.bucket_name}/#{object.key}")
+       if @delete
+         object.delete()
+       end
+     end
+   end
+
+   public
+   def backup_to_dir(filename)
+     unless @backup_to_dir.nil?
+       FileUtils.cp(filename, @backup_to_dir)
+     end
+   end
+
+   public
+   def process_files(queue)
+     objects = list_new_files
+
+     objects.each do |key|
+       if stop?
+         break
+       else
+         @logger.debug("S3 input processing", :bucket => @bucket, :key => key)
+         process_log(queue, key)
+       end
+     end
+   end # def process_files
+
+   public
+   def stop
+     # @current_thread is initialized in the `#run` method. This variable is
+     # needed because `#stop` is called from a different thread than `#run`,
+     # requiring us to call `Stud.stop!` with an explicit thread.
+     Stud.stop!(@current_thread)
+   end
+
+   private
+
+   # Read the content of the local file.
+   #
+   # @param [Queue] queue Where to push the events
+   # @param [String] filename Which local file to read from
+   # @param [String] key The S3 key the file was downloaded from
+   # @return [Boolean] True if the file was completely read, false otherwise.
+   def process_local_log(queue, filename, key)
+     @logger.debug('Processing file', :filename => filename)
+     metadata = {}
+     # Currently codecs operate on bytes instead of streams, so all IO work
+     # (decompression, reading) needs to be done in the actual input and
+     # sent to the codecs as bytes.
+     read_file(filename) do |line|
+       if stop?
+         @logger.warn("Logstash S3 input: stopped reading in the middle of the file; it will be read again when Logstash is restarted")
+         return false
+       end
+
+       @codec.decode(line) do |event|
+         # We are making an assumption concerning the CloudFront log format:
+         # the user will use the plain or the line codec, and the message key
+         # will represent the actual line content. If the event is only
+         # metadata, the event will be dropped. This was the behavior of the
+         # pre 1.5 plugin.
+         #
+         # The line needs to go through the codecs to replace unknown bytes
+         # in the log stream before doing a regexp match, or you will get a
+         # `Error: invalid byte sequence in UTF-8'.
+         if event_is_metadata?(event)
+           @logger.debug('Event is metadata, updating the current cloudfront metadata', :event => event)
+           update_metadata(metadata, event)
+         else
+           decorate(event)
+
+           event.set("cloudfront_version", metadata[:cloudfront_version]) unless metadata[:cloudfront_version].nil?
+           event.set("cloudfront_fields", metadata[:cloudfront_fields]) unless metadata[:cloudfront_fields].nil?
+
+           event.set("[@metadata][s3]", { "key" => key })
+
+           queue << event
+         end
+       end
+     end
+     # Ensure any stateful codecs (such as multiline) are flushed to the queue.
+     @codec.flush do |event|
+       queue << event
+     end
+
+     return true
+   end # def process_local_log
+
+   private
+   def event_is_metadata?(event)
+     return false unless event.get("message").is_a?(String)
+     line = event.get("message")
+     version_metadata?(line) || fields_metadata?(line)
+   end
+
+   private
+   def version_metadata?(line)
+     line.start_with?('#Version: ')
+   end
+
+   private
+   def fields_metadata?(line)
+     line.start_with?('#Fields: ')
+   end
+
+   private
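+   # Capture the value portion of a CloudFront metadata line into the shared
+   # metadata hash, e.g. a line "#Version: 1.0" yields
+   # metadata[:cloudfront_version] == "1.0", and a "#Fields: ..." line is
+   # stored the same way under :cloudfront_fields.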
+   def update_metadata(metadata, event)
+     line = event.get('message').strip
+
+     if version_metadata?(line)
+       metadata[:cloudfront_version] = line.split(/#Version: (.+)/).last
+     end
+
+     if fields_metadata?(line)
+       metadata[:cloudfront_fields] = line.split(/#Fields: (.+)/).last
+     end
+   end
+
+   private
+   def read_file(filename, &block)
+     if gzip?(filename)
+       read_gzip_file(filename, block)
+     else
+       read_plain_file(filename, block)
+     end
+   rescue => e
+     # Skip any broken file.
+     @logger.error("Failed to read the file. Skipping processing.", :filename => filename, :exception => e.message)
+   end
+
+   def read_plain_file(filename, block)
+     File.open(filename, 'rb') do |file|
+       file.each(&block)
+     end
+   end
+
+   private
+   def read_gzip_file(filename, block)
+     file_stream = FileInputStream.new(filename)
+     gzip_stream = GZIPInputStream.new(file_stream)
+     decoder = InputStreamReader.new(gzip_stream, "UTF-8")
+     buffered = BufferedReader.new(decoder)
+
+     while (line = buffered.readLine())
+       block.call(line)
+     end
+   ensure
+     buffered.close unless buffered.nil?
+     decoder.close unless decoder.nil?
+     gzip_stream.close unless gzip_stream.nil?
+     file_stream.close unless file_stream.nil?
+   end
+
+   private
+   def gzip?(filename)
+     filename.end_with?('.gz', '.gzip')
+   end
+
+   private
+   def sincedb
+     @sincedb ||= if @sincedb_path.nil?
+                    @logger.info("Using default generated file for the sincedb", :filename => sincedb_file)
+                    SinceDB::File.new(sincedb_file)
+                  else
+                    @logger.info("Using the provided sincedb_path", :sincedb_path => @sincedb_path)
+                    SinceDB::File.new(@sincedb_path)
+                  end
+   end
+
+   private
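+   # The default sincedb file name embeds an MD5 digest of "<bucket>+<prefix>"
+   # so that different bucket/prefix combinations keep separate state, e.g. a
+   # (hypothetical) bucket "my-app-logs" with prefix "production/" maps to
+   # {path.data}/plugins/inputs/s3/sincedb_<md5 of "my-app-logs+production/">.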
+   def sincedb_file
+     digest = Digest::MD5.hexdigest("#{@bucket}+#{@prefix}")
+     dir = File.join(LogStash::SETTINGS.get_value("path.data"), "plugins", "inputs", "s3")
+     FileUtils.mkdir_p(dir)
+     path = File.join(dir, "sincedb_#{digest}")
+
+     # Migrate the old default sincedb path to the new one.
+     if ENV["HOME"]
+       # This is the old file path including the old digest mechanism.
+       # It remains as a way to automatically upgrade users with the old default ($HOME)
+       # to the new default (path.data).
+       old = File.join(ENV["HOME"], ".sincedb_" + Digest::MD5.hexdigest("#{@bucket}+#{@prefix}"))
+       if File.exist?(old)
+         logger.info("Migrating old sincedb in $HOME to {path.data}")
+         FileUtils.mv(old, path)
+       end
+     end
+
+     path
+   end
+
+   def symbolized_settings
+     @symbolized_settings ||= symbolize(@additional_settings)
+   end
+
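+   # Recursively convert the string keys of `additional_settings` to symbols,
+   # since the AWS SDK expects symbol keys; e.g. a (hypothetical) setting
+   # { "force_path_style" => true } becomes { :force_path_style => true }.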
+   def symbolize(hash)
+     return hash unless hash.is_a?(Hash)
+     symbolized = {}
+     hash.each { |key, value| symbolized[key.to_sym] = symbolize(value) }
+     symbolized
+   end
+
+   private
+   def old_sincedb_file
+   end
+
+   private
+   def ignore_filename?(filename)
+     if @prefix == filename
+       return true
+     elsif filename.end_with?("/")
+       return true
+     elsif (@backup_add_prefix && @backup_to_bucket == @bucket && filename =~ /^#{backup_add_prefix}/)
+       return true
+     elsif @exclude_pattern.nil?
+       return false
+     elsif filename =~ Regexp.new(@exclude_pattern)
+       return true
+     else
+       return false
+     end
+   end
+
+   private
+   def process_log(queue, key)
+     object = @s3bucket.object(key)
+
+     filename = File.join(temporary_directory, File.basename(key))
+     if download_remote_file(object, filename)
+       if process_local_log(queue, filename, key)
+         lastmod = object.last_modified
+         backup_to_bucket(object)
+         backup_to_dir(filename)
+         delete_file_from_bucket(object)
+         FileUtils.remove_entry_secure(filename, true)
+         sincedb.write(lastmod)
+       end
+     else
+       FileUtils.remove_entry_secure(filename, true)
+     end
+   end
+
+   private
+   # Stream the remote file to the local disk.
+   #
+   # @param [S3Object] remote_object Reference to the remote S3 object to download
+   # @param [String] local_filename The temporary filename to stream to
+   # @return [Boolean] True if the file was completely downloaded
+   def download_remote_file(remote_object, local_filename)
+     completed = false
+     @logger.debug("S3 input: Download remote file", :remote_key => remote_object.key, :local_filename => local_filename)
+     File.open(local_filename, 'wb') do |s3file|
+       return completed if stop?
+       begin
+         remote_object.get(:response_target => s3file)
+         completed = true
+       rescue Aws::Errors::ServiceError => e
+         @logger.warn("S3 input: Unable to download remote file", :remote_key => remote_object.key, :message => e.message)
+       end
+     end
+     completed
+   end
+
+   private
+   def delete_file_from_bucket(object)
+     if @delete && @backup_to_bucket.nil?
+       object.delete()
+     end
+   end
+
+   private
+   def get_s3object
+     options = symbolized_settings.merge(aws_options_hash || {})
+     Aws::S3::Resource.new(options)
+   end
+
+   private
+   module SinceDB
+     class File
+       def initialize(file)
+         @sincedb_path = file
+       end
+
+       def newer?(date)
+         date > read
+       end
+
+       def read
+         if ::File.exist?(@sincedb_path)
+           content = ::File.read(@sincedb_path).chomp.strip
+           # If the file was created but nothing was written to it yet.
+           return content.empty? ? Time.new(0) : Time.parse(content)
+         else
+           return Time.new(0)
+         end
+       end
+
+       def write(since = nil)
+         since = Time.now() if since.nil?
+         ::File.open(@sincedb_path, 'w') { |file| file.write(since.to_s) }
+       end
+     end
+   end
+ end # class LogStash::Inputs::S3
@@ -0,0 +1,30 @@
+ Gem::Specification.new do |s|
+
+   s.name          = 'logstash-input-s3-local'
+   s.version       = '3.3.5'
+   s.licenses      = ['Apache-2.0']
+   s.summary       = "Streams events from files in an S3 bucket"
+   s.description   = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
+   s.authors       = ["Elastic"]
+   s.email         = 'info@elastic.co'
+   s.homepage      = "http://www.elastic.co/guide/en/logstash/current/index.html"
+   s.require_paths = ["lib"]
+
+   # Files
+   s.files = Dir["lib/**/*","spec/**/*","*.gemspec","*.md","CONTRIBUTORS","Gemfile","LICENSE","NOTICE.TXT", "vendor/jar-dependencies/**/*.jar", "vendor/jar-dependencies/**/*.rb", "VERSION", "docs/**/*"]
+
+   # Tests
+   s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+   # Special flag to let us know this is actually a logstash plugin
+   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "input" }
+
+   # Gem dependencies
+   s.add_runtime_dependency "logstash-core-plugin-api", ">= 2.1.12", "<= 2.99"
+   s.add_runtime_dependency 'logstash-mixin-aws', '>= 4.3.0'
+   s.add_runtime_dependency 'stud', '~> 0.0.18'
+   # s.add_runtime_dependency 'aws-sdk-resources', '>= 2.0.33'
+   s.add_development_dependency 'logstash-devutils'
+   s.add_development_dependency "logstash-codec-json"
+   s.add_development_dependency "logstash-codec-multiline"
+ end
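
Building and installing the gem locally follows the flow referenced in the description above; the gem filename below is what `gem build` would emit for this version, and $LS_HOME stands for the Logstash installation directory:

    gem build logstash-input-s3-local.gemspec
    $LS_HOME/bin/logstash-plugin install logstash-input-s3-local-3.3.5.gem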
@@ -0,0 +1,4 @@
+ #Version: 1.0
+ #Fields: date time x-edge-location c-ip x-event sc-bytes x-cf-status x-cf-client-id cs-uri-stem cs-uri-query c-referrer x-page-url c-user-agent x-sname x-sname-query x-file-ext x-sid
+ 2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+ 2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1
Binary file
Binary file
@@ -0,0 +1,2 @@
+ 2015-01-01T02:52:45.866722Z no "GET http://www.logstash.com:80/utfmadness/��4od HTTP/1.1"
+
@@ -0,0 +1,2 @@
+ { "hello": "world" }
+ { "hello": "awesome world" }
@@ -0,0 +1,2 @@
+ { "message": ["GET", 32, "/health"] }
+ { "message": true }
@@ -0,0 +1,6 @@
+ __SEPARATOR__
+ file:1 record:1 line:1
+ file:1 record:1 line:2
+ __SEPARATOR__
+ file:1 record:2 line:1
+ file:1 record:2 line:2
@@ -0,0 +1,2 @@
+ 2010-03-12 23:51:20 SEA4 192.0.2.147 connect 2014 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 - - - -
+ 2010-03-12 23:51:21 SEA4 192.0.2.222 play 3914 OK bfd8a98bee0840d9b871b7f6ade9908f rtmp://shqshne4jdp4b6.cloudfront.net/cfx/st key=value http://player.longtailvideo.com/player.swf http://www.longtailvideo.com/support/jw-player-setup-wizard?example=204 LNX%2010,0,32,18 myvideo p=2&q=4 flv 1