logstash-input-file 4.0.5 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -3
  3. data/JAR_VERSION +1 -0
  4. data/docs/index.asciidoc +195 -37
  5. data/lib/filewatch/bootstrap.rb +74 -0
  6. data/lib/filewatch/discoverer.rb +94 -0
  7. data/lib/filewatch/helper.rb +65 -0
  8. data/lib/filewatch/observing_base.rb +97 -0
  9. data/lib/filewatch/observing_read.rb +23 -0
  10. data/lib/filewatch/observing_tail.rb +22 -0
  11. data/lib/filewatch/read_mode/handlers/base.rb +81 -0
  12. data/lib/filewatch/read_mode/handlers/read_file.rb +47 -0
  13. data/lib/filewatch/read_mode/handlers/read_zip_file.rb +57 -0
  14. data/lib/filewatch/read_mode/processor.rb +117 -0
  15. data/lib/filewatch/settings.rb +67 -0
  16. data/lib/filewatch/sincedb_collection.rb +215 -0
  17. data/lib/filewatch/sincedb_record_serializer.rb +70 -0
  18. data/lib/filewatch/sincedb_value.rb +87 -0
  19. data/lib/filewatch/tail_mode/handlers/base.rb +124 -0
  20. data/lib/filewatch/tail_mode/handlers/create.rb +17 -0
  21. data/lib/filewatch/tail_mode/handlers/create_initial.rb +21 -0
  22. data/lib/filewatch/tail_mode/handlers/delete.rb +11 -0
  23. data/lib/filewatch/tail_mode/handlers/grow.rb +11 -0
  24. data/lib/filewatch/tail_mode/handlers/shrink.rb +20 -0
  25. data/lib/filewatch/tail_mode/handlers/timeout.rb +10 -0
  26. data/lib/filewatch/tail_mode/handlers/unignore.rb +37 -0
  27. data/lib/filewatch/tail_mode/processor.rb +209 -0
  28. data/lib/filewatch/watch.rb +107 -0
  29. data/lib/filewatch/watched_file.rb +226 -0
  30. data/lib/filewatch/watched_files_collection.rb +84 -0
  31. data/lib/filewatch/winhelper.rb +65 -0
  32. data/lib/jars/filewatch-1.0.0.jar +0 -0
  33. data/lib/logstash/inputs/delete_completed_file_handler.rb +9 -0
  34. data/lib/logstash/inputs/file.rb +162 -107
  35. data/lib/logstash/inputs/file_listener.rb +61 -0
  36. data/lib/logstash/inputs/log_completed_file_handler.rb +13 -0
  37. data/logstash-input-file.gemspec +5 -4
  38. data/spec/filewatch/buftok_spec.rb +24 -0
  39. data/spec/filewatch/reading_spec.rb +128 -0
  40. data/spec/filewatch/sincedb_record_serializer_spec.rb +71 -0
  41. data/spec/filewatch/spec_helper.rb +120 -0
  42. data/spec/filewatch/tailing_spec.rb +440 -0
  43. data/spec/filewatch/watched_file_spec.rb +38 -0
  44. data/spec/filewatch/watched_files_collection_spec.rb +73 -0
  45. data/spec/filewatch/winhelper_spec.rb +22 -0
  46. data/spec/fixtures/compressed.log.gz +0 -0
  47. data/spec/fixtures/compressed.log.gzip +0 -0
  48. data/spec/fixtures/invalid_utf8.gbk.log +2 -0
  49. data/spec/fixtures/no-final-newline.log +2 -0
  50. data/spec/fixtures/uncompressed.log +2 -0
  51. data/spec/{spec_helper.rb → helpers/spec_helper.rb} +14 -41
  52. data/spec/inputs/file_read_spec.rb +155 -0
  53. data/spec/inputs/{file_spec.rb → file_tail_spec.rb} +55 -52
  54. metadata +96 -28
@@ -0,0 +1,94 @@
1
+ # encoding: utf-8
2
+ require "logstash/util/loggable"
3
+
4
module FileWatch
  # Given a path or glob, prepares for and discovers files to watch.
  # Files that are not excluded are added to the watched_files collection
  # and associated with a sincedb entry if one can be found. Files that are
  # ignorable (too old) are still added, but in the ignored state.
  class Discoverer
    include LogStash::Util::Loggable

    # watched_files_collection - collection that newly discovered files are added to
    # sincedb_collection       - collection asked to associate each new watched file
    # settings                 - supplies `exclude` and `ignore_older`; also passed to each new WatchedFile
    def initialize(watched_files_collection, sincedb_collection, settings)
      @watching = []
      @exclude = []
      @watched_files_collection = watched_files_collection
      @sincedb_collection = sincedb_collection
      @settings = settings
      @settings.exclude.each { |p| @exclude << p }
    end

    # Registers a path or glob and immediately runs a discovery pass for it.
    # Returns self, or nil when the path was already registered.
    def add_path(path)
      return if @watching.member?(path)
      @watching << path
      discover_files(path)
      self
    end

    # Re-runs discovery for every registered path or glob.
    def discover
      @watching.each do |path|
        discover_files(path)
      end
    end

    private

    # Returns true (and unwatches the file) when the file name matches one of
    # the exclude patterns, false otherwise.
    #
    # Patterns are matched against the file name only, not the full path.
    # Matching against the full path makes a pattern that is anchored at the
    # start of the name (e.g. "app*.log") never match an absolute path.
    def can_exclude?(watched_file, new_discovery)
      filename = watched_file.pathname.basename
      @exclude.each do |pattern|
        if filename.fnmatch?(pattern)
          if new_discovery
            logger.debug("Discoverer can_exclude?: #{watched_file.path}: skipping " +
              "because it matches exclude #{pattern}")
          end
          watched_file.unwatch
          return true
        end
      end
      false
    end

    # Expands the path or glob and registers each regular, non-symlinked file
    # that is not already unwatched or excluded.
    def discover_files(path)
      globbed = Dir.glob(path)
      # a glob with no matches is retried as a literal path
      globbed = [path] if globbed.empty?
      logger.debug("Discoverer found files, count: #{globbed.size}")
      globbed.each do |file|
        logger.debug("Discoverer found file, path: #{file}")
        pathname = Pathname.new(file)
        next unless pathname.file?
        next if pathname.symlink?
        new_discovery = false
        watched_file = @watched_files_collection.watched_file_by_path(file)
        if watched_file.nil?
          logger.debug("Discoverer discover_files: #{path}: new: #{file} (exclude is #{@exclude.inspect})")
          new_discovery = true
          watched_file = WatchedFile.new(pathname, pathname.stat, @settings)
        end
        # if it is already unwatched or it is excluded then we can skip
        next if watched_file.unwatched? || can_exclude?(watched_file, new_discovery)

        if new_discovery
          if watched_file.file_ignorable?
            logger.debug("Discoverer discover_files: #{file}: skipping because it was last modified more than #{@settings.ignore_older} seconds ago")
            # on discovery ignorable watched_files are put into the ignored state and that
            # updates the size from the internal stat
            # so the existing contents are not read.
            # because, normally, a newly discovered file will
            # have a watched_file size of zero
            # they are still added to the collection so we know they are there for the next periodic discovery
            watched_file.ignore
          end
          # now add the discovered file to the watched_files collection and adjust the sincedb collections
          @watched_files_collection.add(watched_file)
          # initially when the sincedb collection is filled with records from the persistence file
          # each value is not associated with a watched file
          # a sincedb_value can be:
          #   unassociated
          #   associated with this watched_file
          #   associated with a different watched_file
          @sincedb_collection.associate(watched_file)
        end
        # at this point the watched file is created, is in the db but not yet opened or being processed
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
+ # encoding: utf-8
2
+ # code downloaded from Ruby on Rails 4.2.1
3
+ # https://raw.githubusercontent.com/rails/rails/v4.2.1/activesupport/lib/active_support/core_ext/file/atomic.rb
4
+ # change method name to avoid borking active_support and vice versa
5
+ require 'fileutils'
6
+
7
module FileHelper
  extend self

  # Write to a file atomically. Useful for situations where you don't
  # want other processes or threads to see half-written files.
  #
  #   FileHelper.write_atomically('important.file') do |file|
  #     file.write('hello')
  #   end
  #
  # The content is written to a temporary file next to +file_name+ which is
  # then renamed over it. Yields the open temporary File (in binary mode) and
  # returns the block's value. If the block or the rename raises, the
  # temporary file is closed and removed, the destination is left untouched
  # and the exception propagates to the caller.
  def write_atomically(file_name)
    old_stat =
      if File.exist?(file_name)
        # Get original file permissions
        File.stat(file_name)
      else
        # If not possible, probe which are the default permissions in the
        # destination directory.
        probe_stat_in(File.dirname(file_name))
      end

    mode = old_stat ? old_stat.mode : nil

    # Create temporary file with identical permissions
    temp_file = File.new(rand_filename(file_name), "w", mode)
    renamed = false
    begin
      temp_file.binmode
      return_val = yield temp_file
      temp_file.close

      # Overwrite original file with temp file
      File.rename(temp_file.path, file_name)
      renamed = true
    ensure
      # never leak the handle or leave a half-written temp file behind
      temp_file.close unless temp_file.closed?
      FileUtils.rm_f(temp_file.path) unless renamed
    end

    # Unable to get permissions of the original file => return
    return return_val if old_stat.nil?

    # Set correct uid/gid on new file
    File.chown(old_stat.uid, old_stat.gid, file_name)

    return_val
  end

  # True when +file_name+ is a character or block device.
  def device?(file_name)
    File.chardev?(file_name) || File.blockdev?(file_name)
  end

  # Private utility method.
  # Creates a throwaway file in +dir+ and returns its File::Stat so the
  # caller can learn the default mode/owner for new files there.
  # Returns nil when the probe fails; the probe file is always removed.
  def probe_stat_in(dir) #:nodoc:
    file_name = File.join(dir, rand_filename(".permissions_check"))
    FileUtils.touch(file_name)
    File.stat(file_name)
  rescue StandardError
    nil
  ensure
    # file_name is nil when the failure happened before it was assigned
    FileUtils.rm_f(file_name) if file_name
  end

  # Builds a name from the prefix, the current thread's object id, the
  # process id and a random number, joined with dots.
  def rand_filename(prefix)
    [ prefix, Thread.current.object_id, Process.pid, rand(1000000) ].join('.')
  end
end
@@ -0,0 +1,97 @@
1
+ # encoding: utf-8
2
+
3
+ ## Interface API topology
4
+ # ObservingBase module (this file)
5
+ # is a module mixin providing a common constructor and external API for File Input Plugin interaction
6
+ # calls build_specific_processor on ObservingRead or ObservingTail
7
+ # ObservingRead and ObservingTail
8
+ # provides the External API method subscribe(observer = NullObserver.new)
9
+ # build_specific_processor(settings) - provide a Tail or Read specific Processor.
10
+ # TailMode::Processor or ReadMode::Processor
11
+ # initialize_handlers(sincedb_collection, observer) - called when the observer subscribes to changes in a Mode,
12
+ # builds mode specific handler instances with references to the observer
13
+ # process_closed(watched_files) - provide specific processing of watched_files in the closed state
14
+ # process_ignored(watched_files) - provide specific processing of watched_files in the ignored state
15
+ # process_watched(watched_files) - provide specific processing of watched_files in the watched state
16
+ # process_active(watched_files) - provide specific processing of watched_files in the active state
17
+ # These methods can call "handler" methods that delegate to the specific Handler classes.
18
+ # TailMode::Handlers module namespace
19
+ # contains the Handler classes that deal with Tail mode file lifecycle "events".
20
+ # The TailMode::Handlers::Base
21
+ # handle(watched_file) - this method calls handle_specifically defined in a subclass
22
+ # handle_specifically(watched_file) - this is a noop method
23
+ # update_existing_specifically(watched_file, sincedb_value) - this is a noop method
24
+ # Each handler extends the Base class to provide specific implementations of these two methods:
25
+ # handle_specifically(watched_file)
26
+ # update_existing_specifically(watched_file, sincedb_value)
27
+ # ReadMode::Handlers module namespace
28
+ # contains the Handler classes that deal with Read mode file lifecycle "events".
29
+ # The ReadMode::Handlers::Base
30
+ # handle(watched_file) - this method calls handle_specifically defined in a subclass
31
+ # handle_specifically(watched_file) - this is a noop method
32
+ # Each handler extends the Base class to provide specific implementations of this method:
33
+ # handle_specifically(watched_file)
34
+
35
module FileWatch
  # Mixin supplying the shared constructor and the external API used by the
  # file input. The including class must provide `logger` and
  # `build_specific_processor(settings)`.
  module ObservingBase
    attr_reader :watch, :sincedb_collection, :settings

    # Merges the caller's options over the defaults, insists on a
    # :sincedb_path entry, then builds the settings and the watch machinery.
    def initialize(opts={})
      defaults = {
        :sincedb_write_interval => 10,
        :stat_interval => 1,
        :discover_interval => 5,
        :exclude => [],
        :start_new_files_at => :end,
        :delimiter => "\n",
        :file_chunk_count => FIXNUM_MAX,
        :file_sort_by => "last_modified",
        :file_sort_direction => "asc",
      }
      options = defaults.merge(opts)
      if !options.include?(:sincedb_path)
        raise NoSinceDBPathGiven.new("No sincedb_path set in options. This should have been added in the main LogStash::Inputs::File class")
      end
      @settings = Settings.from_options(options)
      build_watch_and_dependencies
    end

    # Creates the collections, opens the sincedb collection, and wires a
    # Discoverer and the mode specific processor into a new Watch.
    def build_watch_and_dependencies
      logger.info("START, creating Discoverer, Watch with file and sincedb collections")
      files = WatchedFilesCollection.new(@settings)
      @sincedb_collection = SincedbCollection.new(@settings)
      @sincedb_collection.open
      file_discoverer = Discoverer.new(files, @sincedb_collection, @settings)
      @watch = Watch.new(file_discoverer, files, @settings)
      @watch.add_processor(build_specific_processor(@settings))
    end

    # Hands a path (or glob) to the watch.
    def watch_this(path)
      @watch.watch(path)
    end

    # Writes the sincedb collection; may be called from the file input.
    def sincedb_write(reason=nil)
      @sincedb_collection.write(reason)
    end

    # A sort-of finalizer: call it for clean up before the instance is
    # disposed of. The watch is expected to close all the files.
    def quit
      logger.info("QUIT - closing all files and shutting down.")
      @watch.quit
    end

    # For external code that knows it is completely done with a file.
    # Other files or folders may still be being watched.
    # Caution: once unwatched, a file can't be watched again unless a new
    # instance of this class begins watching again.
    # The sysadmin should rename, move or delete the file.
    def close_file(path)
      @watch.unwatch(path)
      sincedb_write
    end
  end
end
@@ -0,0 +1,23 @@
1
+ # encoding: utf-8
2
+ require "logstash/util/loggable"
3
+ require_relative "read_mode/processor"
4
+
5
module FileWatch
  # Read mode flavour of the observing API: supplies the read mode processor
  # and the subscribe entry point.
  class ObservingRead
    include LogStash::Util::Loggable
    include ObservingBase

    # Subscribes the observer (the file input) to the watch, and writes the
    # sincedb collection once that call returns.
    def subscribe(observer)
      collection = sincedb_collection
      watch.subscribe(observer, collection)
      collection.write("read mode subscribe complete - shutting down")
    end

    private

    # Processor used by the watch in read mode.
    def build_specific_processor(settings)
      ReadMode::Processor.new(settings)
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # encoding: utf-8
2
+ require "logstash/util/loggable"
3
+ require_relative 'tail_mode/processor'
4
+
5
module FileWatch
  # Tail mode flavour of the observing API: supplies the tail mode processor
  # and the subscribe entry point.
  class ObservingTail
    include LogStash::Util::Loggable
    include ObservingBase

    # Subscribes the observer (the file input) to the watch, and writes the
    # sincedb collection once that call returns.
    def subscribe(observer)
      collection = sincedb_collection
      watch.subscribe(observer, collection)
      collection.write("tail mode subscribe complete - shutting down")
    end

    private

    # Processor used by the watch in tail mode.
    def build_specific_processor(settings)
      TailMode::Processor.new(settings)
    end
  end
end
@@ -0,0 +1,81 @@
1
+ # encoding: utf-8
2
+ require "logstash/util/loggable"
3
+
4
module FileWatch module ReadMode module Handlers
  # Common behaviour for the read mode handlers: attaches the observer as the
  # watched file's listener, then delegates to the subclass specific handling.
  # Also provides helpers for opening files and for keeping the sincedb
  # collection in step with a watched file.
  class Base
    include LogStash::Util::Loggable

    attr_reader :sincedb_collection

    def initialize(sincedb_collection, observer, settings)
      @settings = settings
      @sincedb_collection = sincedb_collection
      @observer = observer
    end

    # Entry point: sets the listener if the watched file has none yet and
    # then calls handle_specifically.
    def handle(watched_file)
      logger.debug("handling: #{watched_file.path}")
      unless watched_file.has_listener?
        watched_file.set_listener(@observer)
      end
      handle_specifically(watched_file)
    end

    def handle_specifically(watched_file)
      # some handlers don't need to define this method
    end

    private

    # Opens the watched file unless it is already open.
    # Returns true when the file is open afterwards, false otherwise.
    # A failed open is reported at warn level at most once per
    # OPEN_WARN_INTERVAL seconds per file, and the file is put back into the
    # watch state so the open is attempted again later.
    def open_file(watched_file)
      return true if watched_file.file_open?
      logger.debug("opening #{watched_file.path}")
      begin
        watched_file.open
      rescue => e
        # don't emit this message too often. if a file that we can't
        # read is changing a lot, we'll try to open it more often, and spam the logs.
        now = Time.now.to_i
        # diagnostic only: logging this at warn on every failure would defeat
        # the throttling below
        logger.debug("opening OPEN_WARN_INTERVAL is '#{OPEN_WARN_INTERVAL}'")
        if watched_file.last_open_warning_at.nil? || now - watched_file.last_open_warning_at > OPEN_WARN_INTERVAL
          logger.warn("failed to open #{watched_file.path}: #{e.inspect}, #{e.backtrace.take(3)}")
          watched_file.last_open_warning_at = now
        else
          logger.debug("suppressed warning for `failed to open` #{watched_file.path}: #{e.inspect}")
        end
        watched_file.watch # set it back to watch so we can try it again
      end
      if watched_file.file_open?
        watched_file.listener.opened
        true
      else
        false
      end
    end

    # Adds a sincedb value for the watched file when none is found, or syncs
    # the watched file from the existing value when that value belongs to
    # this watched file. A value belonging to another watched file is only
    # logged. In every case the watched file is marked initial_completed.
    def add_or_update_sincedb_collection(watched_file)
      sincedb_value = @sincedb_collection.find(watched_file)
      if sincedb_value.nil?
        add_new_value_sincedb_collection(watched_file)
      elsif sincedb_value.watched_file == watched_file
        update_existing_sincedb_collection_value(watched_file, sincedb_value)
      else
        logger.warn? && logger.warn("mismatch on sincedb_value.watched_file, this should have been handled by Discoverer")
      end
      watched_file.initial_completed
    end

    def update_existing_sincedb_collection_value(watched_file, sincedb_value)
      logger.debug("update_existing_sincedb_collection_value: #{watched_file.path}, last value #{sincedb_value.position}, cur size #{watched_file.last_stat_size}")
      # sincedb_value is the source of truth
      watched_file.update_bytes_read(sincedb_value.position)
    end

    # Stores a new sincedb value at position 0, linked to the watched file,
    # under the watched file's sincedb key.
    def add_new_value_sincedb_collection(watched_file)
      sincedb_value = SincedbValue.new(0)
      sincedb_value.set_watched_file(watched_file)
      logger.debug("add_new_value_sincedb_collection: #{watched_file.path}", "position" => sincedb_value.position)
      sincedb_collection.set(watched_file.sincedb_key, sincedb_value)
    end
  end
end end end
@@ -0,0 +1,47 @@
1
+ # encoding: utf-8
2
+
3
module FileWatch module ReadMode module Handlers
  # Read mode handler for plain files: reads up to `file_chunk_count` chunks
  # of `file_chunk_size` bytes, passing each extracted line to the listener.
  class ReadFile < Base
    def handle_specifically(watched_file)
      return unless open_file(watched_file)

      key_known = sincedb_collection.member?(watched_file.sincedb_key)
      add_or_update_sincedb_collection(watched_file) unless key_known
      # if the `file_chunk_count` * `file_chunk_size` is less than the file size
      # then this method will be executed multiple times
      # and the seek is moved to just after a line boundary as recorded in the sincedb
      # for each run - so we reset the buffer
      watched_file.reset_buffer
      watched_file.file_seek(watched_file.bytes_read)
      flush_needed = false
      @settings.file_chunk_count.times do
        begin
          chunk = watched_file.file_read(@settings.file_chunk_size)
          extracted = watched_file.buffer_extract(chunk)
          if extracted.empty?
            logger.warn("read_to_eof: no delimiter found in current chunk")
          end
          flush_needed = true
          extracted.each do |text|
            watched_file.listener.accept(text)
            consumed = text.bytesize + @settings.delimiter_byte_size
            sincedb_collection.increment(watched_file.sincedb_key, consumed)
          end
        rescue EOFError
          # flush the buffer now in case there is no final delimiter
          remainder = watched_file.buffer.flush
          watched_file.listener.accept(remainder) unless remainder.empty?
          watched_file.listener.eof
          watched_file.file_close
          sincedb_collection.unset_watched_file(watched_file)
          watched_file.listener.deleted
          watched_file.unwatch
          break
        rescue Errno::EWOULDBLOCK, Errno::EINTR
          watched_file.listener.error
          break
        rescue => e
          logger.error("read_to_eof: general error reading #{watched_file.path} - error: #{e.inspect}")
          watched_file.listener.error
          break
        end
      end
      sincedb_collection.request_disk_flush if flush_needed
    end
  end
end end end
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+ require 'java'
3
+ java_import java.io.InputStream
4
+ java_import java.io.InputStreamReader
5
+ java_import java.io.FileInputStream
6
+ java_import java.io.BufferedReader
7
+ java_import java.util.zip.GZIPInputStream
8
+ java_import java.util.zip.ZipException
9
+
10
module FileWatch module ReadMode module Handlers
  # Read mode handler for gzip compressed files. The file is decompressed
  # and read to the end in one pass, each line being passed to the listener.
  class ReadZipFile < Base
    def handle_specifically(watched_file)
      add_or_update_sincedb_collection(watched_file) unless sincedb_collection.member?(watched_file.sincedb_key)
      # can't really stripe read a zip file, its all or nothing.
      watched_file.listener.opened
      # what do we do about quit when we have just begun reading the zipped file (e.g. pipeline reloading)
      # should we track lines read in the sincedb and
      # fast forward through the lines until we reach unseen content?
      # meaning that we can quit in the middle of a zip file
      begin
        file_stream = FileInputStream.new(watched_file.path)
        gzip_stream = GZIPInputStream.new(file_stream)
        decoder = InputStreamReader.new(gzip_stream, "UTF-8")
        buffered = BufferedReader.new(decoder)
        # use the public no-argument readLine(); the readLine(boolean)
        # overload is not part of BufferedReader's public API
        while (line = buffered.readLine)
          watched_file.listener.accept(line)
        end
        watched_file.listener.eof
      rescue ZipException
        logger.error("Cannot decompress the gzip file at path: #{watched_file.path}")
        watched_file.listener.error
      else
        # whole file was read: record the stat size as the last read position
        sincedb_collection.store_last_read(watched_file.sincedb_key, watched_file.last_stat_size)
        sincedb_collection.request_disk_flush
        watched_file.listener.deleted
        watched_file.unwatch
      ensure
        # rescue each close individually so all close attempts are tried
        close_and_ignore_ioexception(buffered) unless buffered.nil?
        close_and_ignore_ioexception(decoder) unless decoder.nil?
        close_and_ignore_ioexception(gzip_stream) unless gzip_stream.nil?
        close_and_ignore_ioexception(file_stream) unless file_stream.nil?
      end
      sincedb_collection.unset_watched_file(watched_file)
    end

    private

    # Closes the given object, logging (not raising) anything thrown by close.
    def close_and_ignore_ioexception(closeable)
      begin
        closeable.close
      rescue Exception => e # IOException can be thrown by any of the Java classes that implement the Closable interface.
        logger.warn("Ignoring an IOException when closing an instance of #{closeable.class.name}", "exception" => e)
      end
    end
  end
end end end