logstash-input-file 4.0.5 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -3
  3. data/JAR_VERSION +1 -0
  4. data/docs/index.asciidoc +195 -37
  5. data/lib/filewatch/bootstrap.rb +74 -0
  6. data/lib/filewatch/discoverer.rb +94 -0
  7. data/lib/filewatch/helper.rb +65 -0
  8. data/lib/filewatch/observing_base.rb +97 -0
  9. data/lib/filewatch/observing_read.rb +23 -0
  10. data/lib/filewatch/observing_tail.rb +22 -0
  11. data/lib/filewatch/read_mode/handlers/base.rb +81 -0
  12. data/lib/filewatch/read_mode/handlers/read_file.rb +47 -0
  13. data/lib/filewatch/read_mode/handlers/read_zip_file.rb +57 -0
  14. data/lib/filewatch/read_mode/processor.rb +117 -0
  15. data/lib/filewatch/settings.rb +67 -0
  16. data/lib/filewatch/sincedb_collection.rb +215 -0
  17. data/lib/filewatch/sincedb_record_serializer.rb +70 -0
  18. data/lib/filewatch/sincedb_value.rb +87 -0
  19. data/lib/filewatch/tail_mode/handlers/base.rb +124 -0
  20. data/lib/filewatch/tail_mode/handlers/create.rb +17 -0
  21. data/lib/filewatch/tail_mode/handlers/create_initial.rb +21 -0
  22. data/lib/filewatch/tail_mode/handlers/delete.rb +11 -0
  23. data/lib/filewatch/tail_mode/handlers/grow.rb +11 -0
  24. data/lib/filewatch/tail_mode/handlers/shrink.rb +20 -0
  25. data/lib/filewatch/tail_mode/handlers/timeout.rb +10 -0
  26. data/lib/filewatch/tail_mode/handlers/unignore.rb +37 -0
  27. data/lib/filewatch/tail_mode/processor.rb +209 -0
  28. data/lib/filewatch/watch.rb +107 -0
  29. data/lib/filewatch/watched_file.rb +226 -0
  30. data/lib/filewatch/watched_files_collection.rb +84 -0
  31. data/lib/filewatch/winhelper.rb +65 -0
  32. data/lib/jars/filewatch-1.0.0.jar +0 -0
  33. data/lib/logstash/inputs/delete_completed_file_handler.rb +9 -0
  34. data/lib/logstash/inputs/file.rb +162 -107
  35. data/lib/logstash/inputs/file_listener.rb +61 -0
  36. data/lib/logstash/inputs/log_completed_file_handler.rb +13 -0
  37. data/logstash-input-file.gemspec +5 -4
  38. data/spec/filewatch/buftok_spec.rb +24 -0
  39. data/spec/filewatch/reading_spec.rb +128 -0
  40. data/spec/filewatch/sincedb_record_serializer_spec.rb +71 -0
  41. data/spec/filewatch/spec_helper.rb +120 -0
  42. data/spec/filewatch/tailing_spec.rb +440 -0
  43. data/spec/filewatch/watched_file_spec.rb +38 -0
  44. data/spec/filewatch/watched_files_collection_spec.rb +73 -0
  45. data/spec/filewatch/winhelper_spec.rb +22 -0
  46. data/spec/fixtures/compressed.log.gz +0 -0
  47. data/spec/fixtures/compressed.log.gzip +0 -0
  48. data/spec/fixtures/invalid_utf8.gbk.log +2 -0
  49. data/spec/fixtures/no-final-newline.log +2 -0
  50. data/spec/fixtures/uncompressed.log +2 -0
  51. data/spec/{spec_helper.rb → helpers/spec_helper.rb} +14 -41
  52. data/spec/inputs/file_read_spec.rb +155 -0
  53. data/spec/inputs/{file_spec.rb → file_tail_spec.rb} +55 -52
  54. metadata +96 -28
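
Of the files above, the new `read_mode` handlers, the `delete_completed_file_handler.rb` / `log_completed_file_handler.rb` pair, and the expanded `data/docs/index.asciidoc` appear to make up the headline 4.1.0 feature: a "read" mode alongside the existing tailing behaviour. As a rough illustration only (the option names are taken from the 4.1.0 documentation, not from this excerpt of the diff), a pipeline using read mode to consume archived files, including gzipped ones, might look like:

```
input {
  file {
    path => ["/var/log/archive/*.log", "/var/log/archive/*.log.gz"]
    mode => "read"
    file_completed_action => "log"
    file_completed_log_path => "/var/log/archive/completed.log"
    sincedb_path => "/var/lib/logstash/sincedb-archive"
  }
}
```

In read mode each discovered file is treated as finite content to be consumed once (see `read_mode/handlers/read_file.rb` and `read_zip_file.rb` below); the default tail mode keeps following files as they grow.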

data/lib/filewatch/discoverer.rb
@@ -0,0 +1,94 @@
+# encoding: utf-8
+require "logstash/util/loggable"
+
+module FileWatch
+  class Discoverer
+    # given a path or glob will prepare for and discover files to watch
+    # if they are not excluded or ignorable
+    # they are added to the watched_files collection and
+    # associated with a sincedb entry if one can be found
+    include LogStash::Util::Loggable
+
+    def initialize(watched_files_collection, sincedb_collection, settings)
+      @watching = []
+      @exclude = []
+      @watched_files_collection = watched_files_collection
+      @sincedb_collection = sincedb_collection
+      @settings = settings
+      @settings.exclude.each { |p| @exclude << p }
+    end
+
+    def add_path(path)
+      return if @watching.member?(path)
+      @watching << path
+      discover_files(path)
+      self
+    end
+
+    def discover
+      @watching.each do |path|
+        discover_files(path)
+      end
+    end
+
+    private
+
+    def can_exclude?(watched_file, new_discovery)
+      @exclude.each do |pattern|
+        if watched_file.pathname.fnmatch?(pattern)
+          if new_discovery
+            logger.debug("Discoverer can_exclude?: #{watched_file.path}: skipping " +
+              "because it matches exclude #{pattern}")
+          end
+          watched_file.unwatch
+          return true
+        end
+      end
+      false
+    end
+
+    def discover_files(path)
+      globbed = Dir.glob(path)
+      globbed = [path] if globbed.empty?
+      logger.debug("Discoverer found files, count: #{globbed.size}")
+      globbed.each do |file|
+        logger.debug("Discoverer found file, path: #{file}")
+        pathname = Pathname.new(file)
+        next unless pathname.file?
+        next if pathname.symlink?
+        new_discovery = false
+        watched_file = @watched_files_collection.watched_file_by_path(file)
+        if watched_file.nil?
+          logger.debug("Discoverer discover_files: #{path}: new: #{file} (exclude is #{@exclude.inspect})")
+          new_discovery = true
+          watched_file = WatchedFile.new(pathname, pathname.stat, @settings)
+        end
+        # if it already unwatched or its excluded then we can skip
+        next if watched_file.unwatched? || can_exclude?(watched_file, new_discovery)
+
+        if new_discovery
+          if watched_file.file_ignorable?
+            logger.debug("Discoverer discover_files: #{file}: skipping because it was last modified more than #{@settings.ignore_older} seconds ago")
+            # on discovery ignorable watched_files are put into the ignored state and that
+            # updates the size from the internal stat
+            # so the existing contents are not read.
+            # because, normally, a newly discovered file will
+            # have a watched_file size of zero
+            # they are still added to the collection so we know they are there for the next periodic discovery
+            watched_file.ignore
+          end
+          # now add the discovered file to the watched_files collection and adjust the sincedb collections
+          @watched_files_collection.add(watched_file)
+          # initially when the sincedb collection is filled with records from the persistence file
+          # each value is not associated with a watched file
+          # a sincedb_value can be:
+          #   unassociated
+          #   associated with this watched_file
+          #   associated with a different watched_file
+          @sincedb_collection.associate(watched_file)
+        end
+        # at this point the watched file is created, is in the db but not yet opened or being processed
+      end
+    end
+  end
+end
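
For orientation, the sketch below shows roughly how a Discoverer is wired up and driven. It is illustrative only, not code from this release; the construction mirrors `build_watch_and_dependencies` in `observing_base.rb` further down, and assumes `Settings.from_options` accepts the same option keys shown there.

```ruby
# Illustrative wiring only - mirrors observing_base.rb below.
settings = FileWatch::Settings.from_options(
  :sincedb_path => "/tmp/sincedb",   # required by the public API
  :exclude => ["*.gz"],
  :stat_interval => 1,
  :discover_interval => 5
)
watched_files = FileWatch::WatchedFilesCollection.new(settings)
sincedb = FileWatch::SincedbCollection.new(settings)
sincedb.open

discoverer = FileWatch::Discoverer.new(watched_files, sincedb, settings)
discoverer.add_path("/var/log/*.log")  # globs are expanded immediately...
discoverer.discover                    # ...and re-expanded on every periodic discovery
```

Files matching an `exclude` pattern are unwatched, files older than `ignore_older` enter the ignored state with their size pre-recorded (so existing content is skipped), and everything else is added to the collection and associated with a sincedb entry.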

data/lib/filewatch/helper.rb
@@ -0,0 +1,65 @@
+# encoding: utf-8
+# code downloaded from Ruby on Rails 4.2.1
+# https://raw.githubusercontent.com/rails/rails/v4.2.1/activesupport/lib/active_support/core_ext/file/atomic.rb
+# change method name to avoid borking active_support and vice versa
+require 'fileutils'
+
+module FileHelper
+  extend self
+  # Write to a file atomically. Useful for situations where you don't
+  # want other processes or threads to see half-written files.
+  #
+  #   File.write_atomically('important.file') do |file|
+  #     file.write('hello')
+  #   end
+  def write_atomically(file_name)
+
+    if File.exist?(file_name)
+      # Get original file permissions
+      old_stat = File.stat(file_name)
+    else
+      # If not possible, probe which are the default permissions in the
+      # destination directory.
+      old_stat = probe_stat_in(File.dirname(file_name))
+    end
+
+    mode = old_stat ? old_stat.mode : nil
+
+    # Create temporary file with identical permissions
+    temp_file = File.new(rand_filename(file_name), "w", mode)
+    temp_file.binmode
+    return_val = yield temp_file
+    temp_file.close
+
+    # Overwrite original file with temp file
+    File.rename(temp_file.path, file_name)
+
+    # Unable to get permissions of the original file => return
+    return return_val if old_stat.nil?
+
+    # Set correct uid/gid on new file
+    File.chown(old_stat.uid, old_stat.gid, file_name) if old_stat
+
+    return_val
+  end
+
+  def device?(file_name)
+    File.chardev?(file_name) || File.blockdev?(file_name)
+  end
+
+  # Private utility method.
+  def probe_stat_in(dir) #:nodoc:
+    basename = rand_filename(".permissions_check")
+    file_name = File.join(dir, basename)
+    FileUtils.touch(file_name)
+    File.stat(file_name)
+  rescue
+    # ...
+  ensure
+    FileUtils.rm_f(file_name) if File.exist?(file_name)
+  end
+
+  def rand_filename(prefix)
+    [ prefix, Thread.current.object_id, Process.pid, rand(1000000) ].join('.')
+  end
+end
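
The comment block above already shows the intended call shape; concretely, a minimal usage sketch (not part of the diff, and presumably vendored here for atomic sincedb writes):

```ruby
require "fileutils"
require "filewatch/helper"   # load path assumed from this gem's layout

# The block writes to a temp file beside the target, which is then renamed
# into place, so concurrent readers never observe a half-written file.
FileHelper.write_atomically("important.file") do |file|
  file.write("hello")
end
```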

data/lib/filewatch/observing_base.rb
@@ -0,0 +1,97 @@
+# encoding: utf-8
+
+## Interface API topology
+# ObservingBase module (this file)
+#   is a module mixin proving common constructor and external API for File Input Plugin interaction
+#   calls build_specific_processor on ObservingRead or ObservingTail
+# ObservingRead and ObservingTail
+#   provides the External API method subscribe(observer = NullObserver.new)
+#   build_specific_processor(settings) - provide a Tail or Read specific Processor.
+# TailMode::Processor or ReadMode::Processor
+#   initialize_handlers(sincedb_collection, observer) - called when the observer subscribes to changes in a Mode,
+#     builds mode specific handler instances with references to the observer
+#   process_closed(watched_files) - provide specific processing of watched_files in the closed state
+#   process_ignored(watched_files) - provide specific processing of watched_files in the ignored state
+#   process_watched(watched_files) - provide specific processing of watched_files in the watched state
+#   process_active(watched_files) - provide specific processing of watched_files in the active state
+#   These methods can call "handler" methods that delegate to the specific Handler classes.
+# TailMode::Handlers module namespace
+#   contains the Handler classes that deals with Tail mode file lifecycle "events".
+# The TailMode::Handlers::Base
+#   handle(watched_file) - this method calls handle_specifically defined in a subclass
+#   handle_specifically(watched_file) - this is a noop method
+#   update_existing_specifically(watched_file, sincedb_value) - this is a noop method
+# Each handler extends the Base class to provide specific implementations of these two methods:
+#   handle_specifically(watched_file)
+#   update_existing_specifically(watched_file, sincedb_value)
+# ReadMode::Handlers module namespace
+#   contains the Handler classes that deals with Read mode file lifecycle "events".
+# The ReadMode::Handlers::Base
+#   handle(watched_file) - this method calls handle_specifically defined in a subclass
+#   handle_specifically(watched_file) - this is a noop method
+# Each handler extends the Base class to provide specific implementations of this method:
+#   handle_specifically(watched_file)
+
+module FileWatch
+  module ObservingBase
+    attr_reader :watch, :sincedb_collection, :settings
+
+    def initialize(opts={})
+      options = {
+        :sincedb_write_interval => 10,
+        :stat_interval => 1,
+        :discover_interval => 5,
+        :exclude => [],
+        :start_new_files_at => :end,
+        :delimiter => "\n",
+        :file_chunk_count => FIXNUM_MAX,
+        :file_sort_by => "last_modified",
+        :file_sort_direction => "asc",
+      }.merge(opts)
+      unless options.include?(:sincedb_path)
+        raise NoSinceDBPathGiven.new("No sincedb_path set in options. This should have been added in the main LogStash::Inputs::File class")
+      end
+      @settings = Settings.from_options(options)
+      build_watch_and_dependencies
+    end
+
+    def build_watch_and_dependencies
+      logger.info("START, creating Discoverer, Watch with file and sincedb collections")
+      watched_files_collection = WatchedFilesCollection.new(@settings)
+      @sincedb_collection = SincedbCollection.new(@settings)
+      @sincedb_collection.open
+      discoverer = Discoverer.new(watched_files_collection, @sincedb_collection, @settings)
+      @watch = Watch.new(discoverer, watched_files_collection, @settings)
+      @watch.add_processor build_specific_processor(@settings)
+    end
+
+    def watch_this(path)
+      @watch.watch(path)
+    end
+
+    def sincedb_write(reason=nil)
+      # can be invoked from the file input
+      @sincedb_collection.write(reason)
+    end
+
+    # quit is a sort-of finalizer,
+    # it should be called for clean up
+    # before the instance is disposed of.
+    def quit
+      logger.info("QUIT - closing all files and shutting down.")
+      @watch.quit # <-- should close all the files
+      # sincedb_write("shutting down")
+    end
+
+    # close_file(path) is to be used by external code
+    # when it knows that it is completely done with a file.
+    # Other files or folders may still be being watched.
+    # Caution, once unwatched, a file can't be watched again
+    # unless a new instance of this class begins watching again.
+    # The sysadmin should rename, move or delete the file.
+    def close_file(path)
+      @watch.unwatch(path)
+      sincedb_write
+    end
+  end
+end
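
Putting the external API together, a consumer (normally the file input plugin itself) drives one of the two Observing classes roughly as follows. This is a sketch only; `observer` stands for whatever supplies the listener callbacks (`accept`, `eof`, `error`, and so on) used by the handlers, which in this release is built in `data/lib/logstash/inputs/file_listener.rb`.

```ruby
# Consumer-side sketch of the ObservingBase API above; not code from this release.
watcher = FileWatch::ObservingRead.new(:sincedb_path => "/tmp/sincedb") # NoSinceDBPathGiven otherwise
watcher.watch_this("/var/log/archive/*.log")

watcher.subscribe(observer)              # runs discovery and reading until quit
watcher.sincedb_write("periodic flush")  # may be called by the input at any time
watcher.quit                             # finalizer: close files before disposal
```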

data/lib/filewatch/observing_read.rb
@@ -0,0 +1,23 @@
+# encoding: utf-8
+require "logstash/util/loggable"
+require_relative "read_mode/processor"
+
+module FileWatch
+  class ObservingRead
+    include LogStash::Util::Loggable
+    include ObservingBase
+
+    def subscribe(observer)
+      # observer here is the file input
+      watch.subscribe(observer, sincedb_collection)
+      sincedb_collection.write("read mode subscribe complete - shutting down")
+    end
+
+    private
+
+    def build_specific_processor(settings)
+      ReadMode::Processor.new(settings)
+    end
+
+  end
+end

data/lib/filewatch/observing_tail.rb
@@ -0,0 +1,22 @@
+# encoding: utf-8
+require "logstash/util/loggable"
+require_relative 'tail_mode/processor'
+
+module FileWatch
+  class ObservingTail
+    include LogStash::Util::Loggable
+    include ObservingBase
+
+    def subscribe(observer)
+      # observer here is the file input
+      watch.subscribe(observer, sincedb_collection)
+      sincedb_collection.write("tail mode subscribe complete - shutting down")
+    end
+
+    private
+
+    def build_specific_processor(settings)
+      TailMode::Processor.new(settings)
+    end
+  end
+end

data/lib/filewatch/read_mode/handlers/base.rb
@@ -0,0 +1,81 @@
+# encoding: utf-8
+require "logstash/util/loggable"
+
+module FileWatch module ReadMode module Handlers
+  class Base
+    include LogStash::Util::Loggable
+
+    attr_reader :sincedb_collection
+
+    def initialize(sincedb_collection, observer, settings)
+      @settings = settings
+      @sincedb_collection = sincedb_collection
+      @observer = observer
+    end
+
+    def handle(watched_file)
+      logger.debug("handling: #{watched_file.path}")
+      unless watched_file.has_listener?
+        watched_file.set_listener(@observer)
+      end
+      handle_specifically(watched_file)
+    end
+
+    def handle_specifically(watched_file)
+      # some handlers don't need to define this method
+    end
+
+    private
+
+    def open_file(watched_file)
+      return true if watched_file.file_open?
+      logger.debug("opening #{watched_file.path}")
+      begin
+        watched_file.open
+      rescue
+        # don't emit this message too often. if a file that we can't
+        # read is changing a lot, we'll try to open it more often, and spam the logs.
+        now = Time.now.to_i
+        logger.warn("opening OPEN_WARN_INTERVAL is '#{OPEN_WARN_INTERVAL}'")
+        if watched_file.last_open_warning_at.nil? || now - watched_file.last_open_warning_at > OPEN_WARN_INTERVAL
+          logger.warn("failed to open #{watched_file.path}: #{$!.inspect}, #{$!.backtrace.take(3)}")
+          watched_file.last_open_warning_at = now
+        else
+          logger.debug("suppressed warning for `failed to open` #{watched_file.path}: #{$!.inspect}")
+        end
+        watched_file.watch # set it back to watch so we can try it again
+      end
+      if watched_file.file_open?
+        watched_file.listener.opened
+        true
+      else
+        false
+      end
+    end
+
+    def add_or_update_sincedb_collection(watched_file)
+      sincedb_value = @sincedb_collection.find(watched_file)
+      if sincedb_value.nil?
+        add_new_value_sincedb_collection(watched_file)
+      elsif sincedb_value.watched_file == watched_file
+        update_existing_sincedb_collection_value(watched_file, sincedb_value)
+      else
+        logger.warn? && logger.warn("mismatch on sincedb_value.watched_file, this should have been handled by Discoverer")
+      end
+      watched_file.initial_completed
+    end
+
+    def update_existing_sincedb_collection_value(watched_file, sincedb_value)
+      logger.debug("update_existing_sincedb_collection_value: #{watched_file.path}, last value #{sincedb_value.position}, cur size #{watched_file.last_stat_size}")
+      # sincedb_value is the source of truth
+      watched_file.update_bytes_read(sincedb_value.position)
+    end
+
+    def add_new_value_sincedb_collection(watched_file)
+      sincedb_value = SincedbValue.new(0)
+      sincedb_value.set_watched_file(watched_file)
+      logger.debug("add_new_value_sincedb_collection: #{watched_file.path}", "position" => sincedb_value.position)
+      sincedb_collection.set(watched_file.sincedb_key, sincedb_value)
+    end
+  end
+end end end
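
Note the throttling in `open_file`: the full `failed to open` warning is emitted only if the previous warning for that file is older than `OPEN_WARN_INTERVAL`; otherwise it is downgraded to a debug message. The same pattern in isolation (generic sketch; the constant's real value lives in `bootstrap.rb` and is not shown in this excerpt):

```ruby
# Generic time-based warning throttle, mirroring the rescue branch above.
OPEN_WARN_INTERVAL = 300  # seconds - illustrative value only

def warn_throttled(state, message, logger)
  now = Time.now.to_i
  if state[:last_warned_at].nil? || now - state[:last_warned_at] > OPEN_WARN_INTERVAL
    logger.warn(message)
    state[:last_warned_at] = now
  else
    logger.debug("suppressed: #{message}")
  end
end
```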

data/lib/filewatch/read_mode/handlers/read_file.rb
@@ -0,0 +1,47 @@
+# encoding: utf-8
+
+module FileWatch module ReadMode module Handlers
+  class ReadFile < Base
+    def handle_specifically(watched_file)
+      if open_file(watched_file)
+        add_or_update_sincedb_collection(watched_file) unless sincedb_collection.member?(watched_file.sincedb_key)
+        # if the `file_chunk_count` * `file_chunk_size` is less than the file size
+        # then this method will be executed multiple times
+        # and the seek is moved to just after a line boundary as recorded in the sincedb
+        # for each run - so we reset the buffer
+        watched_file.reset_buffer
+        watched_file.file_seek(watched_file.bytes_read)
+        changed = false
+        @settings.file_chunk_count.times do
+          begin
+            lines = watched_file.buffer_extract(watched_file.file_read(@settings.file_chunk_size))
+            logger.warn("read_to_eof: no delimiter found in current chunk") if lines.empty?
+            changed = true
+            lines.each do |line|
+              watched_file.listener.accept(line)
+              sincedb_collection.increment(watched_file.sincedb_key, line.bytesize + @settings.delimiter_byte_size)
+            end
+          rescue EOFError
+            # flush the buffer now in case there is no final delimiter
+            line = watched_file.buffer.flush
+            watched_file.listener.accept(line) unless line.empty?
+            watched_file.listener.eof
+            watched_file.file_close
+            sincedb_collection.unset_watched_file(watched_file)
+            watched_file.listener.deleted
+            watched_file.unwatch
+            break
+          rescue Errno::EWOULDBLOCK, Errno::EINTR
+            watched_file.listener.error
+            break
+          rescue => e
+            logger.error("read_to_eof: general error reading #{watched_file.path} - error: #{e.inspect}")
+            watched_file.listener.error
+            break
+          end
+        end
+        sincedb_collection.request_disk_flush if changed
+      end
+    end
+  end
+end end end
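
The comment at the top of `handle_specifically` is the key sizing detail: one pass over a file reads at most `file_chunk_count * file_chunk_size` bytes before control returns to the processor, and the sincedb position advances per delivered line, so the next pass resumes just after a line boundary. A quick worked example (the 32 KiB chunk size is the documented default and an assumption here; the chunk count is a user-supplied cap):

```ruby
# Back-of-the-envelope: passes needed to read a file under a chunk-count cap.
file_size        = 1024 * 1024 * 1024  # 1 GiB
file_chunk_size  = 32 * 1024           # 32 KiB per read (assumed default)
file_chunk_count = 512                 # cap chosen for this example

bytes_per_pass = file_chunk_size * file_chunk_count   # => 16 MiB
passes = (file_size.to_f / bytes_per_pass).ceil       # => 64
```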

data/lib/filewatch/read_mode/handlers/read_zip_file.rb
@@ -0,0 +1,57 @@
+# encoding: utf-8
+require 'java'
+java_import java.io.InputStream
+java_import java.io.InputStreamReader
+java_import java.io.FileInputStream
+java_import java.io.BufferedReader
+java_import java.util.zip.GZIPInputStream
+java_import java.util.zip.ZipException
+
+module FileWatch module ReadMode module Handlers
+  class ReadZipFile < Base
+    def handle_specifically(watched_file)
+      add_or_update_sincedb_collection(watched_file) unless sincedb_collection.member?(watched_file.sincedb_key)
+      # can't really stripe read a zip file, its all or nothing.
+      watched_file.listener.opened
+      # what do we do about quit when we have just begun reading the zipped file (e.g. pipeline reloading)
+      # should we track lines read in the sincedb and
+      # fast forward through the lines until we reach unseen content?
+      # meaning that we can quit in the middle of a zip file
+      begin
+        file_stream = FileInputStream.new(watched_file.path)
+        gzip_stream = GZIPInputStream.new(file_stream)
+        decoder = InputStreamReader.new(gzip_stream, "UTF-8")
+        buffered = BufferedReader.new(decoder)
+        while (line = buffered.readLine(false))
+          watched_file.listener.accept(line)
+        end
+        watched_file.listener.eof
+      rescue ZipException => e
+        logger.error("Cannot decompress the gzip file at path: #{watched_file.path}")
+        watched_file.listener.error
+      else
+        sincedb_collection.store_last_read(watched_file.sincedb_key, watched_file.last_stat_size)
+        sincedb_collection.request_disk_flush
+        watched_file.listener.deleted
+        watched_file.unwatch
+      ensure
+        # rescue each close individually so all close attempts are tried
+        close_and_ignore_ioexception(buffered) unless buffered.nil?
+        close_and_ignore_ioexception(decoder) unless decoder.nil?
+        close_and_ignore_ioexception(gzip_stream) unless gzip_stream.nil?
+        close_and_ignore_ioexception(file_stream) unless file_stream.nil?
+      end
+      sincedb_collection.unset_watched_file(watched_file)
+    end
+
+    private
+
+    def close_and_ignore_ioexception(closeable)
+      begin
+        closeable.close
+      rescue Exception => e # IOException can be thrown by any of the Java classes that implement the Closable interface.
+        logger.warn("Ignoring an IOException when closing an instance of #{closeable.class.name}", "exception" => e)
+      end
+    end
+  end
+end end end