logstash-input-file 4.0.5 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -3
  3. data/JAR_VERSION +1 -0
  4. data/docs/index.asciidoc +195 -37
  5. data/lib/filewatch/bootstrap.rb +74 -0
  6. data/lib/filewatch/discoverer.rb +94 -0
  7. data/lib/filewatch/helper.rb +65 -0
  8. data/lib/filewatch/observing_base.rb +97 -0
  9. data/lib/filewatch/observing_read.rb +23 -0
  10. data/lib/filewatch/observing_tail.rb +22 -0
  11. data/lib/filewatch/read_mode/handlers/base.rb +81 -0
  12. data/lib/filewatch/read_mode/handlers/read_file.rb +47 -0
  13. data/lib/filewatch/read_mode/handlers/read_zip_file.rb +57 -0
  14. data/lib/filewatch/read_mode/processor.rb +117 -0
  15. data/lib/filewatch/settings.rb +67 -0
  16. data/lib/filewatch/sincedb_collection.rb +215 -0
  17. data/lib/filewatch/sincedb_record_serializer.rb +70 -0
  18. data/lib/filewatch/sincedb_value.rb +87 -0
  19. data/lib/filewatch/tail_mode/handlers/base.rb +124 -0
  20. data/lib/filewatch/tail_mode/handlers/create.rb +17 -0
  21. data/lib/filewatch/tail_mode/handlers/create_initial.rb +21 -0
  22. data/lib/filewatch/tail_mode/handlers/delete.rb +11 -0
  23. data/lib/filewatch/tail_mode/handlers/grow.rb +11 -0
  24. data/lib/filewatch/tail_mode/handlers/shrink.rb +20 -0
  25. data/lib/filewatch/tail_mode/handlers/timeout.rb +10 -0
  26. data/lib/filewatch/tail_mode/handlers/unignore.rb +37 -0
  27. data/lib/filewatch/tail_mode/processor.rb +209 -0
  28. data/lib/filewatch/watch.rb +107 -0
  29. data/lib/filewatch/watched_file.rb +226 -0
  30. data/lib/filewatch/watched_files_collection.rb +84 -0
  31. data/lib/filewatch/winhelper.rb +65 -0
  32. data/lib/jars/filewatch-1.0.0.jar +0 -0
  33. data/lib/logstash/inputs/delete_completed_file_handler.rb +9 -0
  34. data/lib/logstash/inputs/file.rb +162 -107
  35. data/lib/logstash/inputs/file_listener.rb +61 -0
  36. data/lib/logstash/inputs/log_completed_file_handler.rb +13 -0
  37. data/logstash-input-file.gemspec +5 -4
  38. data/spec/filewatch/buftok_spec.rb +24 -0
  39. data/spec/filewatch/reading_spec.rb +128 -0
  40. data/spec/filewatch/sincedb_record_serializer_spec.rb +71 -0
  41. data/spec/filewatch/spec_helper.rb +120 -0
  42. data/spec/filewatch/tailing_spec.rb +440 -0
  43. data/spec/filewatch/watched_file_spec.rb +38 -0
  44. data/spec/filewatch/watched_files_collection_spec.rb +73 -0
  45. data/spec/filewatch/winhelper_spec.rb +22 -0
  46. data/spec/fixtures/compressed.log.gz +0 -0
  47. data/spec/fixtures/compressed.log.gzip +0 -0
  48. data/spec/fixtures/invalid_utf8.gbk.log +2 -0
  49. data/spec/fixtures/no-final-newline.log +2 -0
  50. data/spec/fixtures/uncompressed.log +2 -0
  51. data/spec/{spec_helper.rb → helpers/spec_helper.rb} +14 -41
  52. data/spec/inputs/file_read_spec.rb +155 -0
  53. data/spec/inputs/{file_spec.rb → file_tail_spec.rb} +55 -52
  54. metadata +96 -28
Binary file
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
+ module LogStash module Inputs
4
+ class DeleteCompletedFileHandler
5
+ def handle(path)
6
+ Pathname.new(path).unlink rescue nil
7
+ end
8
+ end
9
+ end end
@@ -6,7 +6,12 @@ require "logstash/codecs/identity_map_codec"
6
6
  require "pathname"
7
7
  require "socket" # for Socket.gethostname
8
8
  require "fileutils"
9
+
9
10
  require_relative "file/patch"
11
+ require_relative "file_listener"
12
+ require_relative "delete_completed_file_handler"
13
+ require_relative "log_completed_file_handler"
14
+ require "filewatch/bootstrap"
10
15
 
11
16
  # Stream events from files, normally by tailing them in a manner
12
17
  # similar to `tail -0F` but optionally reading them from the
@@ -26,8 +31,8 @@ require_relative "file/patch"
26
31
  #
27
32
  # ==== Reading from remote network volumes
28
33
  #
29
- # The file input is not tested on remote filesystems such as NFS, Samba, s3fs-fuse, etc. These
30
- # remote filesystems typically have behaviors that are very different from local filesystems and
34
+ # The file input is not tested on remote filesystems such as NFS, Samba, s3fs-fuse, etc. These
35
+ # remote filesystems typically have behaviors that are very different from local filesystems and
31
36
  # are therefore unlikely to work correctly when used with the file input.
32
37
  #
33
38
  # ==== Tracking of current position in watched files
@@ -77,8 +82,8 @@ require_relative "file/patch"
77
82
  # to the rotation and its reopening under the new name (an interval
78
83
  # determined by the `stat_interval` and `discover_interval` options)
79
84
  # will not get picked up.
80
-
81
- class LogStash::Inputs::File < LogStash::Inputs::Base
85
+ module LogStash module Inputs
86
+ class File < LogStash::Inputs::Base
82
87
  config_name "file"
83
88
 
84
89
  # The path(s) to the file(s) to use as an input.
@@ -144,8 +149,7 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
144
149
 
145
150
  # The file input closes any files that were last read the specified
146
151
  # timespan in seconds ago.
147
- # This has different implications depending on if a file is being tailed or
148
- # read. If tailing, and there is a large time gap in incoming data the file
152
+ # If tailing, and there is a large time gap in incoming data the file
149
153
  # can be closed (allowing other files to be opened) but will be queued for
150
154
  # reopening when new data is detected. If reading, the file will be closed
151
155
  # after closed_older seconds from when the last bytes were read.
@@ -160,10 +164,66 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
160
164
  # The default of 4095 is set in filewatch.
161
165
  config :max_open_files, :validate => :number
162
166
 
167
+ # What mode do you want the file input to operate in.
168
+ # Tail a few files or read many content-complete files
169
+ # The default is tail
170
+ # If "read" is specified then the following other settings are ignored
171
+ # `start_position` (files are always read from the beginning)
172
+ # `delimiter` (files are assumed to use \n or \r (or both) as line endings)
173
+ # `close_older` (files are automatically 'closed' when EOF is reached)
174
+ # If "read" is specified then the following settings are heeded
175
+ # `ignore_older` (older files are not processed)
176
+ # "read" mode now supports gzip file processing
177
+ config :mode, :validate => [ "tail", "read"], :default => "tail"
178
+
179
+ # When in 'read' mode, what action should be carried out when a file is done with.
180
+ # If 'delete' is specified then the file will be deleted.
181
+ # If 'log' is specified then the full path of the file is logged to the file specified
182
+ # in the `file_completed_log_path` setting.
183
+ config :file_completed_action, :validate => ["delete", "log", "log_and_delete"], :default => "delete"
184
+
185
+ # Which file should the completely read file paths be appended to.
186
+ # Only specify this path to a file when `file_completed_action` is 'log' or 'log_and_delete'.
187
+ # IMPORTANT: this file is appended to only - it could become very large. You are responsible for file rotation.
188
+ config :file_completed_log_path, :validate => :string
189
+
190
+ # The sincedb entry now has a last active timestamp associated with it.
191
+ # If no changes are detected in tracked files in the last N days their sincedb
192
+ # tracking record will expire and not be persisted.
193
+ # This option protects against the well known inode recycling problem. (add reference)
194
+ config :sincedb_clean_after, :validate => :number, :default => 14 # days
195
+
196
+ # File content is read off disk in blocks or chunks, then using whatever the set delimiter
197
+ # is, lines are extracted from the chunk. Specify the size in bytes of each chunk.
198
+ # See `file_chunk_count` to see why and when to change this from the default.
199
+ # The default set internally is 32768 (32KB)
200
+ config :file_chunk_size, :validate => :number, :default => FileWatch::FILE_READ_SIZE
201
+
202
+ # When combined with the `file_chunk_size`, this option sets how many chunks
203
+ # are read from each file before moving to the next active file.
204
+ # e.g. a `chunk_count` of 32 with the default `file_chunk_size` will process
205
+ # 1MB from each active file. See the option `max_open_files` for more info.
206
+ # The default set internally is very large, 4611686018427387903. By default
207
+ # the file is read to the end before moving to the next active file.
208
+ config :file_chunk_count, :validate => :number, :default => FileWatch::FIXNUM_MAX
209
+
210
+ # Which attribute of a discovered file should be used to sort the discovered files.
211
+ # Files can be sort by modified date or full path alphabetic.
212
+ # The default is `last_modified`
213
+ # Previously the processing order of the discovered files was OS dependent.
214
+ config :file_sort_by, :validate => ["last_modified", "path"], :default => "last_modified"
215
+
216
+ # Choose between ascending and descending order when also choosing between
217
+ # `last_modified` and `path` file_sort_by options.
218
+ # If ingesting the newest data first is important then opt for last_modified + desc
219
+ # If ingesting the oldest data first is important then opt for last_modified + asc
220
+ # If you use a special naming convention for the file full paths then
221
+ # perhaps path + asc will help to achieve the goal of controlling the order of file ingestion
222
+ config :file_sort_direction, :validate => ["asc", "desc"], :default => "asc"
223
+
163
224
  public
164
225
  def register
165
226
  require "addressable/uri"
166
- require "filewatch/tail"
167
227
  require "digest/md5"
168
228
  @logger.trace("Registering file input", :path => @path)
169
229
  @host = Socket.gethostname.force_encoding(Encoding::UTF_8)
@@ -171,7 +231,7 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
171
231
  # won't in older versions of Logstash, then we need to set it to nil.
172
232
  settings = defined?(LogStash::SETTINGS) ? LogStash::SETTINGS : nil
173
233
 
174
- @tail_config = {
234
+ @filewatch_config = {
175
235
  :exclude => @exclude,
176
236
  :stat_interval => @stat_interval,
177
237
  :discover_interval => @discover_interval,
@@ -179,9 +239,16 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
179
239
  :delimiter => @delimiter,
180
240
  :ignore_older => @ignore_older,
181
241
  :close_older => @close_older,
182
- :max_open_files => @max_open_files
242
+ :max_open_files => @max_open_files,
243
+ :sincedb_clean_after => @sincedb_clean_after,
244
+ :file_chunk_count => @file_chunk_count,
245
+ :file_chunk_size => @file_chunk_size,
246
+ :file_sort_by => @file_sort_by,
247
+ :file_sort_direction => @file_sort_direction,
183
248
  }
184
249
 
250
+ @completed_file_handlers = []
251
+
185
252
  @path.each do |path|
186
253
  if Pathname.new(path).relative?
187
254
  raise ArgumentError.new("File paths must be absolute, relative path specified: #{path}")
@@ -189,132 +256,84 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
189
256
  end
190
257
 
191
258
  if @sincedb_path.nil?
192
- if settings
193
- datapath = File.join(settings.get_value("path.data"), "plugins", "inputs", "file")
194
- # Ensure that the filepath exists before writing, since it's deeply nested.
195
- FileUtils::mkdir_p datapath
196
- @sincedb_path = File.join(datapath, ".sincedb_" + Digest::MD5.hexdigest(@path.join(",")))
259
+ base_sincedb_path = build_sincedb_base_from_settings(settings) || build_sincedb_base_from_env
260
+ @sincedb_path = build_random_sincedb_filename(base_sincedb_path)
261
+ @logger.info('No sincedb_path set, generating one based on the "path" setting', :sincedb_path => @sincedb_path.to_s, :path => @path)
262
+ else
263
+ @sincedb_path = Pathname.new(@sincedb_path)
264
+ if @sincedb_path.directory?
265
+ raise ArgumentError.new("The \"sincedb_path\" argument must point to a file, received a directory: \"#{@sincedb_path}\"")
197
266
  end
198
267
  end
199
268
 
200
- # This section is going to be deprecated eventually, as path.data will be
201
- # the default, not an environment variable (SINCEDB_DIR or HOME)
202
- if @sincedb_path.nil? # If it is _still_ nil...
203
- if ENV["SINCEDB_DIR"].nil? && ENV["HOME"].nil?
204
- @logger.error("No SINCEDB_DIR or HOME environment variable set, I don't know where " \
205
- "to keep track of the files I'm watching. Either set " \
206
- "HOME or SINCEDB_DIR in your environment, or set sincedb_path in " \
207
- "in your Logstash config for the file input with " \
208
- "path '#{@path.inspect}'")
209
- raise # TODO(sissel): HOW DO I FAIL PROPERLY YO
210
- end
211
-
212
- #pick SINCEDB_DIR if available, otherwise use HOME
213
- sincedb_dir = ENV["SINCEDB_DIR"] || ENV["HOME"]
214
-
215
- # Join by ',' to make it easy for folks to know their own sincedb
216
- # generated path (vs, say, inspecting the @path array)
217
- @sincedb_path = File.join(sincedb_dir, ".sincedb_" + Digest::MD5.hexdigest(@path.join(",")))
218
-
219
- # Migrate any old .sincedb to the new file (this is for version <=1.1.1 compatibility)
220
- old_sincedb = File.join(sincedb_dir, ".sincedb")
221
- if File.exists?(old_sincedb)
222
- @logger.debug("Renaming old ~/.sincedb to new one", :old => old_sincedb,
223
- :new => @sincedb_path)
224
- File.rename(old_sincedb, @sincedb_path)
269
+ @filewatch_config[:sincedb_path] = @sincedb_path
270
+
271
+ @filewatch_config[:start_new_files_at] = @start_position.to_sym
272
+
273
+ if @file_completed_action.include?('log')
274
+ if @file_completed_log_path.nil?
275
+ raise ArgumentError.new('The "file_completed_log_path" setting must be provided when the "file_completed_action" is set to "log" or "log_and_delete"')
276
+ else
277
+ @file_completed_log_path = Pathname.new(@file_completed_log_path)
278
+ unless @file_completed_log_path.exist?
279
+ begin
280
+ FileUtils.touch(@file_completed_log_path)
281
+ rescue
282
+ raise ArgumentError.new("The \"file_completed_log_path\" file can't be created: #{@file_completed_log_path}")
283
+ end
284
+ end
225
285
  end
226
-
227
- @logger.info("No sincedb_path set, generating one based on the file path",
228
- :sincedb_path => @sincedb_path, :path => @path)
229
- end
230
-
231
- if File.directory?(@sincedb_path)
232
- raise ArgumentError.new("The \"sincedb_path\" argument must point to a file, received a directory: \"#{@sincedb_path}\"")
233
286
  end
234
287
 
235
- @tail_config[:sincedb_path] = @sincedb_path
236
-
237
- if @start_position == "beginning"
238
- @tail_config[:start_new_files_at] = :beginning
288
+ if tail_mode?
289
+ @watcher_class = FileWatch::ObservingTail
290
+ else
291
+ @watcher_class = FileWatch::ObservingRead
292
+ if @file_completed_action.include?('log')
293
+ @completed_file_handlers << LogCompletedFileHandler.new(@file_completed_log_path)
294
+ end
295
+ if @file_completed_action.include?('delete')
296
+ @completed_file_handlers << DeleteCompletedFileHandler.new
297
+ end
239
298
  end
240
-
241
299
  @codec = LogStash::Codecs::IdentityMapCodec.new(@codec)
242
300
  end # def register
243
301
 
244
- class ListenerTail
245
- # use attr_reader to define noop methods
246
- attr_reader :input, :path, :data
247
- attr_reader :deleted, :created, :error, :eof
248
-
249
- # construct with upstream state
250
- def initialize(path, input)
251
- @path, @input = path, input
252
- end
253
-
254
- def timed_out
255
- input.codec.evict(path)
256
- end
257
-
258
- def accept(data)
259
- # and push transient data filled dup listener downstream
260
- input.log_line_received(path, data)
261
- input.codec.accept(dup_adding_state(data))
262
- end
263
-
264
- def process_event(event)
265
- event.set("[@metadata][path]", path)
266
- event.set("path", path) if !event.include?("path")
267
- input.post_process_this(event)
268
- end
269
-
270
- def add_state(data)
271
- @data = data
272
- self
273
- end
274
-
275
- private
276
-
277
- # duplicate and add state for downstream
278
- def dup_adding_state(line)
279
- self.class.new(path, input).add_state(line)
280
- end
281
- end
282
-
283
- class FlushableListener < ListenerTail
284
- attr_writer :path
285
- end
286
-
287
302
  def listener_for(path)
288
303
  # path is the identity
289
- ListenerTail.new(path, self)
304
+ FileListener.new(path, self)
290
305
  end
291
306
 
292
- def begin_tailing
307
+ def start_processing
293
308
  # if the pipeline restarts this input,
294
309
  # make sure previous files are closed
295
310
  stop
296
- # use observer listener api
297
- @tail = FileWatch::Tail.new_observing(@tail_config)
298
- @tail.logger = @logger
299
- @path.each { |path| @tail.tail(path) }
311
+ @watcher = @watcher_class.new(@filewatch_config)
312
+ @path.each { |path| @watcher.watch_this(path) }
300
313
  end
301
314
 
302
315
  def run(queue)
303
- begin_tailing
316
+ start_processing
304
317
  @queue = queue
305
- @tail.subscribe(self)
318
+ @watcher.subscribe(self) # halts here until quit is called
306
319
  exit_flush
307
320
  end # def run
308
321
 
309
322
  def post_process_this(event)
310
323
  event.set("[@metadata][host]", @host)
311
- event.set("host", @host) if !event.include?("host")
324
+ event.set("host", @host) unless event.include?("host")
312
325
  decorate(event)
313
326
  @queue << event
314
327
  end
315
328
 
329
+ def handle_deletable_path(path)
330
+ return if tail_mode?
331
+ return if @completed_file_handlers.empty?
332
+ @completed_file_handlers.each { |handler| handler.handle(path) }
333
+ end
334
+
316
335
  def log_line_received(path, line)
317
- return if !@logger.debug?
336
+ return unless @logger.debug?
318
337
  @logger.debug("Received line", :path => path, :text => line)
319
338
  end
320
339
 
@@ -322,14 +341,50 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
322
341
  # in filewatch >= 0.6.7, quit will closes and forget all files
323
342
  # but it will write their last read positions to since_db
324
343
  # beforehand
325
- if @tail
344
+ if @watcher
326
345
  @codec.close
327
- @tail.quit
346
+ @watcher.quit
328
347
  end
329
348
  end
330
349
 
331
350
  private
332
351
 
352
+ def build_sincedb_base_from_settings(settings)
353
+ logstash_data_path = settings.get_value("path.data")
354
+ Pathname.new(logstash_data_path).join("plugins", "inputs", "file").tap do |path|
355
+ # Ensure that the filepath exists before writing, since it's deeply nested.
356
+ path.mkpath
357
+ end
358
+ end
359
+
360
+ def build_sincedb_base_from_env
361
+ # This section is going to be deprecated eventually, as path.data will be
362
+ # the default, not an environment variable (SINCEDB_DIR or LOGSTASH_HOME)
363
+ if ENV["SINCEDB_DIR"].nil? && ENV["LOGSTASH_HOME"].nil?
364
+ @logger.error("No SINCEDB_DIR or LOGSTASH_HOME environment variable set, I don't know where " \
365
+ "to keep track of the files I'm watching. Either set " \
366
+ "LOGSTASH_HOME or SINCEDB_DIR in your environment, or set sincedb_path in " \
367
+ "in your Logstash config for the file input with " \
368
+ "path '#{@path.inspect}'")
369
+ raise ArgumentError.new('The "sincedb_path" setting was not given and the environment variables "SINCEDB_DIR" or "LOGSTASH_HOME" are not set so we cannot build a file path for the sincedb')
370
+ end
371
+ Pathname.new(ENV["SINCEDB_DIR"] || ENV["LOGSTASH_HOME"])
372
+ end
373
+
374
+ def build_random_sincedb_filename(pathname)
375
+ # Join by ',' to make it easy for folks to know their own sincedb
376
+ # generated path (vs, say, inspecting the @path array)
377
+ pathname.join(".sincedb_" + Digest::MD5.hexdigest(@path.join(",")))
378
+ end
379
+
380
+ def tail_mode?
381
+ @mode == "tail"
382
+ end
383
+
384
+ def read_mode?
385
+ !tail_mode?
386
+ end
387
+
333
388
  def exit_flush
334
389
  listener = FlushableListener.new("none", self)
335
390
  if @codec.identity_count.zero?
@@ -345,4 +400,4 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
345
400
  @codec.flush_mapped(listener)
346
401
  end
347
402
  end
348
- end # class LogStash::Inputs::File
403
+ end end end# class LogStash::Inputs::File
@@ -0,0 +1,61 @@
1
+ # encoding: utf-8
2
+
3
+ module LogStash module Inputs
4
+ # As and when a new WatchedFile is processed FileWatch asks for an instance of this class for the
5
+ # file path of that WatchedFile. All subsequent callbacks are sent via this listener instance.
6
+ # The file is essentially a stream and the path is the identity of that stream.
7
+ class FileListener
8
+ attr_reader :input, :path, :data
9
+ # construct with link back to the input plugin instance.
10
+ def initialize(path, input)
11
+ @path, @input = path, input
12
+ @data = nil
13
+ end
14
+
15
+ def opened
16
+ end
17
+
18
+ def eof
19
+ end
20
+
21
+ def error
22
+ end
23
+
24
+ def timed_out
25
+ input.codec.evict(path)
26
+ end
27
+
28
+ def deleted
29
+ input.codec.evict(path)
30
+ input.handle_deletable_path(path)
31
+ end
32
+
33
+ def accept(data)
34
+ # and push transient data filled dup listener downstream
35
+ input.log_line_received(path, data)
36
+ input.codec.accept(dup_adding_state(data))
37
+ end
38
+
39
+ def process_event(event)
40
+ event.set("[@metadata][path]", path)
41
+ event.set("path", path) unless event.include?("path")
42
+ input.post_process_this(event)
43
+ end
44
+
45
+ def add_state(data)
46
+ @data = data
47
+ self
48
+ end
49
+
50
+ private
51
+
52
+ # duplicate and add state for downstream
53
+ def dup_adding_state(line)
54
+ self.class.new(path, input).add_state(line)
55
+ end
56
+ end
57
+
58
+ class FlushableListener < FileListener
59
+ attr_writer :path
60
+ end
61
+ end end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+
3
+ module LogStash module Inputs
4
+ class LogCompletedFileHandler
5
+ def initialize(log_completed_file_path)
6
+ @log_completed_file_path = Pathname.new(log_completed_file_path)
7
+ end
8
+
9
+ def handle(path)
10
+ @log_completed_file_path.open("a") { |fd| fd.puts(path) }
11
+ end
12
+ end
13
+ end end