logstash-input-file 4.0.5 → 4.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +25 -3
  3. data/JAR_VERSION +1 -0
  4. data/docs/index.asciidoc +195 -37
  5. data/lib/filewatch/bootstrap.rb +74 -0
  6. data/lib/filewatch/discoverer.rb +94 -0
  7. data/lib/filewatch/helper.rb +65 -0
  8. data/lib/filewatch/observing_base.rb +97 -0
  9. data/lib/filewatch/observing_read.rb +23 -0
  10. data/lib/filewatch/observing_tail.rb +22 -0
  11. data/lib/filewatch/read_mode/handlers/base.rb +81 -0
  12. data/lib/filewatch/read_mode/handlers/read_file.rb +47 -0
  13. data/lib/filewatch/read_mode/handlers/read_zip_file.rb +57 -0
  14. data/lib/filewatch/read_mode/processor.rb +117 -0
  15. data/lib/filewatch/settings.rb +67 -0
  16. data/lib/filewatch/sincedb_collection.rb +215 -0
  17. data/lib/filewatch/sincedb_record_serializer.rb +70 -0
  18. data/lib/filewatch/sincedb_value.rb +87 -0
  19. data/lib/filewatch/tail_mode/handlers/base.rb +124 -0
  20. data/lib/filewatch/tail_mode/handlers/create.rb +17 -0
  21. data/lib/filewatch/tail_mode/handlers/create_initial.rb +21 -0
  22. data/lib/filewatch/tail_mode/handlers/delete.rb +11 -0
  23. data/lib/filewatch/tail_mode/handlers/grow.rb +11 -0
  24. data/lib/filewatch/tail_mode/handlers/shrink.rb +20 -0
  25. data/lib/filewatch/tail_mode/handlers/timeout.rb +10 -0
  26. data/lib/filewatch/tail_mode/handlers/unignore.rb +37 -0
  27. data/lib/filewatch/tail_mode/processor.rb +209 -0
  28. data/lib/filewatch/watch.rb +107 -0
  29. data/lib/filewatch/watched_file.rb +226 -0
  30. data/lib/filewatch/watched_files_collection.rb +84 -0
  31. data/lib/filewatch/winhelper.rb +65 -0
  32. data/lib/jars/filewatch-1.0.0.jar +0 -0
  33. data/lib/logstash/inputs/delete_completed_file_handler.rb +9 -0
  34. data/lib/logstash/inputs/file.rb +162 -107
  35. data/lib/logstash/inputs/file_listener.rb +61 -0
  36. data/lib/logstash/inputs/log_completed_file_handler.rb +13 -0
  37. data/logstash-input-file.gemspec +5 -4
  38. data/spec/filewatch/buftok_spec.rb +24 -0
  39. data/spec/filewatch/reading_spec.rb +128 -0
  40. data/spec/filewatch/sincedb_record_serializer_spec.rb +71 -0
  41. data/spec/filewatch/spec_helper.rb +120 -0
  42. data/spec/filewatch/tailing_spec.rb +440 -0
  43. data/spec/filewatch/watched_file_spec.rb +38 -0
  44. data/spec/filewatch/watched_files_collection_spec.rb +73 -0
  45. data/spec/filewatch/winhelper_spec.rb +22 -0
  46. data/spec/fixtures/compressed.log.gz +0 -0
  47. data/spec/fixtures/compressed.log.gzip +0 -0
  48. data/spec/fixtures/invalid_utf8.gbk.log +2 -0
  49. data/spec/fixtures/no-final-newline.log +2 -0
  50. data/spec/fixtures/uncompressed.log +2 -0
  51. data/spec/{spec_helper.rb → helpers/spec_helper.rb} +14 -41
  52. data/spec/inputs/file_read_spec.rb +155 -0
  53. data/spec/inputs/{file_spec.rb → file_tail_spec.rb} +55 -52
  54. metadata +96 -28
Binary file
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
+ module LogStash module Inputs
4
+ class DeleteCompletedFileHandler
5
+ def handle(path)
6
+ Pathname.new(path).unlink rescue nil
7
+ end
8
+ end
9
+ end end
@@ -6,7 +6,12 @@ require "logstash/codecs/identity_map_codec"
6
6
  require "pathname"
7
7
  require "socket" # for Socket.gethostname
8
8
  require "fileutils"
9
+
9
10
  require_relative "file/patch"
11
+ require_relative "file_listener"
12
+ require_relative "delete_completed_file_handler"
13
+ require_relative "log_completed_file_handler"
14
+ require "filewatch/bootstrap"
10
15
 
11
16
  # Stream events from files, normally by tailing them in a manner
12
17
  # similar to `tail -0F` but optionally reading them from the
@@ -26,8 +31,8 @@ require_relative "file/patch"
26
31
  #
27
32
  # ==== Reading from remote network volumes
28
33
  #
29
- # The file input is not tested on remote filesystems such as NFS, Samba, s3fs-fuse, etc. These
30
- # remote filesystems typically have behaviors that are very different from local filesystems and
34
+ # The file input is not tested on remote filesystems such as NFS, Samba, s3fs-fuse, etc. These
35
+ # remote filesystems typically have behaviors that are very different from local filesystems and
31
36
  # are therefore unlikely to work correctly when used with the file input.
32
37
  #
33
38
  # ==== Tracking of current position in watched files
@@ -77,8 +82,8 @@ require_relative "file/patch"
77
82
  # to the rotation and its reopening under the new name (an interval
78
83
  # determined by the `stat_interval` and `discover_interval` options)
79
84
  # will not get picked up.
80
-
81
- class LogStash::Inputs::File < LogStash::Inputs::Base
85
+ module LogStash module Inputs
86
+ class File < LogStash::Inputs::Base
82
87
  config_name "file"
83
88
 
84
89
  # The path(s) to the file(s) to use as an input.
@@ -144,8 +149,7 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
144
149
 
145
150
  # The file input closes any files that were last read the specified
146
151
  # timespan in seconds ago.
147
- # This has different implications depending on if a file is being tailed or
148
- # read. If tailing, and there is a large time gap in incoming data the file
152
+ # If tailing, and there is a large time gap in incoming data the file
149
153
  # can be closed (allowing other files to be opened) but will be queued for
150
154
  # reopening when new data is detected. If reading, the file will be closed
151
155
  # after closed_older seconds from when the last bytes were read.
@@ -160,10 +164,66 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
160
164
  # The default of 4095 is set in filewatch.
161
165
  config :max_open_files, :validate => :number
162
166
 
167
+ # What mode do you want the file input to operate in.
168
+ # Tail a few files or read many content-complete files
169
+ # The default is tail
170
+ # If "read" is specified then the following other settings are ignored
171
+ # `start_position` (files are always read from the beginning)
172
+ # `delimiter` (files are assumed to use \n or \r (or both) as line endings)
173
+ # `close_older` (files are automatically 'closed' when EOF is reached)
174
+ # If "read" is specified then the following settings are heeded
175
+ # `ignore_older` (older files are not processed)
176
+ # "read" mode now supports gzip file processing
177
+ config :mode, :validate => [ "tail", "read"], :default => "tail"
178
+
179
+ # When in 'read' mode, what action should be carried out when a file is completely read.
180
+ # If 'delete' is specified then the file will be deleted.
181
+ # If 'log' is specified then the full path of the file is logged to the file specified
182
+ # in the `file_completed_log_path` setting.
183
+ config :file_completed_action, :validate => ["delete", "log", "log_and_delete"], :default => "delete"
184
+
185
+ # Which file should the completely read file paths be appended to.
186
+ # Only specify this path to a file when `file_completed_action` is 'log' or 'log_and_delete'.
187
+ # IMPORTANT: this file is appended to only - it could become very large. You are responsible for file rotation.
188
+ config :file_completed_log_path, :validate => :string
189
+
190
+ # The sincedb entry now has a last active timestamp associated with it.
191
+ # If no changes are detected in tracked files in the last N days their sincedb
192
+ # tracking record will expire and not be persisted.
193
+ # This option protects against the well known inode recycling problem. (add reference)
194
+ config :sincedb_clean_after, :validate => :number, :default => 14 # days
195
+
196
+ # File content is read off disk in blocks or chunks, then using whatever the set delimiter
197
+ # is, lines are extracted from the chunk. Specify the size in bytes of each chunk.
198
+ # See `file_chunk_count` to see why and when to change this from the default.
199
+ # The default set internally is 32768 (32KB)
200
+ config :file_chunk_size, :validate => :number, :default => FileWatch::FILE_READ_SIZE
201
+
202
+ # When combined with the `file_chunk_size`, this option sets how many chunks
203
+ # are read from each file before moving to the next active file.
204
+ # e.g. a `chunk_count` of 32 with the default `file_chunk_size` will process
205
+ # 1MB from each active file. See the option `max_open_files` for more info.
206
+ # The default set internally is very large, 4611686018427387903. By default
207
+ # the file is read to the end before moving to the next active file.
208
+ config :file_chunk_count, :validate => :number, :default => FileWatch::FIXNUM_MAX
209
+
210
+ # Which attribute of a discovered file should be used to sort the discovered files.
211
+ # Files can be sorted by last modified date or by full path in alphabetical order.
212
+ # The default is `last_modified`
213
+ # Previously the processing order of the discovered files was OS dependent.
214
+ config :file_sort_by, :validate => ["last_modified", "path"], :default => "last_modified"
215
+
216
+ # Choose between ascending and descending order when also choosing between
217
+ # `last_modified` and `path` file_sort_by options.
218
+ # If ingesting the newest data first is important then opt for last_modified + desc
219
+ # If ingesting the oldest data first is important then opt for last_modified + asc
220
+ # If you use a special naming convention for the file full paths then
221
+ # perhaps path + asc will help to achieve the goal of controlling the order of file ingestion
222
+ config :file_sort_direction, :validate => ["asc", "desc"], :default => "asc"
223
+
163
224
  public
164
225
  def register
165
226
  require "addressable/uri"
166
- require "filewatch/tail"
167
227
  require "digest/md5"
168
228
  @logger.trace("Registering file input", :path => @path)
169
229
  @host = Socket.gethostname.force_encoding(Encoding::UTF_8)
@@ -171,7 +231,7 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
171
231
  # won't in older versions of Logstash, then we need to set it to nil.
172
232
  settings = defined?(LogStash::SETTINGS) ? LogStash::SETTINGS : nil
173
233
 
174
- @tail_config = {
234
+ @filewatch_config = {
175
235
  :exclude => @exclude,
176
236
  :stat_interval => @stat_interval,
177
237
  :discover_interval => @discover_interval,
@@ -179,9 +239,16 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
179
239
  :delimiter => @delimiter,
180
240
  :ignore_older => @ignore_older,
181
241
  :close_older => @close_older,
182
- :max_open_files => @max_open_files
242
+ :max_open_files => @max_open_files,
243
+ :sincedb_clean_after => @sincedb_clean_after,
244
+ :file_chunk_count => @file_chunk_count,
245
+ :file_chunk_size => @file_chunk_size,
246
+ :file_sort_by => @file_sort_by,
247
+ :file_sort_direction => @file_sort_direction,
183
248
  }
184
249
 
250
+ @completed_file_handlers = []
251
+
185
252
  @path.each do |path|
186
253
  if Pathname.new(path).relative?
187
254
  raise ArgumentError.new("File paths must be absolute, relative path specified: #{path}")
@@ -189,132 +256,84 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
189
256
  end
190
257
 
191
258
  if @sincedb_path.nil?
192
- if settings
193
- datapath = File.join(settings.get_value("path.data"), "plugins", "inputs", "file")
194
- # Ensure that the filepath exists before writing, since it's deeply nested.
195
- FileUtils::mkdir_p datapath
196
- @sincedb_path = File.join(datapath, ".sincedb_" + Digest::MD5.hexdigest(@path.join(",")))
259
+ base_sincedb_path = build_sincedb_base_from_settings(settings) || build_sincedb_base_from_env
260
+ @sincedb_path = build_random_sincedb_filename(base_sincedb_path)
261
+ @logger.info('No sincedb_path set, generating one based on the "path" setting', :sincedb_path => @sincedb_path.to_s, :path => @path)
262
+ else
263
+ @sincedb_path = Pathname.new(@sincedb_path)
264
+ if @sincedb_path.directory?
265
+ raise ArgumentError.new("The \"sincedb_path\" argument must point to a file, received a directory: \"#{@sincedb_path}\"")
197
266
  end
198
267
  end
199
268
 
200
- # This section is going to be deprecated eventually, as path.data will be
201
- # the default, not an environment variable (SINCEDB_DIR or HOME)
202
- if @sincedb_path.nil? # If it is _still_ nil...
203
- if ENV["SINCEDB_DIR"].nil? && ENV["HOME"].nil?
204
- @logger.error("No SINCEDB_DIR or HOME environment variable set, I don't know where " \
205
- "to keep track of the files I'm watching. Either set " \
206
- "HOME or SINCEDB_DIR in your environment, or set sincedb_path in " \
207
- "in your Logstash config for the file input with " \
208
- "path '#{@path.inspect}'")
209
- raise # TODO(sissel): HOW DO I FAIL PROPERLY YO
210
- end
211
-
212
- #pick SINCEDB_DIR if available, otherwise use HOME
213
- sincedb_dir = ENV["SINCEDB_DIR"] || ENV["HOME"]
214
-
215
- # Join by ',' to make it easy for folks to know their own sincedb
216
- # generated path (vs, say, inspecting the @path array)
217
- @sincedb_path = File.join(sincedb_dir, ".sincedb_" + Digest::MD5.hexdigest(@path.join(",")))
218
-
219
- # Migrate any old .sincedb to the new file (this is for version <=1.1.1 compatibility)
220
- old_sincedb = File.join(sincedb_dir, ".sincedb")
221
- if File.exists?(old_sincedb)
222
- @logger.debug("Renaming old ~/.sincedb to new one", :old => old_sincedb,
223
- :new => @sincedb_path)
224
- File.rename(old_sincedb, @sincedb_path)
269
+ @filewatch_config[:sincedb_path] = @sincedb_path
270
+
271
+ @filewatch_config[:start_new_files_at] = @start_position.to_sym
272
+
273
+ if @file_completed_action.include?('log')
274
+ if @file_completed_log_path.nil?
275
+ raise ArgumentError.new('The "file_completed_log_path" setting must be provided when the "file_completed_action" is set to "log" or "log_and_delete"')
276
+ else
277
+ @file_completed_log_path = Pathname.new(@file_completed_log_path)
278
+ unless @file_completed_log_path.exist?
279
+ begin
280
+ FileUtils.touch(@file_completed_log_path)
281
+ rescue
282
+ raise ArgumentError.new("The \"file_completed_log_path\" file can't be created: #{@file_completed_log_path}")
283
+ end
284
+ end
225
285
  end
226
-
227
- @logger.info("No sincedb_path set, generating one based on the file path",
228
- :sincedb_path => @sincedb_path, :path => @path)
229
- end
230
-
231
- if File.directory?(@sincedb_path)
232
- raise ArgumentError.new("The \"sincedb_path\" argument must point to a file, received a directory: \"#{@sincedb_path}\"")
233
286
  end
234
287
 
235
- @tail_config[:sincedb_path] = @sincedb_path
236
-
237
- if @start_position == "beginning"
238
- @tail_config[:start_new_files_at] = :beginning
288
+ if tail_mode?
289
+ @watcher_class = FileWatch::ObservingTail
290
+ else
291
+ @watcher_class = FileWatch::ObservingRead
292
+ if @file_completed_action.include?('log')
293
+ @completed_file_handlers << LogCompletedFileHandler.new(@file_completed_log_path)
294
+ end
295
+ if @file_completed_action.include?('delete')
296
+ @completed_file_handlers << DeleteCompletedFileHandler.new
297
+ end
239
298
  end
240
-
241
299
  @codec = LogStash::Codecs::IdentityMapCodec.new(@codec)
242
300
  end # def register
243
301
 
244
- class ListenerTail
245
- # use attr_reader to define noop methods
246
- attr_reader :input, :path, :data
247
- attr_reader :deleted, :created, :error, :eof
248
-
249
- # construct with upstream state
250
- def initialize(path, input)
251
- @path, @input = path, input
252
- end
253
-
254
- def timed_out
255
- input.codec.evict(path)
256
- end
257
-
258
- def accept(data)
259
- # and push transient data filled dup listener downstream
260
- input.log_line_received(path, data)
261
- input.codec.accept(dup_adding_state(data))
262
- end
263
-
264
- def process_event(event)
265
- event.set("[@metadata][path]", path)
266
- event.set("path", path) if !event.include?("path")
267
- input.post_process_this(event)
268
- end
269
-
270
- def add_state(data)
271
- @data = data
272
- self
273
- end
274
-
275
- private
276
-
277
- # duplicate and add state for downstream
278
- def dup_adding_state(line)
279
- self.class.new(path, input).add_state(line)
280
- end
281
- end
282
-
283
- class FlushableListener < ListenerTail
284
- attr_writer :path
285
- end
286
-
287
302
  def listener_for(path)
288
303
  # path is the identity
289
- ListenerTail.new(path, self)
304
+ FileListener.new(path, self)
290
305
  end
291
306
 
292
- def begin_tailing
307
+ def start_processing
293
308
  # if the pipeline restarts this input,
294
309
  # make sure previous files are closed
295
310
  stop
296
- # use observer listener api
297
- @tail = FileWatch::Tail.new_observing(@tail_config)
298
- @tail.logger = @logger
299
- @path.each { |path| @tail.tail(path) }
311
+ @watcher = @watcher_class.new(@filewatch_config)
312
+ @path.each { |path| @watcher.watch_this(path) }
300
313
  end
301
314
 
302
315
  def run(queue)
303
- begin_tailing
316
+ start_processing
304
317
  @queue = queue
305
- @tail.subscribe(self)
318
+ @watcher.subscribe(self) # halts here until quit is called
306
319
  exit_flush
307
320
  end # def run
308
321
 
309
322
  def post_process_this(event)
310
323
  event.set("[@metadata][host]", @host)
311
- event.set("host", @host) if !event.include?("host")
324
+ event.set("host", @host) unless event.include?("host")
312
325
  decorate(event)
313
326
  @queue << event
314
327
  end
315
328
 
329
+ def handle_deletable_path(path)
330
+ return if tail_mode?
331
+ return if @completed_file_handlers.empty?
332
+ @completed_file_handlers.each { |handler| handler.handle(path) }
333
+ end
334
+
316
335
  def log_line_received(path, line)
317
- return if !@logger.debug?
336
+ return unless @logger.debug?
318
337
  @logger.debug("Received line", :path => path, :text => line)
319
338
  end
320
339
 
@@ -322,14 +341,50 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
322
341
  # in filewatch >= 0.6.7, quit will close and forget all files
323
342
  # but it will write their last read positions to since_db
324
343
  # beforehand
325
- if @tail
344
+ if @watcher
326
345
  @codec.close
327
- @tail.quit
346
+ @watcher.quit
328
347
  end
329
348
  end
330
349
 
331
350
  private
332
351
 
352
+ def build_sincedb_base_from_settings(settings)
353
+ logstash_data_path = settings.get_value("path.data")
354
+ Pathname.new(logstash_data_path).join("plugins", "inputs", "file").tap do |path|
355
+ # Ensure that the filepath exists before writing, since it's deeply nested.
356
+ path.mkpath
357
+ end
358
+ end
359
+
360
+ def build_sincedb_base_from_env
361
+ # This section is going to be deprecated eventually, as path.data will be
362
+ # the default, not an environment variable (SINCEDB_DIR or LOGSTASH_HOME)
363
+ if ENV["SINCEDB_DIR"].nil? && ENV["LOGSTASH_HOME"].nil?
364
+ @logger.error("No SINCEDB_DIR or LOGSTASH_HOME environment variable set, I don't know where " \
365
+ "to keep track of the files I'm watching. Either set " \
366
+ "LOGSTASH_HOME or SINCEDB_DIR in your environment, or set sincedb_path in " \
367
+ "in your Logstash config for the file input with " \
368
+ "path '#{@path.inspect}'")
369
+ raise ArgumentError.new('The "sincedb_path" setting was not given and the environment variables "SINCEDB_DIR" or "LOGSTASH_HOME" are not set so we cannot build a file path for the sincedb')
370
+ end
371
+ Pathname.new(ENV["SINCEDB_DIR"] || ENV["LOGSTASH_HOME"])
372
+ end
373
+
374
+ def build_random_sincedb_filename(pathname)
375
+ # Join by ',' to make it easy for folks to know their own sincedb
376
+ # generated path (vs, say, inspecting the @path array)
377
+ pathname.join(".sincedb_" + Digest::MD5.hexdigest(@path.join(",")))
378
+ end
379
+
380
+ def tail_mode?
381
+ @mode == "tail"
382
+ end
383
+
384
+ def read_mode?
385
+ !tail_mode?
386
+ end
387
+
333
388
  def exit_flush
334
389
  listener = FlushableListener.new("none", self)
335
390
  if @codec.identity_count.zero?
@@ -345,4 +400,4 @@ class LogStash::Inputs::File < LogStash::Inputs::Base
345
400
  @codec.flush_mapped(listener)
346
401
  end
347
402
  end
348
- end # class LogStash::Inputs::File
403
+ end end end # class LogStash::Inputs::File
@@ -0,0 +1,61 @@
1
+ # encoding: utf-8
2
+
3
+ module LogStash module Inputs
4
+ # As and when a new WatchedFile is processed FileWatch asks for an instance of this class for the
5
+ # file path of that WatchedFile. All subsequent callbacks are sent via this listener instance.
6
+ # The file is essentially a stream and the path is the identity of that stream.
7
+ class FileListener
8
+ attr_reader :input, :path, :data
9
+ # construct with link back to the input plugin instance.
10
+ def initialize(path, input)
11
+ @path, @input = path, input
12
+ @data = nil
13
+ end
14
+
15
+ def opened
16
+ end
17
+
18
+ def eof
19
+ end
20
+
21
+ def error
22
+ end
23
+
24
+ def timed_out
25
+ input.codec.evict(path)
26
+ end
27
+
28
+ def deleted
29
+ input.codec.evict(path)
30
+ input.handle_deletable_path(path)
31
+ end
32
+
33
+ def accept(data)
34
+ # and push transient data filled dup listener downstream
35
+ input.log_line_received(path, data)
36
+ input.codec.accept(dup_adding_state(data))
37
+ end
38
+
39
+ def process_event(event)
40
+ event.set("[@metadata][path]", path)
41
+ event.set("path", path) unless event.include?("path")
42
+ input.post_process_this(event)
43
+ end
44
+
45
+ def add_state(data)
46
+ @data = data
47
+ self
48
+ end
49
+
50
+ private
51
+
52
+ # duplicate and add state for downstream
53
+ def dup_adding_state(line)
54
+ self.class.new(path, input).add_state(line)
55
+ end
56
+ end
57
+
58
+ class FlushableListener < FileListener
59
+ attr_writer :path
60
+ end
61
+ end end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+
3
+ module LogStash module Inputs
4
+ class LogCompletedFileHandler
5
+ def initialize(log_completed_file_path)
6
+ @log_completed_file_path = Pathname.new(log_completed_file_path)
7
+ end
8
+
9
+ def handle(path)
10
+ @log_completed_file_path.open("a") { |fd| fd.puts(path) }
11
+ end
12
+ end
13
+ end end