logstash-input-csvfile 0.0.6

@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a3f4dde1199a122dfa627e710320adb46356b423
4
+ data.tar.gz: 91dfb639e4dbcc82d53297d6ef4b18b359042002
5
+ SHA512:
6
+ metadata.gz: 671ea579d1cc6ec358c7695f22a59f06f1a67f269389e384c9fc527aa662c4d8c0c5f373c6837f264da040cf177824c1e86e4e5f13fd5126cef0dba75370dd74
7
+ data.tar.gz: 86321481379fc771b41b7919576624446d41a6add7264e7862357fb71404cdea6c8d3b0327aeca916c6f3932a3866832ee5a136b992296de3b8862ca4c22fb11
lib/logstash/inputs/csvfile.rb ADDED
@@ -0,0 +1,234 @@
1
+ # encoding: utf-8
2
+ require "logstash/inputs/file"
3
+ require "logstash/namespace"
4
+ require "csv"
5
+
6
+ # Subclass of logstash-input-file that parses CSV lines, with support for first-line schemas.
7
+ # Set first_line_defines_columns => true to enable this behavior.
8
+ # Statically defined columns are also supported, a la logstash-filter-csv, via the columns param.
9
+ # first_line_defines_columns => true takes precedence, though.
10
+ #
11
+ # Since multiple files may be read by the same plugin instance, and each can have
12
+ # a distinct schema, this plugin records a schema per source file (as defined by the
13
+ # event's path attribute) in a hash. When it receives an event for a file it doesn't
14
+ # know, it reads/parses that file's first line to obtain the schema. This approach supports
15
+ # resuming processing after Logstash restarts mid-file.
16
+ #
17
+ # I considered extending logstash-filter-csv to do this, but felt that the only reliable
18
+ # way to support streaming CSV reads was to explicitly read the schema from the file's schema row
19
+ # (and cache it so subsequent row performance for that file is good.) Since we cannot count
20
+ # on a logstash filter having read-access to the file, or even processing events that originate
21
+ # from files, I rejected this approach. By definition, a file input plugin must have read-access
22
+ # to the file it's sourcing data from.
23
+ #
24
+ # This plugin borrows most of its csv parsing logic from logstash-filter-csv.
25
+ #
26
+ # This plugin extends logstash-input-file by overriding its decorate method. Note that
27
+ # logstash-input-plugin 0.0.10, released with Logstash 1.5, doesn't set the event's
28
+ # path element before calling decorate (which this plugin requires), so the gemspec insists
29
+ # on logstash-input-file 1.1.0
30
+ #
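+ # For illustration only, a minimal pipeline configuration using this input might
+ # look like the following (the path glob is a hypothetical example):
+ #
+ #   input {
+ #     csvfile {
+ #       path => "/var/data/*.csv"
+ #       start_position => "beginning"
+ #       first_line_defines_columns => true
+ #     }
+ #   }
+ #
+ # Each subsequent row of a file then becomes an event whose fields are named by
+ # that file's own header row; the header row's event is marked with a
+ # _csvmetadata attribute instead of column fields.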
31
+
32
+ class LogStash::Inputs::CSVFile < LogStash::Inputs::File
33
+ config_name "csvfile"
34
+
35
+ # Define a list of column names (in the order they appear in the CSV,
36
+ # as if it were a header line). If `columns` is not configured, or there
37
+ # are not enough columns specified, the default column names are
38
+ # "column1", "column2", etc. In the case that there are more columns
39
+ # in the data than specified in this column list, extra columns will be auto-numbered:
40
+ # (e.g. "user_defined_1", "user_defined_2", "column3", "column4", etc.)
41
+ config :columns, :validate => :array, :default => []
42
+
43
+ # Boolean flag that enables sourcing column names from the first event (line) of each file.
44
+ # A dynamic alternative to explicitly defining columns via the columns attribute.
45
+ config :first_line_defines_columns, :validate => :boolean, :default => false
46
+
47
+ # Define the column separator value. If this is not specified, the default
48
+ # is a comma `,`.
49
+ # Optional.
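+ # Illustrative setting: separator => ";" for semicolon-delimited files.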
50
+ config :separator, :validate => :string, :default => ","
51
+
52
+ # Define the character used to quote CSV fields. If this is not specified
53
+ # the default is a double quote `"`.
54
+ # Optional.
55
+ config :quote_char, :validate => :string, :default => '"'
56
+
57
+ # Define target field for placing the data.
58
+ # Defaults to writing to the root of the event.
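+ # Illustrative setting: target => "csv" places the parsed columns under the
+ # event's "csv" field instead of at the top level.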
59
+ config :target, :validate => :string
60
+
61
+ # The maximum time a csv file's schema can be unused (in hours) before
62
+ # it is automatically scrubbed to avoid memory leakage.
63
+ # If an event for that file arrives subsequently, the schema will be
64
+ # reconstituted (albeit with the penalty of re-reading the schema row from the file).
65
+ #
66
+ # Cache scrubbing occurs inline only when new files are detected, to minimize
67
+ # perf impact on most CSV events. Since new file detection time is the only time
68
+ # the cache actually grows, and we're expecting to pay the schema-read penalty then
69
+ # anyway, it's an optimal time to scrub.
70
+ #
71
+ # 0 disables scrubbing, but memory use will grow. That is OK if you routinely restart Logstash.
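+ #
+ # Illustrative settings:
+ #   max_cached_schema_age_hours => 0.5   # scrub schemas unused for 30+ minutes
+ #   max_cached_schema_age_hours => 0     # disable scrubbing entirely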
72
+ config :max_cached_schema_age_hours, :validate => :number, :default => 24
73
+
74
+ # Handles cases where there is other content in the file before the schema row
75
+ # that you'll want to ignore. For instance, you can skip leading blank lines
76
+ # before the schema by matching to non-blank lines using "^.+$"
77
+ # Note that the plugin will still emit events for pre-schema rows, albeit with
78
+ # no attributes (for blank lines) or default-named attributes (if the pre-schema
79
+ # lines do parse as valid CSV).
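+ # Illustrative setting: schema_pattern_to_match => "^.+$" treats the first
+ # non-blank line as the schema row.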
80
+ config :schema_pattern_to_match, :validate => :string
81
+
82
+ # For testing support: adds attributes to events describing schema cache behavior.
83
+ config :add_schema_cache_telemetry_to_event, :validate => :boolean, :default => false
84
+
85
+ public
86
+ def register
87
+ @fileColumns = Hash.new
88
+ @schemaTouchedTimes = Hash.new
89
+ super()
90
+
91
+ @logger.warn("schema cache scrubbing disabled. Memory use will grow over time.") if @max_cached_schema_age_hours <= 0
92
+ end
93
+
94
+ def decorate(event)
95
+ super(event)
96
+
97
+ message = event["message"]
98
+ return if !message
99
+
100
+ begin
101
+ values = CSV.parse_line(message, :col_sep => @separator, :quote_char => @quote_char)
102
+ return if values.nil? || values.empty?
103
+
104
+ # Get names for the columns.
105
+ if @first_line_defines_columns
106
+ @logger.debug? && @logger.debug("handling csv in first_line_defines_columns mode", :message => message, :columns => @columns)
107
+ cols = getSchemaForFile(event, values)
108
+ else
109
+ @logger.debug? && @logger.debug("handling csv in explicitly defined columns mode", :message => message, :columns => @columns)
110
+ cols = @columns
111
+ end
112
+
113
+ # Determine where to write the new attributes
114
+ if @target.nil?
115
+ # Default is to write to the root of the event.
116
+ dest = event
117
+ else
118
+ dest = event[@target] ||= {}
119
+ end
120
+
121
+ # Add the per-column attributes (as long as this isn't the event from the schema defining row)
122
+ if !event["_csvmetadata"]
123
+ values.each_index do |i|
124
+ field_name = cols[i] || "column#{i+1}"
125
+ dest[field_name] = values[i]
126
+ end
127
+ end
128
+
129
+ rescue => e
130
+ event.tag "_csvparsefailure"
131
+ @logger.warn("Trouble parsing csv", :message => message, :exception => e)
132
+ return
133
+ end # begin
134
+ end # decorate()
135
+
136
+ def getSchemaForFile(event, parsedValues)
137
+ path = event["path"]
138
+ if !path
139
+ @logger.warn("No path in event. Cannot retrieve a schema for this event.")
140
+ return []
141
+ end
142
+
143
+ @logger.debug? && @logger.debug("Getting schema for file", :path => path)
144
+
145
+ schema = getCachedSchemaForFile(path)
146
+ if schema
147
+ @logger.debug? && @logger.debug("Using cached schema", :cols => schema)
148
+ event["_schemacachetelemetry"]="cachedEntryUsed" if @add_schema_cache_telemetry_to_event
149
+ touchSchema(path)
150
+ return schema
151
+ end
152
+
153
+ @logger.debug? && @logger.debug("Event from unknown file/schema. Reading schema from that file.", :path => path)
154
+
155
+ scrubSchemaCache(event) if @max_cached_schema_age_hours > 0
156
+
157
+ csvFileLine = readSchemaLineFromFile(path)
158
+ if !csvFileLine || csvFileLine.length == 0
159
+ @logger.warn("No suitable schema row found in file.", :path => path)
160
+ return []
161
+ end
162
+
163
+ schema = CSV.parse_line(csvFileLine, :col_sep => @separator, :quote_char => @quote_char)
164
+ addSchemaToCache(path, schema)
165
+ @logger.debug? && @logger.debug("Schema read from file:", :path => path, :cols => schema)
166
+
167
+ if @add_schema_cache_telemetry_to_event
168
+ event["_schemacachetelemetry"]="newEntryCreated"
169
+ event["_cache_touch_time"]=Time.now
170
+ end
171
+
172
+ # Special handling for the schema row event: tag _csvmetadata and don't return individual column attributes
173
+ if @fileColumns[path].join == parsedValues.join
174
+ @logger.debug? && @logger.debug("Received the schema row event. Tagging w/ _csvmetadata", :message => message)
175
+ event["_csvmetadata"] = true
176
+ return []
177
+ else
178
+ return schema
179
+ end
180
+
181
+ end
182
+
183
+ def getCachedSchemaForFile(path)
184
+ @fileColumns[path]
185
+ end
186
+
187
+ def addSchemaToCache(path, schema)
188
+ @fileColumns[path] = schema
189
+ touchSchema(path)
190
+ end
191
+
192
+ def touchSchema(path)
193
+ @schemaTouchedTimes[path] = Time.now
194
+ end
195
+
196
+ def readSchemaLineFromFile(path)
197
+ csvFileLine = ""
198
+ File.open(path, "r") do |f|
199
+ while csvFileLine.length == 0 and csvFileLine = f.gets
200
+ if @schema_pattern_to_match
201
+ if !csvFileLine.end_with?("\n") or !csvFileLine.match(@schema_pattern_to_match)
202
+ csvFileLine = ""
203
+ end
204
+ end
205
+ end
206
+ end
207
+ csvFileLine
208
+ end
209
+
210
+ def scrubSchemaCache(event)
211
+ @logger.debug? && @logger.debug("Scrubbing schema cache", :size => @fileColumns.length)
212
+ event["_schemacachetelemetryscrubbedbeforecount"]=@fileColumns.length if @add_schema_cache_telemetry_to_event
213
+
214
+ expiringFiles = []
215
+ now = Time.now
216
+ @schemaTouchedTimes.each do |filename, lastReadTime|
217
+ if (lastReadTime + (@max_cached_schema_age_hours * 60 * 60)) < now
218
+ expiringFiles << filename
219
+ @logger.debug? && @logger.debug("Expiring schema for: ", :file => filename, :lastRead => lastReadTime)
220
+ end
221
+ end
222
+
223
+ expiringFiles.each do |filename|
224
+ @fileColumns.delete(filename)
225
+ @schemaTouchedTimes.delete(filename)
226
+ @logger.debug? && @logger.debug("Deleted schema for: ", :file => filename)
227
+ end
228
+
229
+ event["_schemacachetelemetryscrubbedaftercount"]=@fileColumns.length if @add_schema_cache_telemetry_to_event
230
+ @logger.debug? && @logger.debug("Done scrubbing schema cache", :size => @fileColumns.length)
231
+
232
+ end
233
+
234
+ end # class LogStash::Inputs::CSVFile
spec/inputs/csvfile_spec.rb ADDED
@@ -0,0 +1,521 @@
1
+ # encoding: utf-8
2
+
3
+ require "logstash/devutils/rspec/spec_helper"
4
+ require "tempfile"
5
+ require "stud/temporary"
6
+ require "logstash/inputs/csvfile"
7
+
8
+ describe "inputs/csvfile" do
9
+
10
+ delimiter = (LogStash::Environment.windows? ? "\r\n" : "\n")
11
+
12
+ #Borrowed this first check from file_spec.rb verbatim to get the pipeline running...
13
+ it "should starts at the end of an existing file" do
14
+ tmpfile_path = Stud::Temporary.pathname
15
+ sincedb_path = Stud::Temporary.pathname
16
+
17
+ conf = <<-CONFIG
18
+ input {
19
+ file {
20
+ type => "blah"
21
+ path => "#{tmpfile_path}"
22
+ sincedb_path => "#{sincedb_path}"
23
+ delimiter => "#{delimiter}"
24
+ }
25
+ }
26
+ CONFIG
27
+
28
+ File.open(tmpfile_path, "w") do |fd|
29
+ fd.puts("ignore me 1")
30
+ fd.puts("ignore me 2")
31
+ end
32
+
33
+ events = input(conf) do |pipeline, queue|
34
+
35
+ # at this point the plugin's
36
+ # threads might still be initializing so we cannot know when the
37
+ # file plugin will have seen the original file, it could see it
38
+ # after the first(s) hello world appends below, hence the
39
+ # retry logic.
40
+
41
+ events = []
42
+
43
+ retries = 0
44
+ while retries < 20
45
+ File.open(tmpfile_path, "a") do |fd|
46
+ fd.puts("hello")
47
+ fd.puts("world")
48
+ end
49
+
50
+ if queue.size >= 2
51
+ events = 2.times.collect { queue.pop }
52
+ break
53
+ end
54
+
55
+ sleep(0.1)
56
+ retries += 1
57
+ end
58
+
59
+ events
60
+ end #input block
61
+
62
+ insist { events[0]["message"] } == "hello"
63
+ insist { events[1]["message"] } == "world"
64
+ end #it
65
+
66
+ it "should parse csv columns into event attributes using default column names" do
67
+ tmpfile_path = Stud::Temporary.pathname
68
+ sincedb_path = Stud::Temporary.pathname
69
+
70
+ conf = <<-CONFIG
71
+ input {
72
+ csvfile {
73
+ path => "#{tmpfile_path}"
74
+ start_position => "beginning"
75
+ sincedb_path => "#{sincedb_path}"
76
+ delimiter => "#{delimiter}"
77
+ }
78
+ }
79
+ CONFIG
80
+
81
+ File.open(tmpfile_path, "a") do |fd|
82
+ fd.puts("first,second,third")
83
+ fd.puts('"fou,rth","fifth"') #Quoting check
84
+ fd.puts("sixth,seventh,eighth,ninth")
85
+ end
86
+
87
+ events = input(conf) do |pipeline, queue|
88
+ 3.times.collect { queue.pop }
89
+ end
90
+
91
+ insist { events[0]["column1"] } == "first"
92
+ insist { events[0]["column2"] } == "second"
93
+ insist { events[0]["column3"] } == "third"
94
+ insist { events[1]["column1"] } == "fou,rth" #Not a typo: quoting check
95
+ insist { events[1]["column2"] } == "fifth"
96
+ insist { events[2]["column1"] } == "sixth"
97
+ insist { events[2]["column2"] } == "seventh"
98
+ insist { events[2]["column3"] } == "eighth"
99
+ insist { events[2]["column4"] } == "ninth"
100
+
101
+ end #it
102
+
103
+ it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
104
+ tmpfile_path = Stud::Temporary.pathname
105
+ sincedb_path = Stud::Temporary.pathname
106
+
107
+ conf = <<-CONFIG
108
+ input {
109
+ csvfile {
110
+ path => "#{tmpfile_path}"
111
+ start_position => "beginning"
112
+ sincedb_path => "#{sincedb_path}"
113
+ delimiter => "#{delimiter}"
114
+ separator => ";"
115
+ columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
116
+ }
117
+ }
118
+ CONFIG
119
+
120
+ File.open(tmpfile_path, "a") do |fd|
121
+ fd.puts("first;second;third")
122
+ fd.puts("fourth;fifth")
123
+ fd.puts("sixth;sev,enth;eighth;ninth")
124
+ end
125
+
126
+ events = input(conf) do |pipeline, queue|
127
+ 3.times.collect { queue.pop }
128
+ end
129
+
130
+ insist { events[0]["FIRST_COL"] } == "first"
131
+ insist { events[0]["SECOND_COL"] } == "second"
132
+ insist { events[0]["THIRD_COL"] } == "third"
133
+ insist { events[1]["FIRST_COL"] } == "fourth"
134
+ insist { events[1]["SECOND_COL"] } == "fifth"
135
+ insist { events[2]["FIRST_COL"] } == "sixth"
136
+ insist { events[2]["SECOND_COL"] } == "sev,enth"
137
+ insist { events[2]["THIRD_COL"] } == "eighth"
138
+ insist { events[2]["column4"] } == "ninth"
139
+
140
+ end #it
141
+
142
+ it "should parse csv columns into attributes using column names defined on the csv files 0th row with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
143
+ tmpfile_path = Stud::Temporary.pathname
144
+ tmpfile2_path = Stud::Temporary.pathname
145
+ sincedb_path = Stud::Temporary.pathname
146
+
147
+ conf = <<-CONFIG
148
+ input {
149
+ csvfile {
150
+ path => "#{tmpfile_path}"
151
+ path => "#{tmpfile2_path}"
152
+ start_position => "beginning"
153
+ sincedb_path => "#{sincedb_path}"
154
+ delimiter => "#{delimiter}"
155
+ first_line_defines_columns => true
156
+ }
157
+ }
158
+ CONFIG
159
+
160
+ File.open(tmpfile_path, "a") do |fd|
161
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
162
+ fd.puts("first,second,third")
163
+ fd.puts("fourth,fifth")
164
+ fd.puts("sixth,seventh,eighth,ninth")
165
+ end
166
+
167
+ events = input(conf) do |pipeline, queue|
168
+ 4.times.collect { queue.pop }
169
+ end
170
+
171
+ insist { events[0]["_csvmetadata"] } == true
172
+ insist { events[1]["A_COLUMN"] } == "first"
173
+ insist { events[1]["B_COLUMN"] } == "second"
174
+ insist { events[1]["C_COLUMN"] } == "third"
175
+ insist { events[2]["A_COLUMN"] } == "fourth"
176
+ insist { events[2]["B_COLUMN"] } == "fifth"
177
+ insist { events[3]["A_COLUMN"] } == "sixth"
178
+ insist { events[3]["B_COLUMN"] } == "seventh"
179
+ insist { events[3]["C_COLUMN"] } == "eighth"
180
+ insist { events[3]["column4"] } == "ninth"
181
+
182
+ File.open(tmpfile2_path, "a") do |fd|
183
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
184
+ fd.puts("first,second,third")
185
+ fd.puts("fourth,fifth")
186
+ fd.puts("sixth,seventh,eighth,ninth")
187
+ end
188
+
189
+ events = input(conf) do |pipeline, queue|
190
+ 4.times.collect { queue.pop }
191
+ end
192
+
193
+ insist { events[0]["_csvmetadata"] } == true
194
+ insist { events[1]["D_COLUMN"] } == "first"
195
+ insist { events[1]["E_COLUMN"] } == "second"
196
+ insist { events[1]["F_COLUMN"] } == "third"
197
+ insist { events[2]["D_COLUMN"] } == "fourth"
198
+ insist { events[2]["E_COLUMN"] } == "fifth"
199
+ insist { events[3]["D_COLUMN"] } == "sixth"
200
+ insist { events[3]["E_COLUMN"] } == "seventh"
201
+ insist { events[3]["F_COLUMN"] } == "eighth"
202
+ insist { events[3]["column4"] } == "ninth"
203
+
204
+ end #it
205
+
206
+ it "should parse csv columns into attributes using column names defined on the first file row that matches the schema_pattern_to_match with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
207
+ tmpfile_path = Stud::Temporary.pathname
208
+ sincedb_path = Stud::Temporary.pathname
209
+
210
+ conf = <<-CONFIG
211
+ input {
212
+ csvfile {
213
+ path => "#{tmpfile_path}"
214
+ start_position => "beginning"
215
+ sincedb_path => "#{sincedb_path}"
216
+ delimiter => "#{delimiter}"
217
+ first_line_defines_columns => true
218
+ schema_pattern_to_match => "^.+$"
219
+ }
220
+ }
221
+ CONFIG
222
+
223
+ File.open(tmpfile_path, "a") do |fd|
224
+ fd.puts("")
225
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
226
+ fd.puts("first,second,third")
227
+ fd.puts("fourth,fifth")
228
+ fd.puts("sixth,seventh,eighth,ninth")
229
+ end
230
+
231
+ events = input(conf) do |pipeline, queue|
232
+ 5.times.collect { queue.pop }
233
+ end
234
+
235
+ insist { events[1]["_csvmetadata"] } == true
236
+ insist { events[2]["A_COLUMN"] } == "first"
237
+ insist { events[2]["B_COLUMN"] } == "second"
238
+ insist { events[2]["C_COLUMN"] } == "third"
239
+ insist { events[3]["A_COLUMN"] } == "fourth"
240
+ insist { events[3]["B_COLUMN"] } == "fifth"
241
+ insist { events[4]["A_COLUMN"] } == "sixth"
242
+ insist { events[4]["B_COLUMN"] } == "seventh"
243
+ insist { events[4]["C_COLUMN"] } == "eighth"
244
+ insist { events[4]["column4"] } == "ninth"
245
+
246
+ end #it
247
+
248
+ it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
249
+ tmpfile_path = Stud::Temporary.pathname
250
+ sincedb_path = Stud::Temporary.pathname
251
+
252
+ conf = <<-CONFIG
253
+ input {
254
+ csvfile {
255
+ path => "#{tmpfile_path}"
256
+ start_position => "beginning"
257
+ sincedb_path => "#{sincedb_path}"
258
+ delimiter => "#{delimiter}"
259
+ separator => ";"
260
+ columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
261
+ }
262
+ }
263
+ CONFIG
264
+
265
+ File.open(tmpfile_path, "a") do |fd|
266
+ fd.puts("first;second;third")
267
+ fd.puts("fourth;fifth")
268
+ fd.puts("sixth;sev,enth;eighth;ninth")
269
+ end
270
+
271
+ events = input(conf) do |pipeline, queue|
272
+ 3.times.collect { queue.pop }
273
+ end
274
+
275
+ insist { events[0]["FIRST_COL"] } == "first"
276
+ insist { events[0]["SECOND_COL"] } == "second"
277
+ insist { events[0]["THIRD_COL"] } == "third"
278
+ insist { events[1]["FIRST_COL"] } == "fourth"
279
+ insist { events[1]["SECOND_COL"] } == "fifth"
280
+ insist { events[2]["FIRST_COL"] } == "sixth"
281
+ insist { events[2]["SECOND_COL"] } == "sev,enth"
282
+ insist { events[2]["THIRD_COL"] } == "eighth"
283
+ insist { events[2]["column4"] } == "ninth"
284
+
285
+ end #it
286
+
287
+ it "should cache schemas per file" do
288
+ tmpfile_path = Stud::Temporary.pathname
289
+ tmpfile2_path = Stud::Temporary.pathname
290
+ sincedb_path = Stud::Temporary.pathname
291
+
292
+ conf = <<-CONFIG
293
+ input {
294
+ csvfile {
295
+ path => "#{tmpfile_path}"
296
+ path => "#{tmpfile2_path}"
297
+ start_position => "beginning"
298
+ sincedb_path => "#{sincedb_path}"
299
+ delimiter => "#{delimiter}"
300
+ first_line_defines_columns => true
301
+ add_schema_cache_telemetry_to_event => true
302
+ }
303
+ }
304
+ CONFIG
305
+
306
+
307
+ events = input(conf) do |pipeline, queue|
308
+ File.open(tmpfile_path, "a") do |fd|
309
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
310
+ fd.puts("first,second,third")
311
+ end
312
+
313
+ sleep 1
314
+
315
+ File.open(tmpfile2_path, "a") do |fd|
316
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
317
+ fd.puts("1st,2nd,3rd")
318
+ end
319
+
320
+ 4.times.collect { queue.pop }
321
+ end
322
+
323
+ insist { events[0]["_csvmetadata"] } == true
324
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
325
+
326
+ insist { events[1]["A_COLUMN"] } == "first"
327
+ insist { events[1]["B_COLUMN"] } == "second"
328
+ insist { events[1]["C_COLUMN"] } == "third"
329
+ insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"
330
+
331
+ insist { events[2]["_csvmetadata"] } == true
332
+ insist { events[2]["_schemacachetelemetry"] } == "newEntryCreated"
333
+
334
+ insist { events[3]["D_COLUMN"] } == "1st"
335
+ insist { events[3]["E_COLUMN"] } == "2nd"
336
+ insist { events[3]["F_COLUMN"] } == "3rd"
337
+ insist { events[3]["_schemacachetelemetry"] } == "cachedEntryUsed"
338
+
339
+ end #it
340
+
341
+ it "should resume processing of a csv file after logstash restarts" do
342
+ tmpfile_path = Stud::Temporary.pathname
343
+ sincedb_path = Stud::Temporary.pathname
344
+
345
+ # Request that telemetry be added to the events to make schema cache usage visible.
346
+ conf = <<-CONFIG
347
+ input {
348
+ csvfile {
349
+ path => "#{tmpfile_path}"
350
+ start_position => "beginning"
351
+ sincedb_path => "#{sincedb_path}"
352
+ delimiter => "#{delimiter}"
353
+ first_line_defines_columns => true
354
+ add_schema_cache_telemetry_to_event => true
355
+ }
356
+ }
357
+ CONFIG
358
+
359
+
360
+ File.open(tmpfile_path, "a") do |fd|
361
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
362
+ fd.puts("first,second,third")
363
+ end
364
+
365
+ events = input(conf) do |pipeline, queue|
366
+ 2.times.collect { queue.pop }
367
+ end
368
+
369
+ insist { events[0]["_csvmetadata"] } == true
370
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
371
+
372
+ insist { events[1]["A_COLUMN"] } == "first"
373
+ insist { events[1]["B_COLUMN"] } == "second"
374
+ insist { events[1]["C_COLUMN"] } == "third"
375
+ insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"
376
+
377
+ File.open(tmpfile_path, "a") do |fd|
378
+ fd.puts("fourth,fifth,sixth")
379
+ end
380
+
381
+ events = input(conf) do |pipeline, queue|
382
+ 1.times.collect { queue.pop }
383
+ end
384
+
385
+ insist { events[0]["A_COLUMN"] } == "fourth"
386
+ insist { events[0]["B_COLUMN"] } == "fifth"
387
+ insist { events[0]["C_COLUMN"] } == "sixth"
388
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
389
+
390
+ end #it
391
+
392
+ it "should expire schema cache entries if untouched for more than their configured lifetime (10s in this case)" do
393
+
394
+ # This was tricky to write. Key points:
395
+ # - Utilizes a special white-box mode of the plugin that exposes what it's doing with its schema cache in telemetry attributes.
396
+ # - While cache durations are typically multiple hours, for testing we dial them back to 10s via a small fractional value.
397
+ # - All the various file IO has to go into the input block
398
+ # - The queue reads are sprinkled throughout to synchronize the test proc with logstash's file processing.
399
+ # - Put the insists right after the queue reads to better tie the inputs with the expected outputs.
400
+
401
+ puts "\nThe caching test now running will take a while... (~30s)"
402
+
403
+ tmpfile_path = Stud::Temporary.pathname
404
+ tmpfile2_path = Stud::Temporary.pathname
405
+ tmpfile3_path = Stud::Temporary.pathname
406
+ sincedb_path = Stud::Temporary.pathname
407
+
408
+ conf = <<-CONFIG
409
+ input {
410
+ csvfile {
411
+ path => "#{tmpfile_path}"
412
+ path => "#{tmpfile2_path}"
413
+ path => "#{tmpfile3_path}"
414
+ start_position => "beginning"
415
+ sincedb_path => "#{sincedb_path}"
416
+ delimiter => "#{delimiter}"
417
+ first_line_defines_columns => true
418
+ max_cached_schema_age_hours => 0.0027777777777778
419
+ add_schema_cache_telemetry_to_event => true
420
+ discover_interval => 1
421
+ }
422
+ }
423
+ CONFIG
424
+
425
+ events = input(conf) do |pipeline, queue|
426
+
427
+ # File1 Initial Entries. File 1's schema will be cached.
428
+ File.open(tmpfile_path, "a") do |fd|
429
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
430
+ fd.puts("first,second,third")
431
+ end
432
+ # Verify File1 schema was cached and schema row was tagged as csvmetadata
433
+ event = queue.pop
434
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
435
+ insist { event["_csvmetadata"] } == true
436
+
437
+ # Verify that cached File1 schema was used to decode row2 of File1
438
+ event = queue.pop
439
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
440
+ insist { event["A_COLUMN"] } == "first"
441
+ insist { event["B_COLUMN"] } == "second"
442
+ insist { event["C_COLUMN"] } == "third"
443
+
444
+ # File2 Initial Entries
445
+ File.open(tmpfile2_path, "a") do |fd|
446
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
447
+ fd.puts("1st,2nd,3rd")
448
+ end
449
+ # Verify File2 schema was cached and schema row was tagged as csvmetadata
450
+ event = queue.pop
451
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
452
+ insist { event["_csvmetadata"] } == true
453
+
454
+ # Verify that cached File2 schema was used to decode row2 of File2
455
+ event = queue.pop
456
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
457
+ insist { event["D_COLUMN"] } == "1st"
458
+ insist { event["E_COLUMN"] } == "2nd"
459
+ insist { event["F_COLUMN"] } == "3rd"
460
+
461
+ # Touch File1 before its cached schema entry expires (<10s), refreshing the entry.
462
+ sleep 5
463
+ File.open(tmpfile_path, "a") do |fd|
464
+ fd.puts("fourth,fifth,sixth")
465
+ end
466
+ # Verify that still-cached File1 schema was used to decode newly added row of File1
467
+ event = queue.pop
468
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
469
+ insist { event["A_COLUMN"] } == "fourth"
470
+ insist { event["B_COLUMN"] } == "fifth"
471
+ insist { event["C_COLUMN"] } == "sixth"
472
+
473
+ # Touch File1 again after File2's cache entry expires.
474
+ sleep 10
475
+ File.open(tmpfile_path, "a") do |fd|
476
+ fd.puts("seventh,eighth,ninth")
477
+ end
478
+ # Verify that File1's entry hasn't expired, by virtue of the previous touch refreshing it.
479
+ event = queue.pop
480
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
481
+ insist { event["A_COLUMN"] } == "seventh"
482
+ insist { event["B_COLUMN"] } == "eighth"
483
+ insist { event["C_COLUMN"] } == "ninth"
484
+
485
+ # Touch File3. Creation of its cache entry forces purge of File2's expired entry, which is made visible via telemetry.
486
+ sleep 1
487
+ File.open(tmpfile3_path, "a") do |fd|
488
+ fd.puts("X_COLUMN,Y_COLUMN,Z_COLUMN")
489
+ fd.puts("erste,zweite,dritte")
490
+ end
491
+ # Verify that scrubbing of expired cache entries takes place, reducing cached count from 2 (File1 & File2) to 1 (Just File1).
492
+ # (Scrubbing takes place before creation of File3's schema entry in the cache.)
493
+ event = queue.pop
494
+ insist { event["_csvmetadata"] } == true
495
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
496
+ insist { event["_schemacachetelemetryscrubbedbeforecount"] } == 2
497
+ insist { event["_schemacachetelemetryscrubbedaftercount"] } == 1
498
+
499
+ # Verify that File3's schema did in fact get cached.
500
+ event = queue.pop
501
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
502
+ insist { event["X_COLUMN"] } == "erste"
503
+ insist { event["Y_COLUMN"] } == "zweite"
504
+ insist { event["Z_COLUMN"] } == "dritte"
505
+
506
+ # File2 post-expiration entry. Should re-create the File2 cache entry.
507
+ sleep 1
508
+ File.open(tmpfile2_path, "a") do |fd|
509
+ fd.puts("4th,5th,6th")
510
+ end
511
+ # Verify that File2's schema gets recreated (the schema row itself is not re-emitted as an event since this isn't the natural row-0 read).
512
+ event = queue.pop
513
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
514
+ insist { event["D_COLUMN"] } == "4th"
515
+ insist { event["E_COLUMN"] } == "5th"
516
+ insist { event["F_COLUMN"] } == "6th"
517
+
518
+ end #input block
519
+ end #it
520
+
521
+ end
metadata ADDED
@@ -0,0 +1,124 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: logstash-input-csvfile
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.6
5
+ platform: ruby
6
+ authors:
7
+ - jweite
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-02-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - '>='
17
+ - !ruby/object:Gem::Version
18
+ version: 1.5.0
19
+ - - <
20
+ - !ruby/object:Gem::Version
21
+ version: 3.0.0
22
+ name: logstash-core
23
+ prerelease: false
24
+ type: :runtime
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.5.0
30
+ - - <
31
+ - !ruby/object:Gem::Version
32
+ version: 3.0.0
33
+ - !ruby/object:Gem::Dependency
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ name: logstash-codec-plain
40
+ prerelease: false
41
+ type: :runtime
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ - !ruby/object:Gem::Dependency
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - '>='
51
+ - !ruby/object:Gem::Version
52
+ version: 1.0.1
53
+ name: logstash-input-file
54
+ prerelease: false
55
+ type: :runtime
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - '>='
59
+ - !ruby/object:Gem::Version
60
+ version: 1.0.1
61
+ - !ruby/object:Gem::Dependency
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ name: stud
68
+ prerelease: false
69
+ type: :runtime
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ - !ruby/object:Gem::Dependency
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ name: logstash-devutils
82
+ prerelease: false
83
+ type: :development
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ description: This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program
90
+ email: jweite@yahoo.com
91
+ executables: []
92
+ extensions: []
93
+ extra_rdoc_files: []
94
+ files:
95
+ - lib/logstash/inputs/csvfile.rb
96
+ - spec/inputs/csvfile_spec.rb
97
+ homepage: ''
98
+ licenses:
99
+ - Apache License (2.0)
100
+ metadata:
101
+ logstash_plugin: 'true'
102
+ logstash_group: input
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.4.5
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Extends logstash-input-file to parse csv files, optionally respecting 'first-line schemas'
123
+ test_files:
124
+ - spec/inputs/csvfile_spec.rb