logstash-filter-grok 0.1.0
- checksums.yaml +15 -0
- data/.gitignore +4 -0
- data/Gemfile +3 -0
- data/LICENSE +13 -0
- data/Rakefile +6 -0
- data/lib/logstash/filters/grok.rb +363 -0
- data/logstash-filter-grok.gemspec +29 -0
- data/rakelib/publish.rake +9 -0
- data/rakelib/vendor.rake +169 -0
- data/spec/filters/grok_spec.rb +648 -0
- metadata +104 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    ZDZkNTcxMWY2ZWE0ZGMyYTczNGYzYzRjNDdmMDI4NzE5N2IwN2Q5Mg==
+  data.tar.gz: !binary |-
+    YTJlZDBhZDg2ODViMzNkZjNhMjZmZDc2OTQ2MTFlYTM1MTgyOGNiNA==
+SHA512:
+  metadata.gz: !binary |-
+    ZDJlZTRlYzBmYmFjYmRhNzA1OGE5ZTljN2ZkZGNkNzQ0ZTlhNDI0MzNmMTcx
+    NjFlNjU5MDgwYjI4ZTY3MTkzMWM5ODFmMjAyZGFlMWYzZTI3YjhjMWU0OGNh
+    NTFjYzg2NjRiODNmOWM1NTY0ZGJhMzRlZTdkY2QzN2ZlYjU0OTM=
+  data.tar.gz: !binary |-
+    ZDk4MGU3MzIzNGJkYTk3OGRhYjhiNjIyNTYzYzg1NGU2YzU3ZDQzNGNiZWMw
+    NThhNTBhMDczNmQ0OTM1NTIyYTRmZjkzZTFmNTcxYzliMWVmM2JiNTc2MTVl
+    YzhiMDNjM2RlNTI2MjU0OTdmZmE5NzljYmM0NTRhMjg1YmFiYjY=
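
The !binary values above are Base64-encoded hex digests of the two archives inside the .gem file. A minimal verification sketch, assuming a local copy of the extracted archive (the file path below is illustrative):

# Verify data.tar.gz against the SHA1 recorded above (sketch, not part of the gem).
require "digest"
require "base64"

expected = Base64.decode64("YTJlZDBhZDg2ODViMzNkZjNhMjZmZDc2OTQ2MTFlYTM1MTgyOGNiNA==")
actual   = Digest::SHA1.hexdigest(File.binread("data.tar.gz"))  # assumed path
puts(actual == expected ? "data.tar.gz OK" : "data.tar.gz MISMATCH")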
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,13 @@
+Copyright (c) 2012-2014 Elasticsearch <http://www.elasticsearch.org>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
data/Rakefile
ADDED
data/lib/logstash/filters/grok.rb
ADDED
@@ -0,0 +1,363 @@
+# encoding: utf-8
+require "logstash/filters/base"
+require "logstash/namespace"
+require "logstash/environment"
+require "logstash/patterns/core"
+require "set"
+
+# Parse arbitrary text and structure it.
+#
+# Grok is currently the best way in logstash to parse crappy unstructured log
+# data into something structured and queryable.
+#
+# This tool is perfect for syslog logs, apache and other webserver logs, mysql
+# logs, and in general, any log format that is generally written for humans
+# and not computer consumption.
+#
+# Logstash ships with about 120 patterns by default. You can find them here:
+# <https://github.com/logstash/logstash/tree/v%VERSION%/patterns>. You can add
+# your own trivially. (See the patterns_dir setting)
+#
+# If you need help building patterns to match your logs, you will find the
+# <http://grokdebug.herokuapp.com> tool quite useful!
+#
+# #### Grok Basics
+#
+# Grok works by combining text patterns into something that matches your
+# logs.
+#
+# The syntax for a grok pattern is `%{SYNTAX:SEMANTIC}`
+#
+# The `SYNTAX` is the name of the pattern that will match your text. For
+# example, "3.44" will be matched by the NUMBER pattern and "55.3.244.1" will
+# be matched by the IP pattern. The syntax is how you match.
+#
+# The `SEMANTIC` is the identifier you give to the piece of text being matched.
+# For example, "3.44" could be the duration of an event, so you could call it
+# simply 'duration'. Further, a string "55.3.244.1" might identify the 'client'
+# making a request.
+#
+# For the above example, your grok filter would look something like this:
+#
+#     %{NUMBER:duration} %{IP:client}
+#
+# Optionally you can add a data type conversion to your grok pattern. By default
+# all semantics are saved as strings. If you wish to convert a semantic's data type,
+# for example change a string to an integer, then suffix it with the target data type.
+# For example `%{NUMBER:num:int}` converts the 'num' semantic from a string to an
+# integer. Currently the only supported conversions are `int` and `float`.
+#
+# #### Example
+#
+# With that idea of a syntax and semantic, we can pull out useful fields from a
+# sample log like this fictional http request log:
+#
+#     55.3.244.1 GET /index.html 15824 0.043
+#
+# The pattern for this could be:
+#
+#     %{IP:client} %{WORD:method} %{URIPATHPARAM:request} %{NUMBER:bytes} %{NUMBER:duration}
+#
+# A more realistic example: let's read these logs from a file:
+#
+#     input {
+#       file {
+#         path => "/var/log/http.log"
+#       }
+#     }
+#     filter {
+#       grok {
+#         match => { "message" => "%{IP:client} %{WORD:method} %{URIPATHPARAM:request} %{NUMBER:bytes} %{NUMBER:duration}" }
+#       }
+#     }
+#
+# After the grok filter, the event will have a few extra fields in it:
+#
+# * client: 55.3.244.1
+# * method: GET
+# * request: /index.html
+# * bytes: 15824
+# * duration: 0.043
+#
+# #### Regular Expressions
+#
+# Grok sits on top of regular expressions, so any regular expressions are valid
+# in grok as well. The regular expression library is Oniguruma, and you can see
+# the full supported regexp syntax [on the Oniguruma
+# site](http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt).
+#
+# #### Custom Patterns
+#
+# Sometimes logstash doesn't have a pattern you need. For this, you have
+# a few options.
+#
+# First, you can use the Oniguruma syntax for 'named capture' which will
+# let you match a piece of text and save it as a field:
+#
+#     (?<field_name>the pattern here)
+#
+# For example, postfix logs have a 'queue id' that is a 10- or 11-character
+# hexadecimal value. I can capture that easily like this:
+#
+#     (?<queue_id>[0-9A-F]{10,11})
+#
+# Alternately, you can create a custom patterns file.
+#
+# * Create a directory called `patterns` with a file in it called `extra`
+#   (the file name doesn't matter, but name it meaningfully for yourself)
+# * In that file, write the pattern you need as the pattern name, a space, then
+#   the regexp for that pattern.
+#
+# For example, doing the postfix queue id example as above:
+#
+#     # contents of ./patterns/postfix:
+#     POSTFIX_QUEUEID [0-9A-F]{10,11}
+#
+# Then use the `patterns_dir` setting in this plugin to tell logstash where
+# your custom patterns directory is. Here's a full example with a sample log:
+#
+#     Jan 1 06:25:43 mailserver14 postfix/cleanup[21403]: BEF25A72965: message-id=<20130101142543.5828399CCAF@mailserver14.example.com>
+#
+#     filter {
+#       grok {
+#         patterns_dir => "./patterns"
+#         match => { "message" => "%{SYSLOGBASE} %{POSTFIX_QUEUEID:queue_id}: %{GREEDYDATA:syslog_message}" }
+#       }
+#     }
+#
+# The above will match and result in the following fields:
+#
+# * timestamp: Jan 1 06:25:43
+# * logsource: mailserver14
+# * program: postfix/cleanup
+# * pid: 21403
+# * queue_id: BEF25A72965
+# * syslog_message: message-id=<20130101142543.5828399CCAF@mailserver14.example.com>
+#
+# The `timestamp`, `logsource`, `program`, and `pid` fields come from the
+# SYSLOGBASE pattern, which itself is defined by other patterns.
+class LogStash::Filters::Grok < LogStash::Filters::Base
+  config_name "grok"
+  milestone 3
+
+  # Specify a pattern to parse with. This will match the 'message' field.
+  #
+  # If you want to match other fields than message, use the 'match' setting.
+  # Multiple patterns are fine.
+  config :pattern, :validate => :array, :deprecated => "You should use this instead: match => { \"message\" => \"your pattern here\" }"
+
+  # A hash of matches of field => value
+  #
+  # For example:
+  #
+  #     filter {
+  #       grok { match => { "message" => "Duration: %{NUMBER:duration}" } }
+  #     }
+  #
+  # Alternatively, using the old array syntax:
+  #
+  #     filter {
+  #       grok { match => [ "message", "Duration: %{NUMBER:duration}" ] }
+  #     }
+  #
+  config :match, :validate => :hash, :default => {}
+
+  #
+  # logstash ships by default with a bunch of patterns, so you don't
+  # necessarily need to define this yourself unless you are adding additional
+  # patterns.
+  #
+  # Pattern files are plain text with format:
+  #
+  #     NAME PATTERN
+  #
+  # For example:
+  #
+  #     NUMBER \d+
+  config :patterns_dir, :validate => :array, :default => []
+
+  # Drop if matched. Note, this feature may not stay. It is preferable to combine
+  # grok + grep filters to do parsing + dropping.
+  config :drop_if_match, :validate => :boolean, :default => false
+
+  # Break on first match. The first successful match by grok will result in the
+  # filter being finished. If you want grok to try all patterns (maybe you are
+  # parsing different things), then set this to false.
+  config :break_on_match, :validate => :boolean, :default => true
+
+  # If true, only store named captures from grok.
+  config :named_captures_only, :validate => :boolean, :default => true
+
+  # If true, keep empty captures as event fields.
+  config :keep_empty_captures, :validate => :boolean, :default => false
+
+  # If true, make single-value fields simply that value, not an array
+  # containing that one value.
+  config :singles, :validate => :boolean, :default => true, :deprecated => "This behavior is the default now, you don't need to set it."
+
+  # Append values to the 'tags' field when there has been no
+  # successful match
+  config :tag_on_failure, :validate => :array, :default => ["_grokparsefailure"]
+
+  # The fields to overwrite.
+  #
+  # This allows you to overwrite a value in a field that already exists.
+  #
+  # For example, if you have a syslog line in the 'message' field, you can
+  # overwrite the 'message' field with part of the match like so:
+  #
+  #     filter {
+  #       grok {
+  #         match => { "message" => "%{SYSLOGBASE} %{DATA:message}" }
+  #         overwrite => [ "message" ]
+  #       }
+  #     }
+  #
+  # In this case, a line like "May 29 16:37:11 sadness logger: hello world"
+  # will be parsed and 'hello world' will overwrite the original message.
+  config :overwrite, :validate => :array, :default => []
+
+  # Detect if we are running from a jarfile, pick the right path.
+  @@patterns_path ||= Set.new
+  #@@patterns_path += [LogStash::Environment.pattern_path("*")]
+  @@patterns_path += [LogStash::Patterns::Core.path]
+
+  public
+  def initialize(params)
+    super(params)
+    @match["message"] ||= []
+    @match["message"] += @pattern if @pattern # the config 'pattern' value (array)
+    # a cache of capture name handler methods.
+    @handlers = {}
+  end
+
+  public
+  def register
+    require "grok-pure" # rubygem 'jls-grok'
+
+    @patternfiles = []
+
+    # Have @@patterns_path show first. Last-in pattern definitions win; this
+    # will let folks redefine built-in patterns at runtime.
+    @patterns_dir = @@patterns_path.to_a + @patterns_dir
+    @logger.info? and @logger.info("Grok patterns path", :patterns_dir => @patterns_dir)
+    @patterns_dir.each do |path|
+      if File.directory?(path)
+        path = File.join(path, "*")
+      end
+
+      Dir.glob(path).each do |file|
+        @logger.info? and @logger.info("Grok loading patterns from file", :path => file)
+        @patternfiles << file
+      end
+    end
+
+    @patterns = Hash.new { |h,k| h[k] = [] }
+
+    @logger.info? and @logger.info("Match data", :match => @match)
+
+    @match.each do |field, patterns|
+      patterns = [patterns] if patterns.is_a?(String)
+
+      @logger.info? and @logger.info("Grok compile", :field => field, :patterns => patterns)
+      patterns.each do |pattern|
+        @logger.debug? and @logger.debug("regexp: #{@type}/#{field}", :pattern => pattern)
+        grok = Grok.new
+        grok.logger = @logger unless @logger.nil?
+        add_patterns_from_files(@patternfiles, grok)
+        grok.compile(pattern, @named_captures_only)
+        @patterns[field] << grok
+      end
+    end # @match.each
+  end # def register
+
+  public
+  def filter(event)
+    return unless filter?(event)
+
+    matched = false
+    done = false
+
+    @logger.debug? and @logger.debug("Running grok filter", :event => event)
+    @patterns.each do |field, groks|
+      if match(groks, field, event)
+        matched = true
+        break if @break_on_match
+      end
+      #break if done
+    end # @patterns.each
+
+    if matched
+      filter_matched(event)
+    else
+      # Tag this event if we can't parse it. We can use this later to
+      # reparse+reindex logs if we improve the patterns given.
+      @tag_on_failure.each do |tag|
+        event["tags"] ||= []
+        event["tags"] << tag unless event["tags"].include?(tag)
+      end
+    end
+
+    @logger.debug? and @logger.debug("Event now: ", :event => event)
+  end # def filter
+
+  private
+  def match(groks, field, event)
+    input = event[field]
+    if input.is_a?(Array)
+      success = false
+      input.each do |input|
+        success |= match_against_groks(groks, input, event)
+      end
+      return success
+    else
+      return match_against_groks(groks, input, event)
+    end
+  rescue StandardError => e
+    @logger.warn("Grok regexp threw exception", :exception => e.message)
+  end
+
+  private
+  def match_against_groks(groks, input, event)
+    matched = false
+    groks.each do |grok|
+      # Convert anything else to string (number, hash, etc)
+      matched = grok.match_and_capture(input.to_s) do |field, value|
+        matched = true
+        handle(field, value, event)
+      end
+      break if matched and @break_on_match
+    end
+    return matched
+  end
+
+  private
+  def handle(field, value, event)
+    return if (value.nil? || (value.is_a?(String) && value.empty?)) unless @keep_empty_captures
+
+    if @overwrite.include?(field)
+      event[field] = value
+    else
+      v = event[field]
+      if v.nil?
+        event[field] = value
+      elsif v.is_a?(Array)
+        event[field] << value
+      elsif v.is_a?(String)
+        # Promote to array since we aren't overwriting.
+        event[field] = [v, value]
+      end
+    end
+  end
+
+  private
+  def add_patterns_from_files(paths, grok)
+    paths.each do |path|
+      if !File.exists?(path)
+        raise "Grok pattern file does not exist: #{path}"
+      end
+      grok.add_patterns_from_file(path)
+    end
+  end # def add_patterns_from_files
+
+end # class LogStash::Filters::Grok
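
A minimal sketch of the underlying jls-grok API the filter above drives, using only calls the filter itself makes (the pattern file path is an assumption; it would point at the patterns shipped by logstash-patterns-core):

# Sketch: using grok-pure directly, outside logstash.
require "grok-pure"  # rubygem 'jls-grok'

grok = Grok.new
grok.add_patterns_from_file("patterns/grok-patterns")  # assumed location
grok.compile("%{IP:client} %{WORD:method} %{URIPATHPARAM:request} %{NUMBER:bytes} %{NUMBER:duration}", true)

captures = {}
grok.match_and_capture("55.3.244.1 GET /index.html 15824 0.043") do |field, value|
  captures[field] = value
end
p captures  # e.g. {"client"=>"55.3.244.1", "method"=>"GET", ...}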
data/logstash-filter-grok.gemspec
ADDED
@@ -0,0 +1,29 @@
+Gem::Specification.new do |s|
+
+  s.name          = 'logstash-filter-grok'
+  s.version       = '0.1.0'
+  s.licenses      = ['Apache License (2.0)']
+  s.summary       = "Parse arbitrary text and structure it."
+  s.description   = "Grok is currently the best way in logstash to parse crappy unstructured log data into something structured and queryable."
+  s.authors       = ["Elasticsearch"]
+  s.email         = 'richard.pijnenburg@elasticsearch.com'
+  s.homepage      = "http://logstash.net/"
+  s.require_paths = ["lib"]
+
+  # Files
+  s.files = `git ls-files`.split($\)
+
+  # Tests
+  s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+  # Special flag to let us know this is actually a logstash plugin
+  s.metadata = { "logstash_plugin" => "true", "group" => "filter" }
+
+  # Gem dependencies
+  s.add_runtime_dependency 'logstash', '>= 1.4.0', '< 2.0.0'
+
+  s.add_runtime_dependency 'jls-grok', ['0.11.0']
+  s.add_runtime_dependency 'logstash-patterns-core'
+
+end
+
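
For context, one way to build the .gem from a spec like this: a sketch assuming RubyGems 2.x and a git checkout (s.files shells out to git ls-files), equivalent to running `gem build logstash-filter-grok.gemspec`:

# Sketch: build the gem programmatically from the spec above.
require "rubygems/package"

spec = Gem::Specification.load("logstash-filter-grok.gemspec")
Gem::Package.build(spec)  # writes logstash-filter-grok-0.1.0.gem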
data/rakelib/publish.rake
ADDED
@@ -0,0 +1,9 @@
+require "gem_publisher"
+
+desc "Publish gem to RubyGems.org"
+task :publish_gem do |t|
+  gem_file = Dir.glob(File.expand_path('../*.gemspec', File.dirname(__FILE__))).first
+  gem = GemPublisher.publish_if_updated(gem_file, :rubygems)
+  puts "Published #{gem}" if gem
+end
+
data/rakelib/vendor.rake
ADDED
@@ -0,0 +1,169 @@
+require "net/http"
+require "uri"
+require "digest/sha1"
+
+def vendor(*args)
+  return File.join("vendor", *args)
+end
+
+directory "vendor/" => ["vendor"] do |task, args|
+  mkdir task.name
+end
+
+def fetch(url, sha1, output)
+
+  puts "Downloading #{url}"
+  actual_sha1 = download(url, output)
+
+  if actual_sha1 != sha1
+    fail "SHA1 does not match (expected '#{sha1}' but got '#{actual_sha1}')"
+  end
+end # def fetch
+
+def file_fetch(url, sha1)
+  filename = File.basename( URI(url).path )
+  output = "vendor/#{filename}"
+  task output => [ "vendor/" ] do
+    begin
+      actual_sha1 = file_sha1(output)
+      if actual_sha1 != sha1
+        fetch(url, sha1, output)
+      end
+    rescue Errno::ENOENT
+      fetch(url, sha1, output)
+    end
+  end.invoke
+
+  return output
+end
+
+def file_sha1(path)
+  digest = Digest::SHA1.new
+  fd = File.new(path, "r")
+  while true
+    begin
+      digest << fd.sysread(16384)
+    rescue EOFError
+      break
+    end
+  end
+  return digest.hexdigest
+ensure
+  fd.close if fd
+end
+
+def download(url, output)
+  uri = URI(url)
+  digest = Digest::SHA1.new
+  tmp = "#{output}.tmp"
+  Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == "https")) do |http|
+    request = Net::HTTP::Get.new(uri.path)
+    http.request(request) do |response|
+      # Note: Net::HTTP response codes are strings; fail on anything that is
+      # not a success or redirect.
+      fail "HTTP fetch failed for #{url}. #{response}" unless ["200", "301"].include?(response.code)
+      size = (response["content-length"].to_i || -1).to_f
+      count = 0
+      File.open(tmp, "w") do |fd|
+        response.read_body do |chunk|
+          fd.write(chunk)
+          digest << chunk
+          if size > 0 && $stdout.tty?
+            count += chunk.bytesize
+            $stdout.write(sprintf("\r%0.2f%%", count/size * 100))
+          end
+        end
+      end
+      $stdout.write("\r \r") if $stdout.tty?
+    end
+  end
+
+  File.rename(tmp, output)
+
+  return digest.hexdigest
+rescue SocketError => e
+  puts "Failure while downloading #{url}: #{e}"
+  raise
+ensure
+  File.unlink(tmp) if File.exist?(tmp)
+end # def download
+
+def untar(tarball, &block)
+  require "archive/tar/minitar"
+  tgz = Zlib::GzipReader.new(File.open(tarball))
+  # Pull out typesdb
+  tar = Archive::Tar::Minitar::Input.open(tgz)
+  tar.each do |entry|
+    path = block.call(entry)
+    next if path.nil?
+    parent = File.dirname(path)
+
+    mkdir_p parent unless File.directory?(parent)
+
+    # Skip this file if the output file is the same size
+    if entry.directory?
+      mkdir path unless File.directory?(path)
+    else
+      entry_mode = entry.instance_eval { @mode } & 0777
+      if File.exists?(path)
+        stat = File.stat(path)
+        # TODO(sissel): Submit a patch to archive-tar-minitar upstream to
+        # expose headers in the entry.
+        entry_size = entry.instance_eval { @size }
+        # If file sizes are same, skip writing.
+        next if stat.size == entry_size && (stat.mode & 0777) == entry_mode
+      end
+      puts "Extracting #{entry.full_name} from #{tarball} #{entry_mode.to_s(8)}"
+      File.open(path, "w") do |fd|
+        # eof? check lets us skip empty files. Necessary because the API provided by
+        # Archive::Tar::Minitar::Reader::EntryStream only mostly acts like an
+        # IO object. Something about empty files in this EntryStream causes
+        # IO.copy_stream to throw "can't convert nil into String" on JRuby
+        # TODO(sissel): File a bug about this.
+        while !entry.eof?
+          chunk = entry.read(16384)
+          fd.write(chunk)
+        end
+        #IO.copy_stream(entry, fd)
+      end
+      File.chmod(entry_mode, path)
+    end
+  end
+  tar.close
+  File.unlink(tarball) if File.file?(tarball)
+end # def untar
+
+def ungz(file)
+
+  outpath = file.gsub('.gz', '')
+  tgz = Zlib::GzipReader.new(File.open(file))
+  begin
+    File.open(outpath, "w") do |out|
+      IO::copy_stream(tgz, out)
+    end
+    File.unlink(file)
+  rescue
+    File.unlink(outpath) if File.file?(outpath)
+    raise
+  end
+  tgz.close
+end
+
+desc "Process any vendor files required for this plugin"
+task "vendor" do |task, args|
+
+  @files.each do |file|
+    download = file_fetch(file['url'], file['sha1'])
+    if download =~ /.tar.gz/
+      prefix = download.gsub('.tar.gz', '').gsub('vendor/', '')
+      untar(download) do |entry|
+        if !file['files'].nil?
+          next unless file['files'].include?(entry.full_name.gsub(prefix, ''))
+          out = entry.full_name.split("/").last
+        end
+        File.join('vendor', out)
+      end
+    elsif download =~ /.gz/
+      ungz(download)
+    end
+  end
+
+end
data/spec/filters/grok_spec.rb
ADDED
@@ -0,0 +1,648 @@
+# encoding: utf-8
+require "spec_helper"
+require "logstash/filters/grok"
+
+describe LogStash::Filters::Grok do
+
+  describe "simple syslog line" do
+    # The logstash config goes here.
+    # At this time, only filters are supported.
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{SYSLOGLINE}" }
+          singles => true
+          overwrite => [ "message" ]
+        }
+      }
+    CONFIG
+
+    sample "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from camomile.cloud9.net[168.100.1.3]" do
+      insist { subject["tags"] }.nil?
+      insist { subject["logsource"] } == "evita"
+      insist { subject["timestamp"] } == "Mar 16 00:01:25"
+      insist { subject["message"] } == "connect from camomile.cloud9.net[168.100.1.3]"
+      insist { subject["program"] } == "postfix/smtpd"
+      insist { subject["pid"] } == "1713"
+    end
+  end
+
+  describe "ietf 5424 syslog line" do
+    # The logstash config goes here.
+    # At this time, only filters are supported.
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{SYSLOG5424LINE}" }
+          singles => true
+        }
+      }
+    CONFIG
+
+    sample "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug 4123 - [id1 foo=\"bar\"][id2 baz=\"something\"] Hello, syslog." do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "191"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2009-06-30T18:30:00+02:00"
+      insist { subject["syslog5424_host"] } == "paxton.local"
+      insist { subject["syslog5424_app"] } == "grokdebug"
+      insist { subject["syslog5424_proc"] } == "4123"
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == "[id1 foo=\"bar\"][id2 baz=\"something\"]"
+      insist { subject["syslog5424_msg"] } == "Hello, syslog."
+    end
+
+    sample "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug - - [id1 foo=\"bar\"] No process ID." do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "191"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2009-06-30T18:30:00+02:00"
+      insist { subject["syslog5424_host"] } == "paxton.local"
+      insist { subject["syslog5424_app"] } == "grokdebug"
+      insist { subject["syslog5424_proc"] } == nil
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == "[id1 foo=\"bar\"]"
+      insist { subject["syslog5424_msg"] } == "No process ID."
+    end
+
+    sample "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug 4123 - - No structured data." do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "191"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2009-06-30T18:30:00+02:00"
+      insist { subject["syslog5424_host"] } == "paxton.local"
+      insist { subject["syslog5424_app"] } == "grokdebug"
+      insist { subject["syslog5424_proc"] } == "4123"
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == nil
+      insist { subject["syslog5424_msg"] } == "No structured data."
+    end
+
+    sample "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug - - - No PID or SD." do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "191"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2009-06-30T18:30:00+02:00"
+      insist { subject["syslog5424_host"] } == "paxton.local"
+      insist { subject["syslog5424_app"] } == "grokdebug"
+      insist { subject["syslog5424_proc"] } == nil
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == nil
+      insist { subject["syslog5424_msg"] } == "No PID or SD."
+    end
+
+    sample "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug 4123 - Missing structured data." do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "191"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2009-06-30T18:30:00+02:00"
+      insist { subject["syslog5424_host"] } == "paxton.local"
+      insist { subject["syslog5424_app"] } == "grokdebug"
+      insist { subject["syslog5424_proc"] } == "4123"
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == nil
+      insist { subject["syslog5424_msg"] } == "Missing structured data."
+    end
+
+    sample "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug 4123 - - Additional spaces." do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "191"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2009-06-30T18:30:00+02:00"
+      insist { subject["syslog5424_host"] } == "paxton.local"
+      insist { subject["syslog5424_app"] } == "grokdebug"
+      insist { subject["syslog5424_proc"] } == "4123"
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == nil
+      insist { subject["syslog5424_msg"] } == "Additional spaces."
+    end
+
+    sample "<191>1 2009-06-30T18:30:00+02:00 paxton.local grokdebug 4123 - Additional spaces and missing SD." do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "191"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2009-06-30T18:30:00+02:00"
+      insist { subject["syslog5424_host"] } == "paxton.local"
+      insist { subject["syslog5424_app"] } == "grokdebug"
+      insist { subject["syslog5424_proc"] } == "4123"
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == nil
+      insist { subject["syslog5424_msg"] } == "Additional spaces and missing SD."
+    end
+
+    sample "<30>1 2014-04-04T16:44:07+02:00 osctrl01 dnsmasq-dhcp 8048 - - Appname contains a dash" do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "30"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2014-04-04T16:44:07+02:00"
+      insist { subject["syslog5424_host"] } == "osctrl01"
+      insist { subject["syslog5424_app"] } == "dnsmasq-dhcp"
+      insist { subject["syslog5424_proc"] } == "8048"
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == nil
+      insist { subject["syslog5424_msg"] } == "Appname contains a dash"
+    end
+
+    sample "<30>1 2014-04-04T16:44:07+02:00 osctrl01 - 8048 - - Appname is nil" do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog5424_pri"] } == "30"
+      insist { subject["syslog5424_ver"] } == "1"
+      insist { subject["syslog5424_ts"] } == "2014-04-04T16:44:07+02:00"
+      insist { subject["syslog5424_host"] } == "osctrl01"
+      insist { subject["syslog5424_app"] } == nil
+      insist { subject["syslog5424_proc"] } == "8048"
+      insist { subject["syslog5424_msgid"] } == nil
+      insist { subject["syslog5424_sd"] } == nil
+      insist { subject["syslog5424_msg"] } == "Appname is nil"
+    end
+  end
+
+  describe "parsing an event with multiple messages (array of strings)", :if => false do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "(?:hello|world) %{NUMBER}" }
+          named_captures_only => false
+        }
+      }
+    CONFIG
+
+    sample("message" => [ "hello 12345", "world 23456" ]) do
+      insist { subject["NUMBER"] } == [ "12345", "23456" ]
+    end
+  end
+
+  describe "coercing matched values" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{NUMBER:foo:int} %{NUMBER:bar:float}" }
+          singles => true
+        }
+      }
+    CONFIG
+
+    sample "400 454.33" do
+      insist { subject["foo"] } == 400
+      insist { subject["foo"] }.is_a?(Fixnum)
+      insist { subject["bar"] } == 454.33
+      insist { subject["bar"] }.is_a?(Float)
+    end
+  end
+
+  describe "in-line pattern definitions" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{FIZZLE=\\d+}" }
+          named_captures_only => false
+          singles => true
+        }
+      }
+    CONFIG
+
+    sample "hello 1234" do
+      insist { subject["FIZZLE"] } == "1234"
+    end
+  end
+
+  describe "processing selected fields" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{WORD:word}" }
+          match => { "examplefield" => "%{NUMBER:num}" }
+          break_on_match => false
+          singles => true
+        }
+      }
+    CONFIG
+
+    sample("message" => "hello world", "examplefield" => "12345") do
+      insist { subject["examplefield"] } == "12345"
+      insist { subject["word"] } == "hello"
+    end
+  end
+
+  describe "adding fields on match" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "matchme %{NUMBER:fancy}" }
+          singles => true
+          add_field => [ "new_field", "%{fancy}" ]
+        }
+      }
+    CONFIG
+
+    sample "matchme 1234" do
+      insist { subject["tags"] }.nil?
+      insist { subject["new_field"] } == "1234"
+    end
+
+    sample "this will not be matched" do
+      insist { subject["tags"] }.include?("_grokparsefailure")
+      reject { subject }.include?("new_field")
+    end
+  end
+
+  context "empty fields" do
+    describe "drop by default" do
+      config <<-CONFIG
+        filter {
+          grok {
+            match => { "message" => "1=%{WORD:foo1} *(2=%{WORD:foo2})?" }
+          }
+        }
+      CONFIG
+
+      sample "1=test" do
+        insist { subject["tags"] }.nil?
+        insist { subject }.include?("foo1")
+
+        # Since 'foo2' was not captured, it must not be present in the event.
+        reject { subject }.include?("foo2")
+      end
+    end
+
+    describe "keep if keep_empty_captures is true" do
+      config <<-CONFIG
+        filter {
+          grok {
+            match => { "message" => "1=%{WORD:foo1} *(2=%{WORD:foo2})?" }
+            keep_empty_captures => true
+          }
+        }
+      CONFIG
+
+      sample "1=test" do
+        insist { subject["tags"] }.nil?
+        # use .to_hash for this test, for now, because right now
+        # the Event.include? returns false for missing fields as well
+        # as for fields with nil values.
+        insist { subject.to_hash }.include?("foo1")
+        insist { subject.to_hash }.include?("foo2")
+      end
+    end
+  end
+
+  describe "when named_captures_only == false" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "Hello %{WORD}. %{WORD:foo}" }
+          named_captures_only => false
+          singles => true
+        }
+      }
+    CONFIG
+
+    sample "Hello World, yo!" do
+      insist { subject }.include?("WORD")
+      insist { subject["WORD"] } == "World"
+      insist { subject }.include?("foo")
+      insist { subject["foo"] } == "yo"
+    end
+  end
+
+  describe "using oniguruma named captures (?<name>regex)" do
+    context "plain regexp" do
+      config <<-'CONFIG'
+        filter {
+          grok {
+            singles => true
+            match => { "message" => "(?<foo>\w+)" }
+          }
+        }
+      CONFIG
+      sample "hello world" do
+        insist { subject["tags"] }.nil?
+        insist { subject["foo"] } == "hello"
+      end
+    end
+
+    context "grok patterns" do
+      config <<-'CONFIG'
+        filter {
+          grok {
+            singles => true
+            match => { "message" => "(?<timestamp>%{DATE_EU} %{TIME})" }
+          }
+        }
+      CONFIG
+
+      sample "fancy 12-12-12 12:12:12" do
+        insist { subject["tags"] }.nil?
+        insist { subject["timestamp"] } == "12-12-12 12:12:12"
+      end
+    end
+  end
+
+  describe "grok on integer types" do
+    config <<-'CONFIG'
+      filter {
+        grok {
+          match => { "status" => "^403$" }
+          add_tag => "four_oh_three"
+        }
+      }
+    CONFIG
+
+    sample("status" => 403) do
+      reject { subject["tags"] }.include?("_grokparsefailure")
+      insist { subject["tags"] }.include?("four_oh_three")
+    end
+  end
+
+  describe "grok on float types" do
+    config <<-'CONFIG'
+      filter {
+        grok {
+          match => { "version" => "^1.0$" }
+          add_tag => "one_point_oh"
+        }
+      }
+    CONFIG
+
+    sample("version" => 1.0) do
+      reject { subject["tags"] }.include?("_grokparsefailure")
+      insist { subject["tags"] }.include?("one_point_oh")
+    end
+  end
+
+  describe "grok on %{LOGLEVEL}" do
+    config <<-'CONFIG'
+      filter {
+        grok {
+          pattern => "%{LOGLEVEL:level}: error!"
+        }
+      }
+    CONFIG
+
+    log_level_names = %w(
+      trace Trace TRACE
+      debug Debug DEBUG
+      notice Notice NOTICE
+      info Info INFO
+      warn warning Warn Warning WARN WARNING
+      err error Err Error ERR ERROR
+      crit critical Crit Critical CRIT CRITICAL
+      fatal Fatal FATAL
+      severe Severe SEVERE
+      emerg emergency Emerg Emergency EMERG EMERGENCY
+    )
+    log_level_names.each do |level_name|
+      sample "#{level_name}: error!" do
+        insist { subject['level'] } == level_name
+      end
+    end
+  end
+
+  describe "tagging on failure" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "matchme %{NUMBER:fancy}" }
+          tag_on_failure => false
+        }
+      }
+    CONFIG
+
+    sample "matchme 1234" do
+      insist { subject["tags"] }.nil?
+    end
+
+    sample "this will not be matched" do
+      insist { subject["tags"] }.include?("false")
+    end
+  end
+
+  describe "captures named fields even if the whole text matches" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{DATE_EU:stimestamp}" }
+          singles => true
+        }
+      }
+    CONFIG
+
+    sample "11/01/01" do
+      insist { subject["stimestamp"] } == "11/01/01"
+    end
+  end
+
+  describe "allow dashes in capture names" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{WORD:foo-bar}" }
+          singles => true
+        }
+      }
+    CONFIG
+
+    sample "hello world" do
+      insist { subject["foo-bar"] } == "hello"
+    end
+  end
+
+  describe "performance test", :performance => true do
+    event_count = 100000
+    min_rate = 2000
+
+    max_duration = event_count / min_rate
+    input = "Nov 24 01:29:01 -0800"
+    config <<-CONFIG
+      input {
+        generator {
+          count => #{event_count}
+          message => "Mar 16 00:01:25 evita postfix/smtpd[1713]: connect from camomile.cloud9.net[168.100.1.3]"
+        }
+      }
+      filter {
+        grok {
+          match => { "message" => "%{SYSLOGLINE}" }
+          singles => true
+          overwrite => [ "message" ]
+        }
+      }
+      output { null { } }
+    CONFIG
+
+    2.times do
+      start = Time.now
+      agent do
+        duration = (Time.now - start)
+        puts "filters/grok parse rate: #{"%02.0f/sec" % (event_count / duration)}, elapsed: #{duration}s"
+        insist { duration } < max_duration
+      end
+    end
+  end
+
+  describe "singles with duplicate-named fields" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{INT:foo}|%{WORD:foo}" }
+          singles => true
+        }
+      }
+    CONFIG
+
+    sample "hello world" do
+      insist { subject["foo"] }.is_a?(String)
+    end
+
+    sample "123 world" do
+      insist { subject["foo"] }.is_a?(String)
+    end
+  end
+
+  describe "break_on_match default should be true and first match should exit filter" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{INT:foo}"
+                     "somefield" => "%{INT:bar}" }
+        }
+      }
+    CONFIG
+
+    sample("message" => "hello world 123", "somefield" => "testme abc 999") do
+      insist { subject["foo"] } == "123"
+      insist { subject["bar"] }.nil?
+    end
+  end
+
+  describe "break_on_match when set to false should try all patterns" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{INT:foo}"
+                     "somefield" => "%{INT:bar}" }
+          break_on_match => false
+        }
+      }
+    CONFIG
+
+    sample("message" => "hello world 123", "somefield" => "testme abc 999") do
+      insist { subject["foo"] } == "123"
+      insist { subject["bar"] } == "999"
+    end
+  end
+
+  describe "LOGSTASH-1547 - break_on_match should work on fields with multiple patterns" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => ["%{GREEDYDATA:name1}beard", "tree%{GREEDYDATA:name2}"] }
+          break_on_match => false
+        }
+      }
+    CONFIG
+
+    sample "treebranch" do
+      insist { subject["name2"] } == "branch"
+    end
+
+    sample "bushbeard" do
+      insist { subject["name1"] } == "bush"
+    end
+
+    sample "treebeard" do
+      insist { subject["name1"] } == "tree"
+      insist { subject["name2"] } == "beard"
+    end
+  end
+
+  describe "break_on_match default for array input with single grok pattern" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => "%{INT:foo}" }
+        }
+      }
+    CONFIG
+
+    # array input --
+    sample("message" => ["hello world 123", "line 23"]) do
+      insist { subject["foo"] } == ["123", "23"]
+      insist { subject["tags"] }.nil?
+    end
+
+    # array input, one of them matches
+    sample("message" => ["hello world 123", "abc"]) do
+      insist { subject["foo"] } == "123"
+      insist { subject["tags"] }.nil?
+    end
+  end
+
+  describe "break_on_match = true (default) for array input with multiple grok pattern" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => ["%{INT:foo}", "%{WORD:bar}"] }
+        }
+      }
+    CONFIG
+
+    # array input --
+    sample("message" => ["hello world 123", "line 23"]) do
+      insist { subject["foo"] } == ["123", "23"]
+      insist { subject["bar"] }.nil?
+      insist { subject["tags"] }.nil?
+    end
+
+    # array input, one of them matches
+    sample("message" => ["hello world", "line 23"]) do
+      insist { subject["bar"] } == "hello"
+      insist { subject["foo"] } == "23"
+      insist { subject["tags"] }.nil?
+    end
+  end
+
+  describe "break_on_match = false for array input with multiple grok pattern" do
+    config <<-CONFIG
+      filter {
+        grok {
+          match => { "message" => ["%{INT:foo}", "%{WORD:bar}"] }
+          break_on_match => false
+        }
+      }
+    CONFIG
+
+    # array input --
+    sample("message" => ["hello world 123", "line 23"]) do
+      insist { subject["foo"] } == ["123", "23"]
+      insist { subject["bar"] } == ["hello", "line"]
+      insist { subject["tags"] }.nil?
+    end
+
+    # array input, one of them matches
+    sample("message" => ["hello world", "line 23"]) do
+      insist { subject["bar"] } == ["hello", "line"]
+      insist { subject["foo"] } == "23"
+      insist { subject["tags"] }.nil?
+    end
+  end
+
+  describe "grok with unicode" do
+    config <<-CONFIG
+      filter {
+        grok {
+          #pattern => "<%{POSINT:syslog_pri}>%{SYSLOGTIMESTAMP:syslog_timestamp} %{SYSLOGHOST:syslog_hostname} %{PROG:syslog_program}(?:\[%{POSINT:syslog_pid}\])?: %{GREEDYDATA:syslog_message}"
+          pattern => "<%{POSINT:syslog_pri}>%{SPACE}%{SYSLOGTIMESTAMP:syslog_timestamp} %{SYSLOGHOST:syslog_hostname} %{PROG:syslog_program}(:?)(?:\\[%{GREEDYDATA:syslog_pid}\\])?(:?) %{GREEDYDATA:syslog_message}"
+        }
+      }
+    CONFIG
+
+    sample "<22>Jan 4 07:50:46 mailmaster postfix/policy-spf[9454]: : SPF permerror (Junk encountered in record 'v=spf1 mx a:mail.domain.no ip4:192.168.0.4 �all'): Envelope-from: email@domain.no" do
+      insist { subject["tags"] }.nil?
+      insist { subject["syslog_pri"] } == "22"
+      insist { subject["syslog_program"] } == "postfix/policy-spf"
+    end
+  end
+
+
+end
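
The assertions throughout the spec use the insist gem rather than RSpec's expect syntax; a minimal sketch of the three forms used above:

# Sketch of the insist/reject assertion style used in the spec above.
require "insist"

insist { 1 + 1 } == 2             # equality: raises Insist::Failure on mismatch
insist { "grok" }.is_a?(String)   # predicate: passes if the block result satisfies it
reject { [1, 2] }.include?(3)     # negated predicate: passes if it does NOT hold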
metadata
ADDED
@@ -0,0 +1,104 @@
+--- !ruby/object:Gem::Specification
+name: logstash-filter-grok
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Elasticsearch
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-11-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: logstash
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+    - - <
+      - !ruby/object:Gem::Version
+        version: 2.0.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+    - - <
+      - !ruby/object:Gem::Version
+        version: 2.0.0
+- !ruby/object:Gem::Dependency
+  name: jls-grok
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.11.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.11.0
+- !ruby/object:Gem::Dependency
+  name: logstash-patterns-core
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Grok is currently the best way in logstash to parse crappy unstructured
+  log data into something structured and queryable.
+email: richard.pijnenburg@elasticsearch.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE
+- Rakefile
+- lib/logstash/filters/grok.rb
+- logstash-filter-grok.gemspec
+- rakelib/publish.rake
+- rakelib/vendor.rake
+- spec/filters/grok_spec.rb
+homepage: http://logstash.net/
+licenses:
+- Apache License (2.0)
+metadata:
+  logstash_plugin: 'true'
+  group: filter
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.1
+signing_key:
+specification_version: 4
+summary: Parse arbitrary text and structure it.
+test_files:
+- spec/filters/grok_spec.rb