RubyGems - logstash-codec-multiline - Versions diffs - 0.1.0 - Mend

logstash-codec-multiline 0.1.0

Files changed (11) hide show

checksums.yaml +15 -0
data/.gitignore +3 -0
data/Gemfile +4 -0
data/LICENSE +13 -0
data/Rakefile +6 -0
data/lib/logstash/codecs/multiline.rb +195 -0
data/logstash-codec-multiline.gemspec +29 -0
data/rakelib/publish.rake +9 -0
data/rakelib/vendor.rake +169 -0
data/spec/codecs/multiline_spec.rb +160 -0
metadata +105 -0

checksums.yaml ADDED

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    ZmMxYzRlNDc3MDQzYTVkYmE4MjY5ZTgxMDAzZWE5ZjdkNWY0NGM0OQ==
+  data.tar.gz: !binary |-
+    ODZiZjhhMTIxNGYyMTkzNmRlZjM4MzM5NTJjM2I4YzBlMDE5NzczOQ==
+SHA512:
+  metadata.gz: !binary |-
+    NTZiZjFkMDBjOGZjNWJjNDlkYjhmNzU3NGRlMmY3MDlhNWMwOWY2OGU0YjE3
+    ZWU2MTBhY2IzM2ZkYzBjNmI0NDA4ZDU2ZGJiOTBmMGRiMzVlOGI4ZjQxMTkz
+    ZjkwZDQ5NmQ1Mjg5NmIxOGRkNWI2NTA1Y2FiYTdmZWMzZjhlZGE=
+  data.tar.gz: !binary |-
+    NGE2YzM4ZmUwOTAxZjRkOTA5Y2IyYjk1MTcyMzE5YTFiY2FhNjU4Zjk5ZWFl
+    ZWUxZDEyNjU1YTczOTY2ODM4MWRhNDMxM2RkZjBhNTA5OGYxYjQwNWYwNmEw
+    NDdmZTQ4OWIyNjZlYTRjMzUwZWJkZGI5Yjg4YWY2MTUxYjVlZTc=

data/.gitignore ADDED

@@ -0,0 +1,3 @@
+*.gem
+Gemfile.lock
+.bundle

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source 'http://rubygems.org'
+gem 'rake'
+gem 'gem_publisher'
+gem 'archive-tar-minitar'

data/LICENSE ADDED

@@ -0,0 +1,13 @@
+Copyright (c) 2012-2014 Elasticsearch <http://www.elasticsearch.org>
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

data/Rakefile ADDED

@@ -0,0 +1,6 @@
+# Vendor file descriptors; iterated by the "vendor" task in rakelib/vendor.rake.
+@files = []
+# Running rake without a task name prints the task list.
+task(:default) { system("rake -T") }

data/lib/logstash/codecs/multiline.rb ADDED

@@ -0,0 +1,195 @@
+# encoding: utf-8
+require "logstash/codecs/base"
+require "logstash/util/charset"
+require "logstash/timestamp"
+# The multiline codec will collapse multiline messages and merge them into a
+# single event.
+#
+# The original goal of this codec was to allow joining of multiline messages
+# from files into a single event. For example, joining Java exception and
+# stacktrace messages into a single event.
+#
+# The config looks like this:
+#
+#     input {
+#       stdin {
+#         codec => multiline {
+#           pattern => "pattern, a regexp"
+#           negate => "true" or "false"
+#           what => "previous" or "next"
+#         }
+#       }
+#     }
+#
+# The `pattern` should match what you believe to be an indicator that the field
+# is part of a multi-line event.
+#
+# The `what` must be "previous" or "next" and indicates the relation
+# to the multi-line event.
+#
+# The `negate` can be "true" or "false" (defaults to "false"). If "true", a
+# message not matching the pattern will constitute a match of the multiline
+# codec and the `what` will be applied. (vice-versa is also true)
+#
+# For example, Java stack traces are multiline and usually have the message
+# starting at the far-left, with each subsequent line indented. Do this:
+#
+#     input {
+#       stdin {
+#         codec => multiline {
+#           pattern => "^\s"
+#           what => "previous"
+#         }
+#       }
+#     }
+#
+# This says that any line starting with whitespace belongs to the previous line.
+#
+# Another example is to merge lines not starting with a date up to the previous
+# line:
+#
+#     input {
+#       file {
+#         path => "/var/log/someapp.log"
+#         codec => multiline {
+#           # Grok pattern names are valid! :)
+#           pattern => "^%{TIMESTAMP_ISO8601} "
+#           negate => true
+#           what => previous
+#         }
+#       }
+#     }
+#
+# This says that any line not starting with a timestamp should be merged with the previous line.
+#
+# One more common example is C line continuations (backslash). Here's how to do that:
+#
+#     input {
+#       stdin {
+#         codec => multiline {
+#           pattern => "\\$"
+#           what => "next"
+#         }
+#       }
+#     }
+#
+# This says that any line ending with a backslash should be combined with the
+# following line.
+#
+class LogStash::Codecs::Multiline < LogStash::Codecs::Base
+  config_name "multiline"
+  milestone 3
+  # The regular expression to match.
+  config :pattern, :validate => :string, :required => true
+  # If the pattern matched, does event belong to the next or previous event?
+  config :what, :validate => ["previous", "next"], :required => true
+  # Negate the regexp pattern ('if not matched').
+  config :negate, :validate => :boolean, :default => false
+  # Logstash ships by default with a bunch of patterns, so you don't
+  # necessarily need to define this yourself unless you are adding additional
+  # patterns.
+  #
+  # Pattern files are plain text with format:
+  #
+  #     NAME PATTERN
+  #
+  # For example:
+  #
+  #     NUMBER \d+
+  config :patterns_dir, :validate => :array, :default => []
+  # The character encoding used in this input. Examples include "UTF-8"
+  # and "cp1252"
+  #
+  # This setting is useful if your log files are in Latin-1 (aka cp1252)
+  # or in another character set other than UTF-8.
+  #
+  # This only affects "plain" format logs since JSON is UTF-8 already.
+  config :charset, :validate => ::Encoding.name_list, :default => "UTF-8"
+  # Tag multiline events with a given tag. This tag will only be added
+  # to events that actually have multiple lines in them.
+  config :multiline_tag, :validate => :string, :default => "multiline"
+  public
+  def register
+    require "grok-pure" # rubygem 'jls-grok'
+    require 'logstash/patterns/core'
+    # Detect if we are running from a jarfile, pick the right path.
+    patterns_path = []
+    patterns_path += [LogStash::Patterns::Core.path]
+    @grok = Grok.new
+    @patterns_dir = patterns_path.to_a + @patterns_dir
+    @patterns_dir.each do |path|
+      if File.directory?(path)
+        path = File.join(path, "*")
+      end
+      Dir.glob(path).each do |file|
+        @logger.info("Grok loading patterns from file", :path => file)
+        @grok.add_patterns_from_file(file)
+      end
+    end
+    @grok.compile(@pattern)
+    @logger.debug("Registered multiline plugin", :type => @type, :config => @config)
+    @buffer = []
+    @handler = method("do_#{@what}".to_sym)
+    @converter = LogStash::Util::Charset.new(@charset)
+    @converter.logger = @logger
+  end # def register
+  public
+  def decode(text, &block)
+    text = @converter.convert(text)
+    match = @grok.match(text)
+    @logger.debug("Multiline", :pattern => @pattern, :text => text,
+                  :match => !match.nil?, :negate => @negate)
+    # Add negate option
+    match = (match and !@negate) || (!match and @negate)
+    @handler.call(text, match, &block)
+  end # def decode
+  def buffer(text)
+    @time = LogStash::Timestamp.now if @buffer.empty?
+    @buffer << text
+  end
+  def flush(&block)
+    if @buffer.any?
+      event = LogStash::Event.new(LogStash::Event::TIMESTAMP => @time, "message" => @buffer.join(NL))
+      # Tag multiline events
+      event.tag @multiline_tag if @multiline_tag && @buffer.size > 1
+      yield event
+      @buffer = []
+    end
+  end
+  def do_next(text, matched, &block)
+    buffer(text)
+    flush(&block) if !matched
+  end
+  def do_previous(text, matched, &block)
+    flush(&block) if !matched
+    buffer(text)
+  end
+  public
+  def encode(event)
+    # Nothing to do.
+    @on_event.call(event)
+  end # def encode
+end # class LogStash::Codecs::Plain

data/logstash-codec-multiline.gemspec ADDED

@@ -0,0 +1,29 @@
+Gem::Specification.new do |s|
+  s.name            = 'logstash-codec-multiline'
+  s.version         = '0.1.0'
+  s.licenses        = ['Apache License (2.0)']
+  s.summary         = "The multiline codec will collapse multiline messages and merge them into a single event."
+  s.description     = "The multiline codec will collapse multiline messages and merge them into a single event."
+  s.authors         = ["Elasticsearch"]
+  s.email           = 'richard.pijnenburg@elasticsearch.com'
+  s.homepage        = "http://logstash.net/"
+  s.require_paths = ["lib"]
+  # Files
+  s.files = `git ls-files`.split($\)
+  # Tests
+  s.test_files = s.files.grep(%r{^(test|spec|features)/})
+  # Special flag to let us know this is actually a logstash plugin
+  s.metadata = { "logstash_plugin" => "true", "group" => "codec" }
+  # Gem dependencies
+  s.add_runtime_dependency 'logstash', '>= 1.4.0', '< 2.0.0'
+  s.add_runtime_dependency 'logstash-patterns-core'
+  s.add_runtime_dependency 'jls-grok', [ '0.11.0' ]
+end

data/rakelib/publish.rake ADDED

@@ -0,0 +1,9 @@
+require "gem_publisher"
+desc "Publish gem to RubyGems.org"
+task :publish_gem do |t|
+  gem_file = Dir.glob(File.expand_path('../*.gemspec',File.dirname(__FILE__))).first
+  gem = GemPublisher.publish_if_updated(gem_file, :rubygems)
+  puts "Published #{gem}" if gem
+end

data/rakelib/vendor.rake ADDED

@@ -0,0 +1,169 @@
+require "net/http"
+require "uri"
+require "digest/sha1"
+def vendor(*args)
+  return File.join("vendor", *args)
+end
+directory "vendor/" => ["vendor"] do |task, args|
+  mkdir task.name
+end
+def fetch(url, sha1, output)
+  puts "Downloading #{url}"
+  actual_sha1 = download(url, output)
+  if actual_sha1 != sha1
+    fail "SHA1 does not match (expected '#{sha1}' but got '#{actual_sha1}')"
+  end
+end # def fetch
+def file_fetch(url, sha1)
+  filename = File.basename( URI(url).path )
+  output = "vendor/#{filename}"
+  task output => [ "vendor/" ] do
+    begin
+      actual_sha1 = file_sha1(output)
+      if actual_sha1 != sha1
+        fetch(url, sha1, output)
+      end
+    rescue Errno::ENOENT
+      fetch(url, sha1, output)
+    end
+  end.invoke
+  return output
+end
+def file_sha1(path)
+  digest = Digest::SHA1.new
+  fd = File.new(path, "r")
+  while true
+    begin
+      digest << fd.sysread(16384)
+    rescue EOFError
+      break
+    end
+  end
+  return digest.hexdigest
+ensure
+  fd.close if fd
+end
+def download(url, output)
+  uri = URI(url)
+  digest = Digest::SHA1.new
+  tmp = "#{output}.tmp"
+  Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == "https")) do |http|
+    request = Net::HTTP::Get.new(uri.path)
+    http.request(request) do |response|
+      fail "HTTP fetch failed for #{url}. #{response}" if [200, 301].include?(response.code)
+      size = (response["content-length"].to_i || -1).to_f
+      count = 0
+      File.open(tmp, "w") do |fd|
+        response.read_body do |chunk|
+          fd.write(chunk)
+          digest << chunk
+          if size > 0 && $stdout.tty?
+            count += chunk.bytesize
+            $stdout.write(sprintf("\r%0.2f%%", count/size * 100))
+          end
+        end
+      end
+      $stdout.write("\r      \r") if $stdout.tty?
+    end
+  end
+  File.rename(tmp, output)
+  return digest.hexdigest
+rescue SocketError => e
+  puts "Failure while downloading #{url}: #{e}"
+  raise
+ensure
+  File.unlink(tmp) if File.exist?(tmp)
+end # def download
+def untar(tarball, &block)
+  require "archive/tar/minitar"
+  tgz = Zlib::GzipReader.new(File.open(tarball))
+  # Pull out typesdb
+  tar = Archive::Tar::Minitar::Input.open(tgz)
+  tar.each do |entry|
+    path = block.call(entry)
+    next if path.nil?
+    parent = File.dirname(path)
+    mkdir_p parent unless File.directory?(parent)
+    # Skip this file if the output file is the same size
+    if entry.directory?
+      mkdir path unless File.directory?(path)
+    else
+      entry_mode = entry.instance_eval { @mode } & 0777
+      if File.exists?(path)
+        stat = File.stat(path)
+        # TODO(sissel): Submit a patch to archive-tar-minitar upstream to
+        # expose headers in the entry.
+        entry_size = entry.instance_eval { @size }
+        # If file sizes are same, skip writing.
+        next if stat.size == entry_size && (stat.mode & 0777) == entry_mode
+      end
+      puts "Extracting #{entry.full_name} from #{tarball} #{entry_mode.to_s(8)}"
+      File.open(path, "w") do |fd|
+        # eof? check lets us skip empty files. Necessary because the API provided by
+        # Archive::Tar::Minitar::Reader::EntryStream only mostly acts like an
+        # IO object. Something about empty files in this EntryStream causes
+        # IO.copy_stream to throw "can't convert nil into String" on JRuby
+        # TODO(sissel): File a bug about this.
+        while !entry.eof?
+          chunk = entry.read(16384)
+          fd.write(chunk)
+        end
+          #IO.copy_stream(entry, fd)
+      end
+      File.chmod(entry_mode, path)
+    end
+  end
+  tar.close
+  File.unlink(tarball) if File.file?(tarball)
+end # def untar
+def ungz(file)
+  outpath = file.gsub('.gz', '')
+  tgz = Zlib::GzipReader.new(File.open(file))
+  begin
+    File.open(outpath, "w") do |out|
+      IO::copy_stream(tgz, out)
+    end
+    File.unlink(file)
+  rescue
+    File.unlink(outpath) if File.file?(outpath)
+   raise
+  end
+  tgz.close
+end
+desc "Process any vendor files required for this plugin"
+task "vendor" do |task, args|
+  @files.each do |file|
+    download = file_fetch(file['url'], file['sha1'])
+    if download =~ /.tar.gz/
+      prefix = download.gsub('.tar.gz', '').gsub('vendor/', '')
+      untar(download) do |entry|
+        if !file['files'].nil?
+          next unless file['files'].include?(entry.full_name.gsub(prefix, ''))
+          out = entry.full_name.split("/").last
+        end
+        File.join('vendor', out)
+      end
+    elsif download =~ /.gz/
+      ungz(download)
+    end
+  end
+end

data/spec/codecs/multiline_spec.rb ADDED

@@ -0,0 +1,160 @@
+# encoding: utf-8
+require "logstash/codecs/multiline"
+require "logstash/event"
+require "insist"
+# Specs for the multiline codec's #decode: joining of lines, tagging of
+# multi-line events, grok patterns, and charset conversion.
+describe LogStash::Codecs::Multiline do
+  context "#decode" do
+    it "should be able to handle multiline events with additional lines space-indented" do
+      codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
+      lines = [ "hello world", "   second line", "another first line" ]
+      events = []
+      lines.each do |line|
+        codec.decode(line) do |event|
+          events << event
+        end
+      end
+      # Emit whatever is still buffered after the last line.
+      codec.flush { |e| events << e }
+      insist { events.size } == 2
+      insist { events[0]["message"] } == "hello world\n   second line"
+      insist { events[0]["tags"] }.include?("multiline")
+      insist { events[1]["message"] } == "another first line"
+      insist { events[1]["tags"] }.nil?
+    end
+    it "should allow custom tag added to multiline events" do
+      codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous", "multiline_tag" => "hurray" )
+      lines = [ "hello world", "   second line", "another first line" ]
+      events = []
+      lines.each do |line|
+        codec.decode(line) do |event|
+          events << event
+        end
+      end
+      codec.flush { |e| events << e }
+      insist { events.size } == 2
+      insist { events[0]["tags"] }.include?("hurray")
+      insist { events[1]["tags"] }.nil?
+    end
+    it "should allow grok patterns to be used" do
+      codec = LogStash::Codecs::Multiline.new(
+        "pattern" => "^%{NUMBER} %{TIME}",
+        "negate" => true,
+        "what" => "previous"
+      )
+      lines = [ "120913 12:04:33 first line", "second line", "third line" ]
+      events = []
+      lines.each do |line|
+        codec.decode(line) do |event|
+          events << event
+        end
+      end
+      codec.flush { |e| events << e }
+      # With negate, lines NOT starting with the pattern join the previous one.
+      insist { events.size } == 1
+      insist { events.first["message"] } == lines.join("\n")
+    end
+    context "using default UTF-8 charset" do
+      it "should decode valid UTF-8 input" do
+        codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
+        lines = [ "foobar", "κόσμε" ]
+        events = []
+        lines.each do |line|
+          insist { line.encoding.name } == "UTF-8"
+          insist { line.valid_encoding? } == true
+          codec.decode(line) { |event| events << event }
+        end
+        codec.flush { |e| events << e }
+        insist { events.size } == 2
+        events.zip(lines).each do |tuple|
+          insist { tuple[0]["message"] } == tuple[1]
+          insist { tuple[0]["message"].encoding.name } == "UTF-8"
+        end
+      end
+      it "should escape invalid sequences" do
+        codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
+        lines = [ "foo \xED\xB9\x81\xC3", "bar \xAD" ]
+        events = []
+        lines.each do |line|
+          insist { line.encoding.name } == "UTF-8"
+          insist { line.valid_encoding? } == false
+          codec.decode(line) { |event| events << event }
+        end
+        codec.flush { |e| events << e }
+        insist { events.size } == 2
+        events.zip(lines).each do |tuple|
+          # Expected message is the #inspect form of the raw line without the
+          # surrounding quotes.
+          insist { tuple[0]["message"] } == tuple[1].inspect[1..-2]
+          insist { tuple[0]["message"].encoding.name } == "UTF-8"
+        end
+      end
+    end
+    context "with valid non UTF-8 source encoding" do
+      it "should encode to UTF-8" do
+        codec = LogStash::Codecs::Multiline.new("charset" => "ISO-8859-1", "pattern" => "^\\s", "what" => "previous")
+        # Each sample is [raw input bytes, expected UTF-8 message].
+        samples = [
+          ["foobar", "foobar"],
+          ["\xE0 Montr\xE9al", "à Montréal"],
+        ]
+        # lines = [ "foo \xED\xB9\x81\xC3", "bar \xAD" ]
+        events = []
+        samples.map{|(a, b)| a.force_encoding("ISO-8859-1")}.each do |line|
+          insist { line.encoding.name } == "ISO-8859-1"
+          insist { line.valid_encoding? } == true
+          codec.decode(line) { |event| events << event }
+        end
+        codec.flush { |e| events << e }
+        insist { events.size } == 2
+        events.zip(samples.map{|(a, b)| b}).each do |tuple|
+          insist { tuple[1].encoding.name } == "UTF-8"
+          insist { tuple[0]["message"] } == tuple[1]
+          insist { tuple[0]["message"].encoding.name } == "UTF-8"
+        end
+      end
+    end
+    context "with invalid non UTF-8 source encoding" do
+     it "should encode to UTF-8" do
+        codec = LogStash::Codecs::Multiline.new("charset" => "ASCII-8BIT", "pattern" => "^\\s", "what" => "previous")
+        # Each sample is [raw input bytes, expected UTF-8 message]; every
+        # non-ASCII input byte is expected to become one replacement character.
+        samples = [
+          ["\xE0 Montr\xE9al", "� Montr�al"],
+          ["\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5", "����������"],
+        ]
+        events = []
+        samples.map{|(a, b)| a.force_encoding("ASCII-8BIT")}.each do |line|
+          insist { line.encoding.name } == "ASCII-8BIT"
+          insist { line.valid_encoding? } == true
+          codec.decode(line) { |event| events << event }
+        end
+        codec.flush { |e| events << e }
+        insist { events.size } == 2
+        events.zip(samples.map{|(a, b)| b}).each do |tuple|
+          insist { tuple[1].encoding.name } == "UTF-8"
+          insist { tuple[0]["message"] } == tuple[1]
+          insist { tuple[0]["message"].encoding.name } == "UTF-8"
+        end
+      end
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,105 @@
+--- !ruby/object:Gem::Specification
+name: logstash-codec-multiline
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Elasticsearch
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-11-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: logstash
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+    - - <
+      - !ruby/object:Gem::Version
+        version: 2.0.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+    - - <
+      - !ruby/object:Gem::Version
+        version: 2.0.0
+- !ruby/object:Gem::Dependency
+  name: logstash-patterns-core
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: jls-grok
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.11.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.11.0
+description: The multiline codec will collapse multiline messages and merge them into
+  a single event.
+email: richard.pijnenburg@elasticsearch.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE
+- Rakefile
+- lib/logstash/codecs/multiline.rb
+- logstash-codec-multiline.gemspec
+- rakelib/publish.rake
+- rakelib/vendor.rake
+- spec/codecs/multiline_spec.rb
+homepage: http://logstash.net/
+licenses:
+- Apache License (2.0)
+metadata:
+  logstash_plugin: 'true'
+  group: codec
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.1
+signing_key:
+specification_version: 4
+summary: The multiline codec will collapse multiline messages and merge them into
+  a single event.
+test_files:
+- spec/codecs/multiline_spec.rb