RubyGems - logstash-codec-multiline - Versions diffs - 0.1.0 - Mend

logstash-codec-multiline 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml +15 -0
data/.gitignore +3 -0
data/Gemfile +4 -0
data/LICENSE +13 -0
data/Rakefile +6 -0
data/lib/logstash/codecs/multiline.rb +195 -0
data/logstash-codec-multiline.gemspec +29 -0
data/rakelib/publish.rake +9 -0
data/rakelib/vendor.rake +169 -0
data/spec/codecs/multiline_spec.rb +160 -0
metadata +105 -0

checksums.yaml ADDED

@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    ZmMxYzRlNDc3MDQzYTVkYmE4MjY5ZTgxMDAzZWE5ZjdkNWY0NGM0OQ==
+  data.tar.gz: !binary |-
+    ODZiZjhhMTIxNGYyMTkzNmRlZjM4MzM5NTJjM2I4YzBlMDE5NzczOQ==
+SHA512:
+  metadata.gz: !binary |-
+    NTZiZjFkMDBjOGZjNWJjNDlkYjhmNzU3NGRlMmY3MDlhNWMwOWY2OGU0YjE3
+    ZWU2MTBhY2IzM2ZkYzBjNmI0NDA4ZDU2ZGJiOTBmMGRiMzVlOGI4ZjQxMTkz
+    ZjkwZDQ5NmQ1Mjg5NmIxOGRkNWI2NTA1Y2FiYTdmZWMzZjhlZGE=
+  data.tar.gz: !binary |-
+    NGE2YzM4ZmUwOTAxZjRkOTA5Y2IyYjk1MTcyMzE5YTFiY2FhNjU4Zjk5ZWFl
+    ZWUxZDEyNjU1YTczOTY2ODM4MWRhNDMxM2RkZjBhNTA5OGYxYjQwNWYwNmEw
+    NDdmZTQ4OWIyNjZlYTRjMzUwZWJkZGI5Yjg4YWY2MTUxYjVlZTc=

data/.gitignore ADDED

@@ -0,0 +1,3 @@
+*.gem
+Gemfile.lock
+.bundle

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+# Development dependencies for building, vendoring and publishing this plugin.
+# Use the HTTPS endpoint so gems are fetched over TLS rather than plain HTTP.
+source 'https://rubygems.org'
+gem 'rake'
+# Required by rakelib/publish.rake.
+gem 'gem_publisher'
+# Provides "archive/tar/minitar", used by the untar helper in rakelib/vendor.rake.
+gem 'archive-tar-minitar'

data/LICENSE ADDED

@@ -0,0 +1,13 @@
+Copyright (c) 2012-2014 Elasticsearch <http://www.elasticsearch.org>
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

data/Rakefile ADDED

@@ -0,0 +1,6 @@
+# Vendor file descriptors; the "vendor" task in rakelib/vendor.rake iterates
+# over this list (it is empty here, so that task has nothing to fetch).
+@files = []
+# Running rake with no task name shells out to `rake -T`.
+task(:default) { system("rake -T") }

data/lib/logstash/codecs/multiline.rb ADDED

@@ -0,0 +1,195 @@
+# encoding: utf-8
+require "logstash/codecs/base"
+require "logstash/util/charset"
+require "logstash/timestamp"
+# The multiline codec will collapse multiline messages and merge them into a
+# single event.
+#
+# The original goal of this codec was to allow joining of multiline messages
+# from files into a single event. For example, joining Java exception and
+# stacktrace messages into a single event.
+#
+# The config looks like this:
+#
+#     input {
+#       stdin {
+#         codec => multiline {
+#           pattern => "pattern, a regexp"
+#           negate => "true" or "false"
+#           what => "previous" or "next"
+#         }
+#       }
+#     }
+#
+# The `pattern` should match what you believe to be an indicator that the field
+# is part of a multi-line event.
+#
+# The `what` must be "previous" or "next" and indicates the relation
+# to the multi-line event.
+#
+# The `negate` can be "true" or "false" (defaults to "false"). If "true", a
+# message not matching the pattern will constitute a match of the multiline
+# codec and the `what` will be applied. (vice-versa is also true)
+#
+# For example, Java stack traces are multiline and usually have the message
+# starting at the far-left, with each subsequent line indented. Do this:
+#
+#     input {
+#       stdin {
+#         codec => multiline {
+#           pattern => "^\s"
+#           what => "previous"
+#         }
+#       }
+#     }
+#
+# This says that any line starting with whitespace belongs to the previous line.
+#
+# Another example is to merge lines not starting with a date up to the previous
+# line:
+#
+#     input {
+#       file {
+#         path => "/var/log/someapp.log"
+#         codec => multiline {
+#           # Grok pattern names are valid! :)
+#           pattern => "^%{TIMESTAMP_ISO8601} "
+#           negate => true
+#           what => previous
+#         }
+#       }
+#     }
+#
+# This says that any line not starting with a timestamp should be merged with the previous line.
+#
+# One more common example is C line continuations (backslash). Here's how to do that:
+#
+#     input {
+#       stdin {
+#         codec => multiline {
+#           pattern => "\\$"
+#           what => "next"
+#         }
+#       }
+#     }
+#
+# This says that any line ending with a backslash should be combined with the
+# following line.
+#
+class LogStash::Codecs::Multiline < LogStash::Codecs::Base
+  config_name "multiline"
+  milestone 3
+  # The regular expression to match.
+  config :pattern, :validate => :string, :required => true
+  # If the pattern matched, does event belong to the next or previous event?
+  config :what, :validate => ["previous", "next"], :required => true
+  # Negate the regexp pattern ('if not matched').
+  config :negate, :validate => :boolean, :default => false
+  # Logstash ships by default with a bunch of patterns, so you don't
+  # necessarily need to define this yourself unless you are adding additional
+  # patterns.
+  #
+  # Pattern files are plain text with format:
+  #
+  #     NAME PATTERN
+  #
+  # For example:
+  #
+  #     NUMBER \d+
+  config :patterns_dir, :validate => :array, :default => []
+  # The character encoding used in this input. Examples include "UTF-8"
+  # and "cp1252"
+  #
+  # This setting is useful if your log files are in Latin-1 (aka cp1252)
+  # or in another character set other than UTF-8.
+  #
+  # This only affects "plain" format logs since JSON is UTF-8 already.
+  config :charset, :validate => ::Encoding.name_list, :default => "UTF-8"
+  # Tag multiline events with a given tag. This tag will only be added
+  # to events that actually have multiple lines in them.
+  config :multiline_tag, :validate => :string, :default => "multiline"
+  public
+  # Loads the grok pattern files, compiles `pattern`, and sets up the line
+  # buffer, the handler selected by `what`, and the charset converter.
+  def register
+    # Loaded lazily, only when the codec is actually registered.
+    require "grok-pure" # rubygem 'jls-grok'
+    require 'logstash/patterns/core'
+    @grok = Grok.new
+    # The core pattern path comes first, then any user-supplied entries.
+    @patterns_dir = [LogStash::Patterns::Core.path] + @patterns_dir
+    @patterns_dir.each do |path|
+      # A directory means "every file inside it"; anything else is globbed as given.
+      path = File.join(path, "*") if File.directory?(path)
+      Dir.glob(path).each do |file|
+        @logger.info("Grok loading patterns from file", :path => file)
+        @grok.add_patterns_from_file(file)
+      end
+    end
+    @grok.compile(@pattern)
+    @logger.debug("Registered multiline plugin", :type => @type, :config => @config)
+    @buffer = []
+    # Resolves to do_previous or do_next depending on the `what` setting.
+    @handler = method("do_#{@what}".to_sym)
+    @converter = LogStash::Util::Charset.new(@charset)
+    @converter.logger = @logger
+  end # def register
+  # Converts one line of input through the charset converter, matches it
+  # against the compiled pattern, and passes it to the `what` handler. The
+  # given block receives each completed event.
+  def decode(text, &block)
+    text = @converter.convert(text)
+    match = @grok.match(text)
+    # Log by truthiness rather than `nil?`, so that a `false` no-match result
+    # is not reported as a match; this is the same test applied just below.
+    @logger.debug("Multiline", :pattern => @pattern, :text => text,
+                  :match => !!match, :negate => @negate)
+    # Apply `negate`: with it set, a miss counts as a hit and vice versa.
+    matched = match ? !@negate : @negate
+    @handler.call(text, matched, &block)
+  end # def decode
+  # Appends a line to the pending event. The event timestamp is taken when
+  # the first line of a new event is buffered.
+  def buffer(text)
+    @time = LogStash::Timestamp.now if @buffer.empty?
+    @buffer << text
+  end
+  # Yields the buffered lines as a single event (joined with NL) and resets
+  # the buffer. Does nothing when the buffer is empty.
+  def flush(&block)
+    return unless @buffer.any?
+    event = LogStash::Event.new(LogStash::Event::TIMESTAMP => @time, "message" => @buffer.join(NL))
+    # Tag only events that really span more than one line.
+    event.tag @multiline_tag if @multiline_tag && @buffer.size > 1
+    yield event
+    @buffer = []
+  end
+  # what => "next": the line joins the pending event; a non-matching line
+  # ends that event.
+  def do_next(text, matched, &block)
+    buffer(text)
+    flush(&block) unless matched
+  end
+  # what => "previous": a non-matching line starts a new event, so the
+  # pending one is emitted before the line is buffered.
+  def do_previous(text, matched, &block)
+    flush(&block) unless matched
+    buffer(text)
+  end
+  # Hands the event to the on_event callback unchanged.
+  def encode(event)
+    # Nothing to do.
+    @on_event.call(event)
+  end # def encode
+end # class LogStash::Codecs::Multiline

data/logstash-codec-multiline.gemspec ADDED

@@ -0,0 +1,29 @@
+# Gem specification for the logstash multiline codec plugin.
+Gem::Specification.new do |s|
+  s.name            = 'logstash-codec-multiline'
+  s.version         = '0.1.0'
+  s.licenses        = ['Apache License (2.0)']
+  s.summary         = "The multiline codec will collapse multiline messages and merge them into a single event."
+  s.description     = "The multiline codec will collapse multiline messages and merge them into a single event."
+  s.authors         = ["Elasticsearch"]
+  s.email           = 'richard.pijnenburg@elasticsearch.com'
+  s.homepage        = "http://logstash.net/"
+  s.require_paths = ["lib"]
+  # Files
+  # Split on the input record separator ($/, a newline by default). The output
+  # record separator ($\) is nil by default, which makes split break on any
+  # whitespace and so mangles file names that contain spaces.
+  s.files = `git ls-files`.split($/)
+  # Tests
+  s.test_files = s.files.grep(%r{^(test|spec|features)/})
+  # Special flag to let us know this is actually a logstash plugin
+  s.metadata = { "logstash_plugin" => "true", "group" => "codec" }
+  # Gem dependencies
+  s.add_runtime_dependency 'logstash', '>= 1.4.0', '< 2.0.0'
+  s.add_runtime_dependency 'logstash-patterns-core'
+  s.add_runtime_dependency 'jls-grok', [ '0.11.0' ]
+end

data/rakelib/publish.rake ADDED

@@ -0,0 +1,9 @@
+require "gem_publisher"
+# Publishes the gem described by the first *.gemspec found one directory above
+# this rake file; GemPublisher decides whether there is anything to publish.
+desc "Publish gem to RubyGems.org"
+task :publish_gem do
+  gemspec_glob = File.expand_path('../*.gemspec', File.dirname(__FILE__))
+  gemspec_path = Dir.glob(gemspec_glob).first
+  published = GemPublisher.publish_if_updated(gemspec_path, :rubygems)
+  puts "Published #{published}" if published
+end

data/rakelib/vendor.rake ADDED

@@ -0,0 +1,169 @@
+require "net/http"
+require "uri"
+require "digest/sha1"
+# Joins the given path components underneath the "vendor" directory.
+def vendor(*args)
+  File.join("vendor", *args)
+end
+# Directory task for "vendor/", declared with the "vendor" task as its
+# prerequisite; the block creates the directory named by the task.
+directory "vendor/" => ["vendor"] do |dir_task, _args|
+  mkdir dir_task.name
+end
+# Downloads `url` to `output` and fails unless the SHA1 of the downloaded
+# bytes equals the expected `sha1`.
+def fetch(url, sha1, output)
+  puts "Downloading #{url}"
+  downloaded_sha1 = download(url, output)
+  return if downloaded_sha1 == sha1
+  fail "SHA1 does not match (expected '#{sha1}' but got '#{downloaded_sha1}')"
+end # def fetch
+# Ensures vendor/<basename of url> exists with the expected SHA1 and returns
+# that path. The work is wrapped in a rake task (depending on "vendor/") that
+# is invoked immediately.
+def file_fetch(url, sha1)
+  output = "vendor/#{File.basename(URI(url).path)}"
+  fetch_task = task output => [ "vendor/" ] do
+    begin
+      # Re-download only when the existing file has the wrong checksum.
+      fetch(url, sha1, output) unless file_sha1(output) == sha1
+    rescue Errno::ENOENT
+      # Nothing on disk yet.
+      fetch(url, sha1, output)
+    end
+  end
+  fetch_task.invoke
+  output
+end
+# Returns the SHA1 hex digest of the file at `path`, read in 16 KiB chunks.
+# Errors from opening the file (e.g. Errno::ENOENT) propagate to the caller.
+def file_sha1(path)
+  hasher = Digest::SHA1.new
+  File.open(path, "r") do |io|
+    begin
+      loop { hasher << io.sysread(16384) }
+    rescue EOFError
+      # sysread signals end of file by raising; nothing left to hash.
+    end
+  end
+  hasher.hexdigest
+end
+# Streams `url` into `output` and returns the SHA1 hex digest of the bytes
+# received.
+#
+# The body is written to "<output>.tmp" first and renamed onto `output` only
+# after the transfer completes, so a failed transfer does not leave a partial
+# file at `output`. Redirects are not followed.
+#
+# Raises RuntimeError when the server does not answer with a 2xx status, and
+# re-raises SocketError (after printing a message) on connection failures.
+def download(url, output)
+  uri = URI(url)
+  digest = Digest::SHA1.new
+  tmp = "#{output}.tmp"
+  Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == "https")) do |http|
+    # request_uri keeps the query string, which uri.path would drop.
+    request = Net::HTTP::Get.new(uri.request_uri)
+    http.request(request) do |response|
+      # response.code is a String, so check the response class instead of
+      # comparing against integer status codes.
+      fail "HTTP fetch failed for #{url}. #{response}" unless response.is_a?(Net::HTTPSuccess)
+      # 0.0 when the header is missing, which turns the progress display off.
+      size = response["content-length"].to_f
+      count = 0
+      # Binary mode: the payload is written exactly as received.
+      File.open(tmp, "wb") do |fd|
+        response.read_body do |chunk|
+          fd.write(chunk)
+          digest << chunk
+          if size > 0 && $stdout.tty?
+            count += chunk.bytesize
+            $stdout.write(sprintf("\r%0.2f%%", count/size * 100))
+          end
+        end
+      end
+      # Wipe the progress indicator.
+      $stdout.write("\r      \r") if $stdout.tty?
+    end
+  end
+  File.rename(tmp, output)
+  return digest.hexdigest
+rescue SocketError => e
+  puts "Failure while downloading #{url}: #{e}"
+  raise
+ensure
+  # tmp is still nil if URI(url) raised before it was assigned.
+  File.unlink(tmp) if tmp && File.exist?(tmp)
+end # def download
+# Extracts entries from the gzipped tarball `tarball`.
+#
+# The block is called with every archive entry and returns the path to write
+# it to, or nil to skip the entry. Files whose size and permission bits
+# already match the entry are left untouched. The tarball is deleted after a
+# successful extraction; the tar reader is closed even if extraction raises.
+def untar(tarball, &block)
+  require "zlib"
+  require "archive/tar/minitar"
+  # Binary mode: the archive must be read byte-for-byte.
+  tgz = Zlib::GzipReader.new(File.open(tarball, "rb"))
+  tar = Archive::Tar::Minitar::Input.open(tgz)
+  begin
+    tar.each do |entry|
+      path = block.call(entry)
+      next if path.nil?
+      parent = File.dirname(path)
+      mkdir_p parent unless File.directory?(parent)
+      if entry.directory?
+        mkdir path unless File.directory?(path)
+      else
+        entry_mode = entry.instance_eval { @mode } & 0777
+        # File.exist? rather than File.exists?, which Ruby 3.2 removed.
+        if File.exist?(path)
+          stat = File.stat(path)
+          # TODO(sissel): Submit a patch to archive-tar-minitar upstream to
+          # expose headers in the entry.
+          entry_size = entry.instance_eval { @size }
+          # Skip this file if size and mode are already the same.
+          next if stat.size == entry_size && (stat.mode & 0777) == entry_mode
+        end
+        puts "Extracting #{entry.full_name} from #{tarball} #{entry_mode.to_s(8)}"
+        File.open(path, "wb") do |fd|
+          # eof? check lets us skip empty files. Necessary because the API provided by
+          # Archive::Tar::Minitar::Reader::EntryStream only mostly acts like an
+          # IO object. Something about empty files in this EntryStream causes
+          # IO.copy_stream to throw "can't convert nil into String" on JRuby
+          # TODO(sissel): File a bug about this.
+          while !entry.eof?
+            chunk = entry.read(16384)
+            fd.write(chunk)
+          end
+        end
+        File.chmod(entry_mode, path)
+      end
+    end
+  ensure
+    tar.close
+  end
+  File.unlink(tarball) if File.file?(tarball)
+end # def untar
+# Decompresses the gzip file `file` next to itself (same path without the
+# trailing ".gz") and deletes the compressed original on success.
+#
+# On failure the partially written output is removed and the error is
+# re-raised. The gzip reader is closed in every case. Raises RuntimeError when
+# `file` does not end in ".gz", because the output would overwrite the input.
+def ungz(file)
+  require "zlib"
+  # Strip only the trailing extension; gsub('.gz', '') would also rewrite
+  # ".gz" occurring elsewhere in the path.
+  outpath = file.sub(/\.gz\z/, '')
+  fail "Cannot ungz #{file}: file name does not end in .gz" if outpath == file
+  tgz = Zlib::GzipReader.new(File.open(file, "rb"))
+  begin
+    File.open(outpath, "wb") do |out|
+      IO::copy_stream(tgz, out)
+    end
+    File.unlink(file)
+  rescue
+    File.unlink(outpath) if File.file?(outpath)
+    raise
+  ensure
+    tgz.close
+  end
+end
+# Fetches every entry of @files (hashes with 'url', 'sha1' and an optional
+# 'files' whitelist) into vendor/, then unpacks *.tar.gz archives and
+# decompresses other *.gz files.
+desc "Process any vendor files required for this plugin"
+task "vendor" do |task, args|
+  @files.each do |file|
+    archive = file_fetch(file['url'], file['sha1'])
+    # Anchored, escaped patterns: only a real extension should match.
+    if archive =~ /\.tar\.gz\z/
+      # Archive name without directory and extension, e.g. "foo-1.0".
+      prefix = File.basename(archive, '.tar.gz')
+      untar(archive) do |entry|
+        if file['files'].nil?
+          # No whitelist: keep the entry's path inside the archive.
+          # NOTE(review): the original raised TypeError here because `out`
+          # was nil; confirm this is the desired layout.
+          out = entry.full_name
+        else
+          # Whitelisted paths are compared with the archive prefix removed;
+          # matching entries are written flat into vendor/.
+          next unless file['files'].include?(entry.full_name.gsub(prefix, ''))
+          out = entry.full_name.split("/").last
+        end
+        File.join('vendor', out)
+      end
+    elsif archive =~ /\.gz\z/
+      ungz(archive)
+    end
+  end
+end

data/spec/codecs/multiline_spec.rb ADDED

@@ -0,0 +1,160 @@
+# encoding: utf-8
+require "logstash/codecs/multiline"
+require "logstash/event"
+require "insist"
+# Specs for the multiline codec's #decode: line joining, tagging, grok
+# patterns, and charset conversion. Every example feeds lines through
+# codec.decode and then calls codec.flush to collect whatever is still buffered.
+describe LogStash::Codecs::Multiline do
+  context "#decode" do
+    it "should be able to handle multiline events with additional lines space-indented" do
+      codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
+      lines = [ "hello world", "   second line", "another first line" ]
+      events = []
+      lines.each do |line|
+        codec.decode(line) do |event|
+          events << event
+        end
+      end
+      # The last event is still buffered until flush is called.
+      codec.flush { |e| events << e }
+      insist { events.size } == 2
+      insist { events[0]["message"] } == "hello world\n   second line"
+      insist { events[0]["tags"] }.include?("multiline")
+      insist { events[1]["message"] } == "another first line"
+      # Single-line events must not be tagged.
+      insist { events[1]["tags"] }.nil?
+    end
+    it "should allow custom tag added to multiline events" do
+      codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous", "multiline_tag" => "hurray" )
+      lines = [ "hello world", "   second line", "another first line" ]
+      events = []
+      lines.each do |line|
+        codec.decode(line) do |event|
+          events << event
+        end
+      end
+      codec.flush { |e| events << e }
+      insist { events.size } == 2
+      insist { events[0]["tags"] }.include?("hurray")
+      insist { events[1]["tags"] }.nil?
+    end
+    it "should allow grok patterns to be used" do
+      # negate => true: lines NOT starting with "<number> <time>" are merged
+      # into the previous line.
+      codec = LogStash::Codecs::Multiline.new(
+        "pattern" => "^%{NUMBER} %{TIME}",
+        "negate" => true,
+        "what" => "previous"
+      )
+      lines = [ "120913 12:04:33 first line", "second line", "third line" ]
+      events = []
+      lines.each do |line|
+        codec.decode(line) do |event|
+          events << event
+        end
+      end
+      codec.flush { |e| events << e }
+      insist { events.size } == 1
+      insist { events.first["message"] } == lines.join("\n")
+    end
+    context "using default UTF-8 charset" do
+      it "should decode valid UTF-8 input" do
+        codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
+        lines = [ "foobar", "κόσμε" ]
+        events = []
+        lines.each do |line|
+          insist { line.encoding.name } == "UTF-8"
+          insist { line.valid_encoding? } == true
+          codec.decode(line) { |event| events << event }
+        end
+        codec.flush { |e| events << e }
+        insist { events.size } == 2
+        # Valid UTF-8 must pass through unchanged.
+        events.zip(lines).each do |tuple|
+          insist { tuple[0]["message"] } == tuple[1]
+          insist { tuple[0]["message"].encoding.name } == "UTF-8"
+        end
+      end
+      it "should escape invalid sequences" do
+        codec = LogStash::Codecs::Multiline.new("pattern" => "^\\s", "what" => "previous")
+        # Strings tagged as UTF-8 that contain invalid byte sequences.
+        lines = [ "foo \xED\xB9\x81\xC3", "bar \xAD" ]
+        events = []
+        lines.each do |line|
+          insist { line.encoding.name } == "UTF-8"
+          insist { line.valid_encoding? } == false
+          codec.decode(line) { |event| events << event }
+        end
+        codec.flush { |e| events << e }
+        insist { events.size } == 2
+        events.zip(lines).each do |tuple|
+          # Expected message is the line's #inspect form without the
+          # surrounding quotes, i.e. with the invalid bytes escaped.
+          insist { tuple[0]["message"] } == tuple[1].inspect[1..-2]
+          insist { tuple[0]["message"].encoding.name } == "UTF-8"
+        end
+      end
+    end
+    context "with valid non UTF-8 source encoding" do
+      it "should encode to UTF-8" do
+        codec = LogStash::Codecs::Multiline.new("charset" => "ISO-8859-1", "pattern" => "^\\s", "what" => "previous")
+        # Pairs of [raw ISO-8859-1 bytes, expected UTF-8 text].
+        samples = [
+          ["foobar", "foobar"],
+          ["\xE0 Montr\xE9al", "à Montréal"],
+        ]
+        # lines = [ "foo \xED\xB9\x81\xC3", "bar \xAD" ]
+        events = []
+        # force_encoding retags the sample strings in place.
+        samples.map{|(a, b)| a.force_encoding("ISO-8859-1")}.each do |line|
+          insist { line.encoding.name } == "ISO-8859-1"
+          insist { line.valid_encoding? } == true
+          codec.decode(line) { |event| events << event }
+        end
+        codec.flush { |e| events << e }
+        insist { events.size } == 2
+        events.zip(samples.map{|(a, b)| b}).each do |tuple|
+          insist { tuple[1].encoding.name } == "UTF-8"
+          insist { tuple[0]["message"] } == tuple[1]
+          insist { tuple[0]["message"].encoding.name } == "UTF-8"
+        end
+      end
+    end
+    context "with invalid non UTF-8 source encoding" do
+     it "should encode to UTF-8" do
+        codec = LogStash::Codecs::Multiline.new("charset" => "ASCII-8BIT", "pattern" => "^\\s", "what" => "previous")
+        # Pairs of [raw bytes, expected text]: every byte that is not plain
+        # ASCII is expected to come out as a replacement character.
+        samples = [
+          ["\xE0 Montr\xE9al", "� Montr�al"],
+          ["\xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5", "����������"],
+        ]
+        events = []
+        samples.map{|(a, b)| a.force_encoding("ASCII-8BIT")}.each do |line|
+          insist { line.encoding.name } == "ASCII-8BIT"
+          insist { line.valid_encoding? } == true
+          codec.decode(line) { |event| events << event }
+        end
+        codec.flush { |e| events << e }
+        insist { events.size } == 2
+        events.zip(samples.map{|(a, b)| b}).each do |tuple|
+          insist { tuple[1].encoding.name } == "UTF-8"
+          insist { tuple[0]["message"] } == tuple[1]
+          insist { tuple[0]["message"].encoding.name } == "UTF-8"
+        end
+      end
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,105 @@
+--- !ruby/object:Gem::Specification
+name: logstash-codec-multiline
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Elasticsearch
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-11-05 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: logstash
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+    - - <
+      - !ruby/object:Gem::Version
+        version: 2.0.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.4.0
+    - - <
+      - !ruby/object:Gem::Version
+        version: 2.0.0
+- !ruby/object:Gem::Dependency
+  name: logstash-patterns-core
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: jls-grok
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.11.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 0.11.0
+description: The multiline codec will collapse multiline messages and merge them into
+  a single event.
+email: richard.pijnenburg@elasticsearch.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE
+- Rakefile
+- lib/logstash/codecs/multiline.rb
+- logstash-codec-multiline.gemspec
+- rakelib/publish.rake
+- rakelib/vendor.rake
+- spec/codecs/multiline_spec.rb
+homepage: http://logstash.net/
+licenses:
+- Apache License (2.0)
+metadata:
+  logstash_plugin: 'true'
+  group: codec
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.1
+signing_key:
+specification_version: 4
+summary: The multiline codec will collapse multiline messages and merge them into
+  a single event.
+test_files:
+- spec/codecs/multiline_spec.rb