log2json 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/track-tails ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env ruby
+ #
+ # Helper for the tail-log.sh script to actually create and maintain the sincedb.
+ #
+ # A sincedb is just a directory that contains subdirectories to text files that
+ # record the stats of the files that we're following. Each file records the
+ # inode, size, and the number of lines from the start of the file read last time.
+ #
+ require 'fileutils'
+
+
+ @fmap = {} # path => [inode_number, file_size, number_of_lines_read]
+ @sincedb_dir = ARGV.shift()
+
+ # Note: We expect each -n+N argument to be followed by a file path
+ while not ARGV.empty? do
+   arg = ARGV.shift()
+   if arg =~ /^-n\+(\d+)$/
+     fpath = ARGV.shift()
+     next if not File.exists?(fpath)
+     fstat = File.stat(fpath)
+     @fmap[fpath] = [fstat.ino, fstat.size, $1.to_i() - 1]
+   end
+ end
+
+ def commit
+   return if @fmap.nil?
+   @fmap.each do |fpath, t|
+     sincedb_path = "#{@sincedb_dir}/#{fpath}.since"
+     FileUtils.mkdir_p(File.dirname(sincedb_path))
+     IO.write(sincedb_path, "#{t.join(' ')}\n")
+   end
+ end
+ at_exit(&method(:commit))
+
+ # Note: You probably want to set the locale env var: LC_CTYPE=en_US.UTF-8
+ while line = gets()
+   if line =~ /^==> (.+) <==(?: \[(.+)\])?$/
+     fpath = $1
+     if @fmap[fpath].nil? or $2 =~ /^new_file$|^truncated$/
+       fstat = File.stat(fpath)
+       @fmap[fpath] = [fstat.ino, fstat.size, 0]
+     end
+     STDOUT.write(line); STDOUT.flush()
+     next
+   end
+   STDOUT.write(line); STDOUT.flush()
+   @fmap[fpath][2] += 1
+   # Note: In the case of interruption, there's a chance that the line count is
+   #       one line behind the number of log lines written to stdout. This is
+   #       OK since we'd rather output a duplicate log line rather than miss
+   #       one.
+ end
+
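
The sincedb entries written by commit above end up as one text file per followed path, mirroring that path under the sincedb directory and holding a single "<inode> <size> <lines_read>" line. A minimal sketch (not part of the gem; the sincedb directory and log path are illustrative) of reading one entry back in Ruby:

    # If track-tails was started with /tmp/sincedb as its first argument and was
    # following /var/log/syslog, the recorded stats can be read back like this:
    inode, size, lines_read = IO.read('/tmp/sincedb/var/log/syslog.since').split.map(&:to_i)
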
data/lib/log2json.rb ADDED
@@ -0,0 +1,217 @@
+ require 'json'
+ require 'grok-pure'
+
+ module Log2Json
+
+   def self.main(filters, opts={})
+     output = opts[:output] || STDOUT
+     spitter = opts[:spitter]
+     if spitter.nil?
+       # configure the spitter to take config overrides from ENV
+       config = {}
+       Spitter::CONFIG.keys.each do |name|
+         key = name.to_s.downcase
+         config[name] = ENV[key] if ENV.member?(key)
+       end
+       spitter = ::Log2Json::Spitter.new(STDIN, ENV['type'], config)
+     end
+     spitter.each_record do |rec|
+       filters[rec['@type']].each { |f| f.filter(rec) }
+       if ! rec['@timestamp'].nil?
+         output.write(rec.to_json() << "\n")
+         output.flush()
+         # NOTE: Ruby's built-in json module, by default, doesn't output any
+         #       literal newline characters while serializing. So using
+         #       newlines as json record separator is fine here.
+       end
+     end
+   end
+
+
+   # A generic front-end to filters. It sits between an input and a filter, taking
+   # log lines from an input and normalizing them into logstash-compatible JSON log
+   # records for filters to consume.
+   #
+   # An input represents the source of log records. The only requirement of an input
+   # is that it outputs to stdout a stream of lines (one line for each log record), with
+   # the first line indicating the source (eg, file path, url, ...) of the log lines that
+   # follow it. By default, the format of such a source-indicating line is the same as
+   # those spit out by the tail utility when multiple files are followed. (ie, ==> file-a.txt <==)
+   # The format is customizable via a regex.
+   #
+   # For each type of logs that you'd like to ship, there will be 1 input process, 1 log2json
+   # process (with perhaps multiple filters configured), and 1 output process, all connected
+   # via unix pipes. The idea is that you can implement your log input and output processes
+   # as shell scripts, and filters can be implemented in ruby (likely using Log2Json::Filters::GrokFilter)
+   # and installed as ruby gems. Then, you will configure and combine filters and create a
+   # Spitter that would use them. See the log2json ruby script for details.
+   #
+   #
+   #
+   class Spitter
+
+     CONFIG = {
+       LOG_INPUT_ENCODING: "UTF-8",
+       UTC_TIMESTAMP_FORMAT: "%FT%T.%6NZ",
+       SOURCE_SEPERATOR_REGEX: Regexp.new("^==> (.+) <=="),
+       # because /.../ screws up syntax highlighting in vim, so I use Regexp.new(...)
+
+       TAGS: '',
+       FIELDS: '',
+     }
+
+     attr_reader :options
+
+     def initialize(input_file, type, opts={})
+       @input = input_file
+       @type = type || ''
+       # type can be either a string or a hash whose keys are paths specified
+       # as regexes and whose values are type strings.
+
+       @options = CONFIG.merge(opts)
+
+       @source_host = %x(hostname).chomp()
+       @source_path = nil
+       @tags = options[:TAGS].strip.split(/\s*,\s*/)
+
+       fields = options[:FIELDS].strip.gsub(/,/, ' ').split(/ +/)
+       raise "Number of keys or values in fields must be even!" if fields.length % 2 != 0
+
+       @fields = {}
+       while not fields.empty? do
+         k, v = fields.pop(2)
+         @fields[k] = v
+       end
+     end
+
+     def each_record(&block)
+       @input.each_line do |line|
+         line.force_encoding(options[:LOG_INPUT_ENCODING])
+         line.chomp!
+         next if line.empty?
+         if line =~ options[:SOURCE_SEPERATOR_REGEX]
+           @source_path = $1
+           next
+         end
+         block.call({
+           # Every record has a '@type'; this is how we match filters to log records.
+           # Note: in Ruby 1.9, Hashes are ordered, so here we'll be matching the source path
+           #       against the regexes in the order they are defined.
+           '@type' => if @type.is_a?(String)
+                        @type
+                      else # @type is a Hash
+                        if type = @type.find { |re, t| re =~ @source_path }
+                          type[1]
+                        else
+                          @type[nil] || ''
+                        end
+                      end,
+           '@source_path' => @source_path,
+           '@source_host' => @source_host,
+           '@timestamp' => Time.new.utc.strftime(options[:UTC_TIMESTAMP_FORMAT]),
+           '@message' => line,
+           '@tags' => @tags.clone, # defaults to []
+           '@fields' => @fields.clone, # defaults to {}
+         })
+       end
+     end
+   end # Spitter
+
+
+   module Filters #--------------------------------------
+
+     # A filter takes a JSON log record and processes it by adding, correcting, or
+     # even removing attributes from it if necessary.
+     class GrokFilter
+
+       DEFAULT_PATTERNS = File.join(File.dirname(__FILE__),
+                                    'log2json', 'filters', 'base.patterns')
+
+       CONFIG = {
+         NAMED_CAPTURES_ONLY: true,
+         KEEP_EMTPY_CAPTURES: false
+       }
+
+
+       attr_reader :type, :name
+
+       def initialize(type, name, regexps, opts={}, &filter_block)
+         @type = type
+         @name = name
+         @filter_block = filter_block
+         @record_kvs = opts.select { |k,v| k.start_with?('@') }
+         @config = opts.select { |k,v| not k.start_with?('@') }.merge CONFIG
+
+         @pile = Grok::Pile.new
+         @pile.add_patterns_from_file(@config[:pattern_file] || DEFAULT_PATTERNS)
+         regexps.each { |re| @pile.compile(re) }
+       end
+
+       # Filter the log record.
+       #
+       # This means checking if the record matches the patterns of this filter and
+       # adding the captured groups as members of the @fields of the record if
+       # there's a match.
+       #
+       # Any '@' key-values configured for this filter will also
+       # be added to the record after merging the captured groups.
+       #
+       # Returns the record at the end if there's a match, else returns nil.
+       # If the '@timestamp' attribute is removed from a record then the record will
+       # be dropped.
+       def filter(record)
+         grok, match = @pile.match(record['@message'])
+         if match
+           # code stolen and modified from logstash's grok filter.
+           fields = record['@fields']
+           match.each_capture() do |key, value|
+             next if value.nil? and not @config[:KEEP_EMTPY_CAPTURES]
+             if key.include?(':')
+               pattern_name, key, value_type = key.split(':') # ie, %{pattern_name:key:value_type}
+               case value_type
+               when 'int'  ; value = value.to_i
+               when 'float'; value = value.to_f
+               end
+             else
+               next if @config[:NAMED_CAPTURES_ONLY]
+             end
+             if fields[key].nil?
+               fields[key] = value
+             else # if there already exists a field for the captured value
+                  # then we aggregate the captured values in an array for the field.
+               if not fields[key].is_a?(Array)
+                 fields[key] = [fields[key]]
+               end
+               fields[key] << value
+             end
+           end
+
+           record.merge!(@record_kvs) do |k, oldval, newval|
+             if k == '@tags'
+               (oldval + newval).uniq # non-destructive uniq, so the merged value is never nil
+             elsif k == '@fields'
+               oldval.merge!(newval)
+             else
+               newval
+             end
+           end
+           (fields['filtered_by'] ||= []) << name
+           if @filter_block
+             @filter_block.call(record)
+           else
+             record
+           end
+         else
+           nil
+         end
+       end
+     end # end class GrokFilter
+
+
+   end # end module Filters
+
+
+
+
+
+
+ end # Log2Json module
+
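
Log2Json.main, Spitter, and GrokFilter above are meant to be wired together by a small driver (the gem ships a log2json executable for this, which is not part of this diff). A minimal sketch of such a driver, in which the 'app' type, the filter name, and the grok expression are all illustrative:

    require 'log2json'

    # One list of filters per '@type'; types without filters get an empty list.
    filters = Hash.new { |h, k| h[k] = [] }
    filters['app'] << Log2Json::Filters::GrokFilter.new(
      'app', 'my-app-filter',
      ['%{TIMESTAMP_ISO8601:ts} %{WORD:level} %{GREEDYDATA:msg}'])

    # Reads tail-style output from STDIN and writes one JSON record per line to
    # STDOUT. Run with type=app in the environment so records get '@type' == 'app'.
    Log2Json.main(filters)
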
@@ -0,0 +1,93 @@
+ USERNAME [a-zA-Z0-9_-]+
+ USER %{USERNAME}
+ INT (?:[+-]?(?:[0-9]+))
+ BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))
+ NUMBER (?:%{BASE10NUM})
+ BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+))
+ BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b
+
+ POSINT \b(?:[0-9]+)\b
+ WORD \b\w+\b
+ NOTSPACE \S+
+ DATA .*?
+ GREEDYDATA .*
+ #QUOTEDSTRING (?:(?<!\\)(?:"(?:\\.|[^\\"])*"|(?:'(?:\\.|[^\\'])*')|(?:`(?:\\.|[^\\`])*`)))
+ QUOTEDSTRING (?:(?<!\\)(?:"(?>[^\\"]+|\\.)*")|(?:'(?>[^\\']+|\\.)*')|(?:`(?>[^\\`]+|\\.)*`))
+
+ # Networking
+ MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})
+ CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})
+ WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})
+ COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})
+ IP (?<![0-9])(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))(?![0-9])
+ HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)
+ HOST %{HOSTNAME}
+ IPORHOST (?:%{HOSTNAME}|%{IP})
+ HOSTPORT (?:%{IPORHOST=~/\./}:%{POSINT})
+
+ # paths
+ PATH (?:%{UNIXPATH}|%{WINPATH})
+ UNIXPATH (?:/(?:[\w_%!$@:.,-]+|\\.)*)+
+ #UNIXPATH (?<![\w\/])(?:/[^\/\s?*]*)+
+ LINUXTTY (?:/dev/pts/%{POSINT})
+ BSDTTY (?:/dev/tty[pq][a-z0-9])
+ TTY (?:%{BSDTTY}|%{LINUXTTY})
+ WINPATH (?:[A-Za-z]+:|\\)(?:\\[^\\?*]*)+
+ URIPROTO [A-Za-z]+(\+[A-Za-z+]+)?
+ URIHOST %{IPORHOST}(?::%{POSINT:port})?
+ # uripath comes loosely from RFC1738, but mostly from what Firefox
+ # doesn't turn into %XX
+ URIPATH (?:/[A-Za-z0-9$.+!*'(),~:#%_-]*)+
+ #URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
+ URIPARAM \?[A-Za-z0-9$.+!*'(),~#%&/=:;_-]*
+ URIPATHPARAM %{URIPATH}(?:%{URIPARAM})?
+ URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?
+
+ # Months: January, Feb, 3, 03, 12, December
+ MONTH \b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|[Mm]ay|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b
+ MONTHNUM (?:0?[1-9]|1[0-2])
+ MONTHDAY (?:3[01]|[1-2]?[0-9]|0?[1-9])
+
+ # Days: Monday, Tue, Thu, etc...
+ DAY (?:[Mm]on(?:day)?|[Tt]ue(?:sday)?|[Ww]ed(?:nesday)?|[Tt]hu(?:rsday)?|[Ff]ri(?:day)?|[Ss]at(?:urday)?|[Ss]un(?:day)?)
+
+ # Years?
+ YEAR [0-9]+
+ # Time: HH:MM:SS
+ #TIME \d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?
+ # I'm still on the fence about using grok to perform the time match,
+ # since it's probably slower.
+ # TIME %{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)?
+ HOUR (?:2[0123]|[01][0-9])
+ MINUTE (?:[0-5][0-9])
+ # '60' is a leap second in most time standards and thus is valid.
+ SECOND (?:(?:[0-5][0-9]|60)(?:[.,][0-9]+)?)
+ TIME (?<![0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9])
+ # datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
+ DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}
+ DATE_EU %{YEAR}[/-]%{MONTHNUM}[/-]%{MONTHDAY}
+ ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE}))
+ ISO8601_SECOND (?:%{SECOND}|60)
+ TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?
+ DATE %{DATE_US}|%{DATE_EU}
+ DATESTAMP %{DATE}[- ]%{TIME}
+ TZ (?:[PMCE][SD]T)
+ DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}
+ DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}
+
+ # Syslog Dates: Month Day HH:MM:SS
+ SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME}
+ PROG (?:[\w._/-]+)
+ SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])?
+ SYSLOGHOST %{IPORHOST}
+ SYSLOGFACILITY <%{POSINT:facility}.%{POSINT:priority}>
+
+ ZONE %{INT}
+ HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{ZONE}
+
+ # Shortcuts
+ QS %{QUOTEDSTRING}
+
+ # Log formats
+ SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:
+ COMBINEDAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "%{WORD:verb} %{URIPATHPARAM:request} HTTP/%{NUMBER:httpversion}" %{NUMBER:response} (?:%{NUMBER:bytes}|-) "(?:%{URI:referrer}|-)" %{QS:agent}
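
These are the default grok patterns that GrokFilter loads (DEFAULT_PATTERNS above). A minimal sketch of exercising them directly with the same grok-pure calls used in log2json.rb; the pattern file path and the sample log line are illustrative:

    require 'grok-pure'

    pile = Grok::Pile.new
    pile.add_patterns_from_file('base.patterns')
    pile.compile('%{COMBINEDAPACHELOG}')

    line = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] ' \
           '"GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "Mozilla/4.08"'
    grok, match = pile.match(line)
    match.each_capture { |key, value| puts "#{key} => #{value}" } if match
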
@@ -0,0 +1,46 @@
+ require 'log2json'
+ require 'date'
+
+ module Log2Json
+   module Filters
+     #----
+
+     class NginxAccessLogFilter < GrokFilter
+
+       def initialize(name, config={})
+         # Thanks to - http://boojapathy.wordpress.com/2012/04/29/logstash-graylog-cant-ask-more-for-logging/
+         #
+         # 10.33.158.237 - - [12/Apr/2013:13:27:54 -0000] "GET /UEFA/news.json?blackberry_native_version=1.9.4&locale=es HTTP/1.1" 200 6495 "-" "-" "-" "-" "-" cache_status:BYPASS
+         #
+         type = config.delete(:type) {'nginx-access'}
+         super(type, name, [
+           %w[ %{IP:ip}
+               (?:%{HOST:host}|-)
+               (?:%{USER:user}|-)
+               \\\[%{HTTPDATE:datetime}\\\] +"(?:%{WORD:method} %{URIPATHPARAM:path} HTTP/%{NUMBER:version}|%{DATA:request})"
+               %{NUMBER:status}
+               (?:%{NUMBER:size}|-)
+               %{QUOTEDSTRING:referrer}
+               %{QUOTEDSTRING:user_agent}
+               (?:%{GREEDYDATA:extra_info})
+           ].join(' ') ], config
+         )
+       end
+
+       def filter(record)
+         return nil if super(record).nil?
+         # eg, 23/Nov/2012:19:11:10 +0000
+         record['@timestamp'] = DateTime.strptime(record['@fields']['datetime'], "%d/%b/%Y:%T %z")
+         record['@fields'].delete('datetime')
+         record['@tags'] << "nginx" << "http"
+         record
+       end
+
+     end # NginxAccessLogFilter
+
+
+
+
+     #----
+   end
+ end
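
NginxAccessLogFilter registers itself under the 'nginx-access' type by default. A minimal sketch (paths, regexes, and names are illustrative) of pairing it with a Spitter whose type is chosen per source path, using the regex-keyed hash form that Spitter#initialize documents:

    require 'log2json'
    # Also load the file that defines NginxAccessLogFilter; its path within the gem
    # is not shown in this diff.

    spitter = Log2Json::Spitter.new(STDIN, { /nginx.*access/ => 'nginx-access' })

    filters = Hash.new { |h, k| h[k] = [] }
    filters['nginx-access'] << Log2Json::Filters::NginxAccessLogFilter.new('nginx-access')

    Log2Json.main(filters, spitter: spitter)
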
@@ -0,0 +1,62 @@
+ require 'log2json'
+ require 'date'
+
+ module Log2Json
+   module Filters
+     #----
+
+     # A default syslog filter.
+     # This works with rsyslog and its default configuration as distributed with Ubuntu 12.04 LTS.
+     #
+     # It also assumes your syslog timestamp is in UTC. To make sure, add the following line to
+     # /etc/default/rsyslog:
+     #
+     #   export TZ=UTC
+     #
+     # and then restart rsyslog. (ie, sudo service rsyslog restart)
+     # Other settings for rsyslog you might want to adjust include:
+     #
+     #
+     #   MaxMessageSize 64k             # Increase the message size allowed to 64k (default is like 2k... or something.)
+     #
+     #   $IMUXSockRateLimitInterval 0   # Disable rate limiting, so we are sure to get every single message logged.
+     #                                  # Note: Add it after $ModLoad imuxsock
+     #
+     #
+     class SyslogFilter < GrokFilter
+
+       def initialize(name, config={})
+         type = config.delete(:type) {'syslog'}
+         super(type, name, [
+           %w[ %{SYSLOGTIMESTAMP:syslog_timestamp}
+               %{SYSLOGHOST:syslog_hostname}?
+               %{PROG:syslog_program}(?:\\\[%{POSINT:syslog_pid}\\\])?:
+               %{GREEDYDATA:syslog_message}
+           ].join(' ')], config
+         )
+       end
+
+       def filter(record)
+         return nil if super(record).nil?
+         record['@received_at'] = record['@timestamp']
+         record['@received_from'] = record['@source_host']
+
+         fields = record['@fields']
+
+         fields['syslog_timestamp'] += '+0000'
+         record['@timestamp'] = DateTime.strptime(fields["syslog_timestamp"], "%b %e %T%z") # eg, Apr 12 15:55:28+0000
+
+         record['@source_host'] = fields['syslog_hostname']
+         record['@message'] = fields['syslog_message'].gsub(/#012/, "\n")
+         record['@tags'] << fields['syslog_program']
+         fields.each_key { |k| fields.delete(k) if k.start_with?('syslog_') }
+         record
+       end
+
+     end # SyslogFilter
+
+
+
+     #----
+   end
+ end
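
Log2Json.main (in log2json.rb above) also lets the environment override each Spitter::CONFIG key via a lower-cased variable of the same name, and ENV['type'] selects the record type. A minimal sketch of driving SyslogFilter that way; the tag, field, and type values are illustrative, and the file defining SyslogFilter must already be loaded (its require path is not shown in this diff):

    require 'log2json'

    ENV['type']   = 'syslog'                # '@type' given to every record
    ENV['tags']   = 'prod, web'             # becomes '@tags' => ['prod', 'web']
    ENV['fields'] = 'dc us-east role www'   # becomes '@fields' => {'dc' => 'us-east', 'role' => 'www'}

    Log2Json.main({ 'syslog' => [Log2Json::Filters::SyslogFilter.new('syslog')] })
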