log2json 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/track-tails ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env ruby
+ #
+ # Helper for the tail-log.sh script to actually create and maintain the sincedb.
+ #
+ # A sincedb is just a directory that contains subdirectories to text files that
+ # record the stats of the files that we're following. Each file records the
+ # inode, size, and the number of lines from the start of the file read last time.
+ #
+ require 'fileutils'
+
+
+ @fmap = {} # path => [inode_number, file_size, number_of_lines_read]
+ @sincedb_dir = ARGV.shift()
+
+ # Note: We expect each -n+N argument to be followed by a file path
+ while not ARGV.empty? do
+   arg = ARGV.shift()
+   if arg =~ /^-n\+(\d+)$/
+     fpath = ARGV.shift()
+     next if not File.exists?(fpath)
+     fstat = File.stat(fpath)
+     @fmap[fpath] = [fstat.ino, fstat.size, $1.to_i() - 1]
+   end
+ end
+
+ def commit
+   return if @fmap.nil?
+   @fmap.each do |fpath, t|
+     sincedb_path = "#{@sincedb_dir}/#{fpath}.since"
+     FileUtils.mkdir_p(File.dirname(sincedb_path))
+     IO.write(sincedb_path, "#{t.join(' ')}\n")
+   end
+ end
+ at_exit(&method(:commit))
+
+ # Note: You probably want to set the locale env var: LC_CTYPE=en_US.UTF-8
+ while line = gets()
+   if line =~ /^==> (.+) <==(?: \[(.+)\])?$/
+     fpath = $1
+     if @fmap[fpath].nil? or $2 =~ /^new_file$|^truncated$/
+       fstat = File.stat(fpath)
+       @fmap[fpath] = [fstat.ino, fstat.size, 0]
+     end
+     STDOUT.write(line); STDOUT.flush()
+     next
+   end
+   STDOUT.write(line); STDOUT.flush()
+   @fmap[fpath][2] += 1
+   # Note: In the case of interruption, there's a chance that the line count is
+   #       one line behind the number of log lines written to stdout. This is
+   #       OK since we'd rather output a duplicate log line rather than miss
+   #       one.
+ end
+
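
The sincedb entries written by commit above end up as one text file per followed path, mirroring that path under the sincedb directory and holding a single "<inode> <size> <lines_read>" line. A minimal sketch (not part of the gem; the sincedb directory and log path are illustrative) of reading one entry back in Ruby:

    # If track-tails was started with /tmp/sincedb as its first argument and was
    # following /var/log/syslog, the recorded stats can be read back like this:
    inode, size, lines_read = IO.read('/tmp/sincedb/var/log/syslog.since').split.map(&:to_i)
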
data/lib/log2json.rb ADDED
@@ -0,0 +1,217 @@
+ require 'json'
+ require 'grok-pure'
+
+ module Log2Json
+
+   def self.main(filters, opts={})
+     output = opts[:output] || STDOUT
+     spitter = opts[:spitter]
+     if spitter.nil?
+       # configure the spitter to take config overrides from ENV
+       config = {}
+       Spitter::CONFIG.keys.each do |name|
+         key = name.to_s.downcase
+         config[name] = ENV[key] if ENV.member?(key)
+       end
+       spitter = ::Log2Json::Spitter.new(STDIN, ENV['type'], config)
+     end
+     spitter.each_record do |rec|
+       filters[rec['@type']].each { |f| f.filter(rec) }
+       if ! rec['@timestamp'].nil?
+         output.write(rec.to_json() << "\n")
+         output.flush()
+         # NOTE: Ruby's built-in json module, by default, doesn't output any
+         #       literal newline characters while serializing. So using
+         #       newlines as json record separator is fine here.
+       end
+     end
+   end
+
+
+   # A generic front-end to filters. It sits between an input and a filter, taking
+   # log lines from an input and normalizing them into logstash-compatible JSON log
+   # records for filters to consume.
+   #
+   # An input represents the source of log records. The only requirement of an input
+   # is that it outputs to stdout a stream of lines (one line for each log record), with
+   # the first line indicating the source (eg, file path, url, ...) of the log lines that
+   # follow it. By default, the format of such a source-indicating line is the same as
+   # those spit out by the tail utility when multiple files are followed. (ie, ==> file-a.txt <==)
+   # The format is customizable via a regex.
+   #
+   # For each type of logs that you'd like to ship, there will be 1 input process, 1 log2json
+   # process (with perhaps multiple filters configured), and 1 output process, all connected
+   # via unix pipes. The idea is that you can implement your log input and output processes
+   # as shell scripts, and filters can be implemented in ruby (likely using Log2Json::Filters::GrokFilter)
+   # and installed as ruby gems. Then, you will configure and combine filters and create a
+   # Spitter that would use them. See the log2json ruby script for details.
+   #
+   #
+   #
+   class Spitter
+
+     CONFIG = {
+       LOG_INPUT_ENCODING: "UTF-8",
+       UTC_TIMESTAMP_FORMAT: "%FT%T.%6NZ",
+       SOURCE_SEPERATOR_REGEX: Regexp.new("^==> (.+) <=="),
+       # because /.../ screws up syntax highlighting in vim, so I use Regexp.new(...)
+
+       TAGS: '',
+       FIELDS: '',
+     }
+
+     attr_reader :options
+
+     def initialize(input_file, type, opts={})
+       @input = input_file
+       @type = type || ''
+       # type can be either a string or a hash whose keys are paths specified
+       # as regexes and whose values are type strings.
+
+       @options = CONFIG.merge(opts)
+
+       @source_host = %x(hostname).chomp()
+       @source_path = nil
+       @tags = options[:TAGS].strip.split(/\s*,\s*/)
+
+       fields = options[:FIELDS].strip.gsub(/,/, ' ').split(/ +/)
+       raise "Number of keys or values in fields must be even!" if fields.length % 2 != 0
+
+       @fields = {}
+       while not fields.empty? do
+         k, v = fields.pop(2)
+         @fields[k] = v
+       end
+     end
+
+     def each_record(&block)
+       @input.each_line do |line|
+         line.force_encoding(options[:LOG_INPUT_ENCODING])
+         line.chomp!
+         next if line.empty?
+         if line =~ options[:SOURCE_SEPERATOR_REGEX]
+           @source_path = $1
+           next
+         end
+         block.call({
+           # Every record has a '@type'; this is how we match filters to log records.
+           # Note: in Ruby 1.9, Hashes are ordered, so here we'll be matching the source path
+           #       against the regexes in the order they are defined.
+           '@type' => if @type.is_a?(String)
+                        @type
+                      else # @type is a Hash
+                        if type = @type.find { |re, t| re =~ @source_path }
+                          type[1]
+                        else
+                          @type[nil] || ''
+                        end
+                      end,
+           '@source_path' => @source_path,
+           '@source_host' => @source_host,
+           '@timestamp' => Time.new.utc.strftime(options[:UTC_TIMESTAMP_FORMAT]),
+           '@message' => line,
+           '@tags' => @tags.clone, # defaults to []
+           '@fields' => @fields.clone, # defaults to {}
+         })
+       end
+     end
+   end # Spitter
+
+
+   module Filters #--------------------------------------
+
+     # A filter takes a JSON log record and processes it by adding, correcting, or
+     # even removing attributes from it if necessary.
+     class GrokFilter
+
+       DEFAULT_PATTERNS = File.join(File.dirname(__FILE__),
+                                    'log2json', 'filters', 'base.patterns')
+
+       CONFIG = {
+         NAMED_CAPTURES_ONLY: true,
+         KEEP_EMTPY_CAPTURES: false
+       }
+
+
+       attr_reader :type, :name
+
+       def initialize(type, name, regexps, opts={}, &filter_block)
+         @type = type
+         @name = name
+         @filter_block = filter_block
+         @record_kvs = opts.select { |k,v| k.start_with?('@') }
+         @config = opts.select { |k,v| not k.start_with?('@') }.merge CONFIG
+
+         @pile = Grok::Pile.new
+         @pile.add_patterns_from_file(@config[:pattern_file] || DEFAULT_PATTERNS)
+         regexps.each { |re| @pile.compile(re) }
+       end
+
+       # Filter the log record.
+       #
+       # This means checking if the record matches the patterns of this filter and
+       # adding the captured groups as members of the @fields of the record if
+       # there's a match.
+       #
+       # Any '@' key-values configured for this filter will also
+       # be added to the record after merging the captured groups.
+       #
+       # Returns the record at the end if there's a match, else returns nil.
+       # If the '@timestamp' attribute is removed from a record then the record will
+       # be dropped.
+       def filter(record)
+         grok, match = @pile.match(record['@message'])
+         if match
+           # code stolen and modified from logstash's grok filter.
+           fields = record['@fields']
+           match.each_capture() do |key, value|
+             next if value.nil? and not @config[:KEEP_EMTPY_CAPTURES]
+             if key.include?(':')
+               pattern_name, key, value_type = key.split(':') # ie, %{pattern_name:key:value_type}
+               case value_type
+               when 'int'  ; value = value.to_i
+               when 'float'; value = value.to_f
+               end
+             else
+               next if @config[:NAMED_CAPTURES_ONLY]
+             end
+             if fields[key].nil?
+               fields[key] = value
+             else # if there already exists a field for the captured value
+                  # then we aggregate the captured values in an array for the field.
+               if not fields[key].is_a?(Array)
+                 fields[key] = [fields[key]]
+               end
+               fields[key] << value
+             end
+           end
+
+           record.merge!(@record_kvs) do |k, oldval, newval|
+             if k == '@tags'
+               (oldval + newval).uniq # non-destructive uniq, so the merged value is never nil
+             elsif k == '@fields'
+               oldval.merge!(newval)
+             else
+               newval
+             end
+           end
+           (fields['filtered_by'] ||= []) << name
+           if @filter_block
+             @filter_block.call(record)
+           else
+             record
+           end
+         else
+           nil
+         end
+       end
+     end # end class GrokFilter
+
+
+   end # end module Filters
+
+
+
+
+
+
+ end # Log2Json module
+
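
Log2Json.main, Spitter, and GrokFilter above are meant to be wired together by a small driver (the gem ships a log2json executable for this, which is not part of this diff). A minimal sketch of such a driver, in which the 'app' type, the filter name, and the grok expression are all illustrative:

    require 'log2json'

    # One list of filters per '@type'; types without filters get an empty list.
    filters = Hash.new { |h, k| h[k] = [] }
    filters['app'] << Log2Json::Filters::GrokFilter.new(
      'app', 'my-app-filter',
      ['%{TIMESTAMP_ISO8601:ts} %{WORD:level} %{GREEDYDATA:msg}'])

    # Reads tail-style output from STDIN and writes one JSON record per line to
    # STDOUT. Run with type=app in the environment so records get '@type' == 'app'.
    Log2Json.main(filters)
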
@@ -0,0 +1,93 @@
+ USERNAME [a-zA-Z0-9_-]+
+ USER %{USERNAME}
+ INT (?:[+-]?(?:[0-9]+))
+ BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))
+ NUMBER (?:%{BASE10NUM})
+ BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+))
+ BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b
+
+ POSINT \b(?:[0-9]+)\b
+ WORD \b\w+\b
+ NOTSPACE \S+
+ DATA .*?
+ GREEDYDATA .*
+ #QUOTEDSTRING (?:(?<!\\)(?:"(?:\\.|[^\\"])*"|(?:'(?:\\.|[^\\'])*')|(?:`(?:\\.|[^\\`])*`)))
+ QUOTEDSTRING (?:(?<!\\)(?:"(?>[^\\"]+|\\.)*")|(?:'(?>[^\\']+|\\.)*')|(?:`(?>[^\\`]+|\\.)*`))
+
+ # Networking
+ MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})
+ CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})
+ WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})
+ COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})
+ IP (?<![0-9])(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))(?![0-9])
+ HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)
+ HOST %{HOSTNAME}
+ IPORHOST (?:%{HOSTNAME}|%{IP})
+ HOSTPORT (?:%{IPORHOST=~/\./}:%{POSINT})
+
+ # paths
+ PATH (?:%{UNIXPATH}|%{WINPATH})
+ UNIXPATH (?:/(?:[\w_%!$@:.,-]+|\\.)*)+
+ #UNIXPATH (?<![\w\/])(?:/[^\/\s?*]*)+
+ LINUXTTY (?:/dev/pts/%{POSINT})
+ BSDTTY (?:/dev/tty[pq][a-z0-9])
+ TTY (?:%{BSDTTY}|%{LINUXTTY})
+ WINPATH (?:[A-Za-z]+:|\\)(?:\\[^\\?*]*)+
+ URIPROTO [A-Za-z]+(\+[A-Za-z+]+)?
+ URIHOST %{IPORHOST}(?::%{POSINT:port})?
+ # uripath comes loosely from RFC1738, but mostly from what Firefox
+ # doesn't turn into %XX
+ URIPATH (?:/[A-Za-z0-9$.+!*'(),~:#%_-]*)+
+ #URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
+ URIPARAM \?[A-Za-z0-9$.+!*'(),~#%&/=:;_-]*
+ URIPATHPARAM %{URIPATH}(?:%{URIPARAM})?
+ URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?
+
+ # Months: January, Feb, 3, 03, 12, December
+ MONTH \b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|[Mm]ay|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b
+ MONTHNUM (?:0?[1-9]|1[0-2])
+ MONTHDAY (?:3[01]|[1-2]?[0-9]|0?[1-9])
+
+ # Days: Monday, Tue, Thu, etc...
+ DAY (?:[Mm]on(?:day)?|[Tt]ue(?:sday)?|[Ww]ed(?:nesday)?|[Tt]hu(?:rsday)?|[Ff]ri(?:day)?|[Ss]at(?:urday)?|[Ss]un(?:day)?)
+
+ # Years?
+ YEAR [0-9]+
+ # Time: HH:MM:SS
+ #TIME \d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?
+ # I'm still on the fence about using grok to perform the time match,
+ # since it's probably slower.
+ # TIME %{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)?
+ HOUR (?:2[0123]|[01][0-9])
+ MINUTE (?:[0-5][0-9])
+ # '60' is a leap second in most time standards and thus is valid.
+ SECOND (?:(?:[0-5][0-9]|60)(?:[.,][0-9]+)?)
+ TIME (?<![0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9])
+ # datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
+ DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}
+ DATE_EU %{YEAR}[/-]%{MONTHNUM}[/-]%{MONTHDAY}
+ ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE}))
+ ISO8601_SECOND (?:%{SECOND}|60)
+ TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?
+ DATE %{DATE_US}|%{DATE_EU}
+ DATESTAMP %{DATE}[- ]%{TIME}
+ TZ (?:[PMCE][SD]T)
+ DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}
+ DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}
+
+ # Syslog Dates: Month Day HH:MM:SS
+ SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME}
+ PROG (?:[\w._/-]+)
+ SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])?
+ SYSLOGHOST %{IPORHOST}
+ SYSLOGFACILITY <%{POSINT:facility}.%{POSINT:priority}>
+
+ ZONE %{INT}
+ HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{ZONE}
+
+ # Shortcuts
+ QS %{QUOTEDSTRING}
+
+ # Log formats
+ SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:
+ COMBINEDAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "%{WORD:verb} %{URIPATHPARAM:request} HTTP/%{NUMBER:httpversion}" %{NUMBER:response} (?:%{NUMBER:bytes}|-) "(?:%{URI:referrer}|-)" %{QS:agent}
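
These are the default grok patterns that GrokFilter loads (DEFAULT_PATTERNS above). A minimal sketch of exercising them directly with the same grok-pure calls used in log2json.rb; the pattern file path and the sample log line are illustrative:

    require 'grok-pure'

    pile = Grok::Pile.new
    pile.add_patterns_from_file('base.patterns')
    pile.compile('%{COMBINEDAPACHELOG}')

    line = '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] ' \
           '"GET /apache_pb.gif HTTP/1.0" 200 2326 "-" "Mozilla/4.08"'
    grok, match = pile.match(line)
    match.each_capture { |key, value| puts "#{key} => #{value}" } if match
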
@@ -0,0 +1,46 @@
+ require 'log2json'
+ require 'date'
+
+ module Log2Json
+   module Filters
+     #----
+
+     class NginxAccessLogFilter < GrokFilter
+
+       def initialize(name, config={})
+         # Thanks to - http://boojapathy.wordpress.com/2012/04/29/logstash-graylog-cant-ask-more-for-logging/
+         #
+         # 10.33.158.237 - - [12/Apr/2013:13:27:54 -0000] "GET /UEFA/news.json?blackberry_native_version=1.9.4&locale=es HTTP/1.1" 200 6495 "-" "-" "-" "-" "-" cache_status:BYPASS
+         #
+         type = config.delete(:type) {'nginx-access'}
+         super(type, name, [
+           %w[ %{IP:ip}
+               (?:%{HOST:host}|-)
+               (?:%{USER:user}|-)
+               \\\[%{HTTPDATE:datetime}\\\] +"(?:%{WORD:method} %{URIPATHPARAM:path} HTTP/%{NUMBER:version}|%{DATA:request})"
+               %{NUMBER:status}
+               (?:%{NUMBER:size}|-)
+               %{QUOTEDSTRING:referrer}
+               %{QUOTEDSTRING:user_agent}
+               (?:%{GREEDYDATA:extra_info})
+           ].join(' ') ], config
+         )
+       end
+
+       def filter(record)
+         return nil if super(record).nil?
+         # eg, 23/Nov/2012:19:11:10 +0000
+         record['@timestamp'] = DateTime.strptime(record['@fields']['datetime'], "%d/%b/%Y:%T %z")
+         record['@fields'].delete('datetime')
+         record['@tags'] << "nginx" << "http"
+         record
+       end
+
+     end # NginxAccessLogFilter
+
+
+
+
+     #----
+   end
+ end
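
NginxAccessLogFilter registers itself under the 'nginx-access' type by default. A minimal sketch (paths, regexes, and names are illustrative) of pairing it with a Spitter whose type is chosen per source path, using the regex-keyed hash form that Spitter#initialize documents:

    require 'log2json'
    # Also load the file that defines NginxAccessLogFilter; its path within the gem
    # is not shown in this diff.

    spitter = Log2Json::Spitter.new(STDIN, { /nginx.*access/ => 'nginx-access' })

    filters = Hash.new { |h, k| h[k] = [] }
    filters['nginx-access'] << Log2Json::Filters::NginxAccessLogFilter.new('nginx-access')

    Log2Json.main(filters, spitter: spitter)
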
@@ -0,0 +1,62 @@
+ require 'log2json'
+ require 'date'
+
+ module Log2Json
+   module Filters
+     #----
+
+     # A default syslog filter.
+     # This works with rsyslog and its default configuration as distributed with Ubuntu 12.04 LTS.
+     #
+     # It also assumes your syslog timestamp is in UTC. To make sure, add the following line to
+     # /etc/default/rsyslog:
+     #
+     #   export TZ=UTC
+     #
+     # and then restart rsyslog. (ie, sudo service rsyslog restart)
+     # Other settings for rsyslog you might want to adjust include:
+     #
+     #
+     #   MaxMessageSize 64k             # Increase the message size allowed to 64k (default is like 2k... or something.)
+     #
+     #   $IMUXSockRateLimitInterval 0   # Disable rate limiting, so we are sure to get every single message logged.
+     #                                  # Note: Add it after $ModLoad imuxsock
+     #
+     #
+     class SyslogFilter < GrokFilter
+
+       def initialize(name, config={})
+         type = config.delete(:type) {'syslog'}
+         super(type, name, [
+           %w[ %{SYSLOGTIMESTAMP:syslog_timestamp}
+               %{SYSLOGHOST:syslog_hostname}?
+               %{PROG:syslog_program}(?:\\\[%{POSINT:syslog_pid}\\\])?:
+               %{GREEDYDATA:syslog_message}
+           ].join(' ')], config
+         )
+       end
+
+       def filter(record)
+         return nil if super(record).nil?
+         record['@received_at'] = record['@timestamp']
+         record['@received_from'] = record['@source_host']
+
+         fields = record['@fields']
+
+         fields['syslog_timestamp'] += '+0000'
+         record['@timestamp'] = DateTime.strptime(fields["syslog_timestamp"], "%b %e %T%z") # eg, Apr 12 15:55:28+0000
+
+         record['@source_host'] = fields['syslog_hostname']
+         record['@message'] = fields['syslog_message'].gsub(/#012/, "\n")
+         record['@tags'] << fields['syslog_program']
+         fields.each_key { |k| fields.delete(k) if k.start_with?('syslog_') }
+         record
+       end
+
+     end # SyslogFilter
+
+
+
+     #----
+   end
+ end
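
Log2Json.main (in log2json.rb above) also lets the environment override each Spitter::CONFIG key via a lower-cased variable of the same name, and ENV['type'] selects the record type. A minimal sketch of driving SyslogFilter that way; the tag, field, and type values are illustrative, and the file defining SyslogFilter must already be loaded (its require path is not shown in this diff):

    require 'log2json'

    ENV['type']   = 'syslog'                # '@type' given to every record
    ENV['tags']   = 'prod, web'             # becomes '@tags' => ['prod', 'web']
    ENV['fields'] = 'dc us-east role www'   # becomes '@fields' => {'dc' => 'us-east', 'role' => 'www'}

    Log2Json.main({ 'syslog' => [Log2Json::Filters::SyslogFilter.new('syslog')] })
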