log2json 0.1.5

data/bin/track-tails ADDED
@@ -0,0 +1,54 @@
+ #!/usr/bin/env ruby
+ #
+ # Helper for the tail-log.sh script that actually creates and maintains the sincedb.
+ #
+ # A sincedb is just a directory tree of text files that record the stats of
+ # the files we're following. Each file records the inode, the size, and the
+ # number of lines read from the start of the file as of the last run.
+ #
+ require 'fileutils'
+
+
+ @fmap = {} # path => [inode_number, file_size, number_of_lines_read]
+ @sincedb_dir = ARGV.shift()
+
+ # Note: We expect each -n+N argument to be followed by a file path.
+ while not ARGV.empty? do
+   arg = ARGV.shift()
+   if arg =~ /^-n\+(\d+)$/
+     start_line = $1.to_i
+     fpath = ARGV.shift()
+     next if not File.exist?(fpath)
+     fstat = File.stat(fpath)
+     # tail's -n+N starts output at line N, so N-1 lines were already read.
+     @fmap[fpath] = [fstat.ino, fstat.size, start_line - 1]
+   end
+ end
+
+ def commit
+   return if @fmap.nil?
+   @fmap.each do |fpath, t|
+     sincedb_path = "#{@sincedb_dir}/#{fpath}.since"
+     FileUtils.mkdir_p(File.dirname(sincedb_path))
+     IO.write(sincedb_path, "#{t.join(' ')}\n")
+   end
+ end
+ at_exit(&method(:commit))
+
+ # Note: You probably want to set the locale env var: LC_CTYPE=en_US.UTF-8
+ # Each block of log lines on stdin is expected to start with a tail-style
+ # header line (==> path <==), optionally tagged with a state marker such as
+ # [new_file] or [truncated].
+ while line = gets()
+   if line =~ /^==> (.+) <==(?: \[(.+)\])?$/
+     fpath = $1
+     if @fmap[fpath].nil? or $2 =~ /^new_file$|^truncated$/
+       fstat = File.stat(fpath)
+       @fmap[fpath] = [fstat.ino, fstat.size, 0]
+     end
+     STDOUT.write(line); STDOUT.flush()
+     next
+   end
+   STDOUT.write(line); STDOUT.flush()
+   @fmap[fpath][2] += 1
+   # Note: In the case of interruption, there's a chance that the line count is
+   #       one line behind the number of log lines written to stdout. This is
+   #       OK since we'd rather output a duplicate log line than miss one.
+ end
+
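
For context, here is a minimal sketch (not part of the gem; the sincedb path and the resume logic are illustrative assumptions) of how a follower such as tail-log.sh could use a sincedb entry written by the script above to resume a file:

    # Hypothetical consumer of a sincedb entry; names and paths are made up.
    since_file = '/var/spool/sincedb/var/log/syslog.since'
    inode, size, lines_read = IO.read(since_file).split.map(&:to_i)
    fstat = File.stat('/var/log/syslog')
    # If the inode changed or the file shrank, it was rotated or truncated,
    # so start over from the first line.
    lines_read = 0 if fstat.ino != inode || fstat.size < size
    exec('tail', '-F', '-n', "+#{lines_read + 1}", '/var/log/syslog')
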
data/lib/log2json.rb ADDED
@@ -0,0 +1,217 @@
+ require 'json'
+ require 'grok-pure'
+
+ module Log2Json
+
+   def self.main(filters, opts={})
+     output = opts[:output] || STDOUT
+     spitter = opts[:spitter]
+     if spitter.nil?
+       # Configure the spitter to take config overrides from ENV.
+       config = {}
+       Spitter::CONFIG.keys.each do |name|
+         key = name.to_s.downcase
+         next unless ENV.member?(key)
+         # ENV values are strings, so a regex override must be recompiled.
+         config[name] = name == :SOURCE_SEPARATOR_REGEX ? Regexp.new(ENV[key]) : ENV[key]
+       end
+       spitter = ::Log2Json::Spitter.new(STDIN, ENV['type'], config)
+     end
+     spitter.each_record do |rec|
+       filters[rec['@type']].each { |f| f.filter(rec) }
+       if ! rec['@timestamp'].nil?
+         output.write(rec.to_json() << "\n")
+         output.flush()
+         # NOTE: Ruby's built-in json module, by default, doesn't output any
+         #       literal newline characters while serializing, so using
+         #       newlines as the JSON record separator is fine here.
+       end
+     end
+   end
+
+
+   # A generic front-end to filters. It sits between an input and a filter, taking
+   # log lines from an input and normalizing them into logstash-compatible JSON log
+   # records for filters to consume.
+   #
+   # An input represents the source of log records. The only requirement of an input
+   # is that it writes to stdout a stream of lines (one line per log record), with
+   # a header line indicating the source (eg, file path, url, ...) of the log lines
+   # that follow it. By default, the format of such a source-indicating line is the
+   # same as the one printed by the tail utility when multiple files are followed
+   # (ie, ==> file-a.txt <==). The format is customizable via a regex.
+   #
+   # For each type of logs you'd like to ship, there will be 1 input process, 1 log2json
+   # process (with perhaps multiple filters configured), and 1 output process, all
+   # connected via unix pipes. The idea is that you can implement your log input and
+   # output processes as shell scripts, while filters can be implemented in ruby
+   # (likely using Log2Json::Filters::GrokFilter) and installed as ruby gems. Then
+   # you configure and combine the filters and create a Spitter that uses them.
+   # See the log2json ruby script for details.
+   #
+   class Spitter
+
+     CONFIG = {
+       LOG_INPUT_ENCODING: "UTF-8",
+       UTC_TIMESTAMP_FORMAT: "%FT%T.%6NZ",
+       SOURCE_SEPARATOR_REGEX: Regexp.new("^==> (.+) <=="),
+       # because /.../ screws up syntax highlighting in vim, we use Regexp.new(...)
+
+       TAGS: '',
+       FIELDS: '',
+     }
+
+     attr_reader :options
+
+     def initialize(input_file, type, opts={})
+       @input = input_file
+       @type = type || ''
+       # type can be either a string, or a hash whose keys are path regexes
+       # and whose values are type strings.
+
+       @options = CONFIG.merge(opts)
+
+       @source_host = %x(hostname).chomp()
+       @source_path = nil
+       @tags = options[:TAGS].strip.split(/\s*,\s*/)
+
+       fields = options[:FIELDS].strip.gsub(/,/, ' ').split(/ +/)
+       raise "Number of keys and values in fields must be even!" if fields.length % 2 != 0
+
+       @fields = {}
+       while not fields.empty? do
+         k, v = fields.pop(2)
+         @fields[k] = v
+       end
+     end
+
+     def each_record(&block)
+       @input.each_line do |line|
+         line.force_encoding(options[:LOG_INPUT_ENCODING])
+         line.chomp!
+         next if line.empty?
+         if line =~ options[:SOURCE_SEPARATOR_REGEX]
+           @source_path = $1
+           next
+         end
+         block.call({
+           # Every record has a '@type'; this is how we match filters to log records.
+           # Note: in Ruby 1.9, Hashes are ordered, so we'll be matching the source
+           # path against the regexes in the order they are defined.
+           '@type' => if @type.is_a?(String)
+                        @type
+                      else # @type is a Hash
+                        if type = @type.find { |re, t| re =~ @source_path }
+                          type[1]
+                        else
+                          @type[nil] || ''
+                        end
+                      end,
+           '@source_path' => @source_path,
+           '@source_host' => @source_host,
+           '@timestamp' => Time.new.utc.strftime(options[:UTC_TIMESTAMP_FORMAT]),
+           '@message' => line,
+           '@tags' => @tags.clone, # defaults to []
+           '@fields' => @fields.clone, # defaults to {}
+         })
+       end
+     end
+   end # Spitter
+
+
+   module Filters #--------------------------------------
+
+     # A filter takes a JSON log record and processes it by adding, correcting,
+     # or even removing attributes from it if necessary.
+     class GrokFilter
+
+       DEFAULT_PATTERNS = File.join(File.dirname(__FILE__),
+                                    'log2json', 'filters', 'base.patterns')
+
+       CONFIG = {
+         NAMED_CAPTURES_ONLY: true,
+         KEEP_EMPTY_CAPTURES: false
+       }
+
+
+       attr_reader :type, :name
+
+       def initialize(type, name, regexps, opts={}, &filter_block)
+         @type = type
+         @name = name
+         @filter_block = filter_block
+         @record_kvs = opts.select { |k, v| k.to_s.start_with?('@') }
+         # Merge so that options given here override the CONFIG defaults.
+         @config = CONFIG.merge(opts.reject { |k, v| k.to_s.start_with?('@') })
+
+         @pile = Grok::Pile.new
+         @pile.add_patterns_from_file(@config[:pattern_file] || DEFAULT_PATTERNS)
+         regexps.each { |re| @pile.compile(re) }
+       end
+
+       # Filter the log record.
+       #
+       # This means checking if the record matches the patterns of this filter
+       # and, if there's a match, adding the captured groups as members of the
+       # record's @fields.
+       #
+       # Any '@' key-values configured for this filter will also
+       # be added to the record after merging the captured groups.
+       #
+       # Returns the record at the end if there's a match, otherwise returns nil.
+       # If the '@timestamp' attribute is removed from a record, then the record
+       # will be dropped.
+       def filter(record)
+         grok, match = @pile.match(record['@message'])
+         if match
+           # code stolen and modified from logstash's grok filter.
+           fields = record['@fields']
+           match.each_capture() do |key, value|
+             next if value.nil? and not @config[:KEEP_EMPTY_CAPTURES]
+             if key.include?(':')
+               pattern_name, key, value_type = key.split(':') # ie, %{pattern_name:key:value_type}
+               case value_type
+               when 'int'  ; value = value.to_i
+               when 'float'; value = value.to_f
+               end
+             else
+               next if @config[:NAMED_CAPTURES_ONLY]
+             end
+             if fields[key].nil?
+               fields[key] = value
+             else
+               # If there already exists a field for the captured value, then we
+               # aggregate the captured values in an array for the field.
+               if not fields[key].is_a?(Array)
+                 fields[key] = [fields[key]]
+               end
+               fields[key] << value
+             end
+           end
+
+           record.merge!(@record_kvs) do |k, oldval, newval|
+             # The block's return value becomes the merged value on a key
+             # conflict, so always return one.
+             if k == '@tags'
+               (oldval + newval).uniq
+             elsif k == '@fields'
+               oldval.merge(newval)
+             else
+               newval
+             end
+           end
+           (fields['filtered_by'] ||= []) << name
+           if @filter_block
+             @filter_block.call(record)
+           else
+             record
+           end
+         else
+           nil
+         end
+       end
+     end # end class GrokFilter
+
+
+   end # end module Filters
+
+
+ end # Log2Json module
+
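
The class comment above describes the intended wiring. As a minimal sketch (the filter pattern and names here are illustrative, not from the gem), combining a GrokFilter with Log2Json.main might look like:

    require 'log2json'

    # One filter for records whose '@type' is 'syslog'; for records to get
    # that type, run with the env var type=syslog (see main() above).
    filter = Log2Json::Filters::GrokFilter.new(
      'syslog', 'example-filter',
      ['%{SYSLOGTIMESTAMP:ts} %{GREEDYDATA:rest}'])

    # main() looks filters up by record '@type'; a default block keeps an
    # unmatched type from raising on the nil lookup.
    filters = Hash.new { |h, k| h[k] = [] }
    filters[filter.type] << filter

    # Reads tail-style lines from STDIN, writes JSON records to STDOUT.
    Log2Json.main(filters)
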
data/lib/log2json/filters/base.patterns ADDED
@@ -0,0 +1,93 @@
+ USERNAME [a-zA-Z0-9_-]+
+ USER %{USERNAME}
+ INT (?:[+-]?(?:[0-9]+))
+ BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))
+ NUMBER (?:%{BASE10NUM})
+ BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+))
+ BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b
+
+ POSINT \b(?:[0-9]+)\b
+ WORD \b\w+\b
+ NOTSPACE \S+
+ DATA .*?
+ GREEDYDATA .*
+ #QUOTEDSTRING (?:(?<!\\)(?:"(?:\\.|[^\\"])*"|(?:'(?:\\.|[^\\'])*')|(?:`(?:\\.|[^\\`])*`)))
+ QUOTEDSTRING (?:(?<!\\)(?:"(?>[^\\"]+|\\.)*")|(?:'(?>[^\\']+|\\.)*')|(?:`(?>[^\\`]+|\\.)*`))
+
+ # Networking
+ MAC (?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})
+ CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})
+ WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})
+ COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})
+ IP (?<![0-9])(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))(?![0-9])
+ HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)
+ HOST %{HOSTNAME}
+ IPORHOST (?:%{HOSTNAME}|%{IP})
+ HOSTPORT (?:%{IPORHOST=~/\./}:%{POSINT})
+
+ # paths
+ PATH (?:%{UNIXPATH}|%{WINPATH})
+ UNIXPATH (?:/(?:[\w_%!$@:.,-]+|\\.)*)+
+ #UNIXPATH (?<![\w\/])(?:/[^\/\s?*]*)+
+ LINUXTTY (?:/dev/pts/%{POSINT})
+ BSDTTY (?:/dev/tty[pq][a-z0-9])
+ TTY (?:%{BSDTTY}|%{LINUXTTY})
+ WINPATH (?:[A-Za-z]+:|\\)(?:\\[^\\?*]*)+
+ URIPROTO [A-Za-z]+(\+[A-Za-z+]+)?
+ URIHOST %{IPORHOST}(?::%{POSINT:port})?
+ # uripath comes loosely from RFC1738, but mostly from what Firefox
+ # doesn't turn into %XX
+ URIPATH (?:/[A-Za-z0-9$.+!*'(),~:#%_-]*)+
+ #URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
+ URIPARAM \?[A-Za-z0-9$.+!*'(),~#%&/=:;_-]*
+ URIPATHPARAM %{URIPATH}(?:%{URIPARAM})?
+ URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?
+
+ # Months: January, Feb, 3, 03, 12, December
+ MONTH \b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|[Mm]ay|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b
+ MONTHNUM (?:0?[1-9]|1[0-2])
+ MONTHDAY (?:3[01]|[1-2]?[0-9]|0?[1-9])
+
+ # Days: Monday, Tue, Thu, etc...
+ DAY (?:[Mm]on(?:day)?|[Tt]ue(?:sday)?|[Ww]ed(?:nesday)?|[Tt]hu(?:rsday)?|[Ff]ri(?:day)?|[Ss]at(?:urday)?|[Ss]un(?:day)?)
+
+ # Years?
+ YEAR [0-9]+
+ # Time: HH:MM:SS
+ #TIME \d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?
+ # I'm still on the fence about using grok to perform the time match,
+ # since it's probably slower.
+ # TIME %{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)?
+ HOUR (?:2[0123]|[01][0-9])
+ MINUTE (?:[0-5][0-9])
+ # '60' is a leap second in most time standards and thus is valid.
+ SECOND (?:(?:[0-5][0-9]|60)(?:[.,][0-9]+)?)
+ TIME (?<![0-9])%{HOUR}:%{MINUTE}(?::%{SECOND})(?![0-9])
+ # datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
+ DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}
+ DATE_EU %{YEAR}[/-]%{MONTHNUM}[/-]%{MONTHDAY}
+ ISO8601_TIMEZONE (?:Z|[+-]%{HOUR}(?::?%{MINUTE}))
+ ISO8601_SECOND (?:%{SECOND}|60)
+ TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR}:?%{MINUTE}(?::?%{SECOND})?%{ISO8601_TIMEZONE}?
+ DATE %{DATE_US}|%{DATE_EU}
+ DATESTAMP %{DATE}[- ]%{TIME}
+ TZ (?:[PMCE][SD]T)
+ DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}
+ DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}
+
+ # Syslog Dates: Month Day HH:MM:SS
+ SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME}
+ PROG (?:[\w._/-]+)
+ SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])?
+ SYSLOGHOST %{IPORHOST}
+ SYSLOGFACILITY <%{POSINT:facility}.%{POSINT:priority}>
+
+ ZONE %{INT}
+ HTTPDATE %{MONTHDAY}/%{MONTH}/%{YEAR}:%{TIME} %{ZONE}
+
+ # Shortcuts
+ QS %{QUOTEDSTRING}
+
+ # Log formats
+ SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:
+ COMBINEDAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "%{WORD:verb} %{URIPATHPARAM:request} HTTP/%{NUMBER:httpversion}" %{NUMBER:response} (?:%{NUMBER:bytes}|-) "(?:%{URI:referrer}|-)" %{QS:agent}
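
These patterns are consumed through the grok-pure gem (see GrokFilter above), so a quick way to try one out is a standalone Grok object; the sample line below is made up:

    require 'log2json'   # pulls in grok-pure and DEFAULT_PATTERNS

    grok = Grok.new
    # DEFAULT_PATTERNS is the path to this base.patterns file.
    grok.add_patterns_from_file(Log2Json::Filters::GrokFilter::DEFAULT_PATTERNS)
    grok.compile('%{SYSLOGBASE}')
    match = grok.match('Apr 12 15:55:28 myhost sshd[1234]: some message')
    match.each_capture { |key, value| puts "#{key} => #{value.inspect}" } if match
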
@@ -0,0 +1,46 @@
+ require 'log2json'
+ require 'date'
+
+ module Log2Json
+   module Filters
+     #----
+
+     class NginxAccessLogFilter < GrokFilter
+
+       def initialize(name, config={})
+         # Thanks to - http://boojapathy.wordpress.com/2012/04/29/logstash-graylog-cant-ask-more-for-logging/
+         #
+         # Example log line:
+         # 10.33.158.237 - - [12/Apr/2013:13:27:54 -0000] "GET /UEFA/news.json?blackberry_native_version=1.9.4&locale=es HTTP/1.1" 200 6495 "-" "-" "-" "-" "-" cache_status:BYPASS
+         #
+         type = config.delete(:type) { 'nginx-access' }
+         super(type, name, [
+           %w[ %{IP:ip}
+               (?:%{HOST:host}|-)
+               (?:%{USER:user}|-)
+               \\\[%{HTTPDATE:datetime}\\\] +"(?:%{WORD:method} %{URIPATHPARAM:path} HTTP/%{NUMBER:version}|%{DATA:request})"
+               %{NUMBER:status}
+               (?:%{NUMBER:size}|-)
+               %{QUOTEDSTRING:referrer}
+               %{QUOTEDSTRING:user_agent}
+               (?:%{GREEDYDATA:extra_info})
+           ].join(' ') ], config
+         )
+       end
+
+       def filter(record)
+         return nil if super(record).nil?
+         # eg, 23/Nov/2012:19:11:10 +0000
+         record['@timestamp'] = DateTime.strptime(record['@fields']['datetime'], "%d/%b/%Y:%T %z")
+         record['@fields'].delete('datetime')
+         record['@tags'] << "nginx" << "http"
+         record
+       end
+
+     end # NginxAccessLogFilter
+
+     #----
+   end
+ end
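
A hedged usage sketch: the record below is hand-built the way Spitter would build it, using the sample access-log line from the comment above; the require path is an assumption, not taken from the gem:

    require 'log2json/filters/nginx'   # assumed require path for this file

    filter = Log2Json::Filters::NginxAccessLogFilter.new('example-nginx')
    record = {
      '@type' => 'nginx-access',
      '@message' => '10.33.158.237 - - [12/Apr/2013:13:27:54 -0000] ' +
                    '"GET /UEFA/news.json?blackberry_native_version=1.9.4&locale=es HTTP/1.1" ' +
                    '200 6495 "-" "-" "-" "-" "-" cache_status:BYPASS',
      '@tags' => [], '@fields' => {},
    }
    filter.filter(record)
    # On a match, record['@fields'] gains the ip/method/path/status/... captures
    # and record['@timestamp'] is re-parsed from the access log's own datetime.
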
@@ -0,0 +1,62 @@
+ require 'log2json'
+ require 'date'
+
+ module Log2Json
+   module Filters
+     #----
+
+     # A default syslog filter.
+     # This works with rsyslog and its default configuration as distributed with Ubuntu 12.04 LTS.
+     #
+     # It also assumes your syslog timestamps are in UTC. To make sure, add the following line to
+     # /etc/default/rsyslog:
+     #
+     #   export TZ=UTC
+     #
+     # and then restart rsyslog (ie, sudo service rsyslog restart).
+     # Other rsyslog settings you might want to adjust include:
+     #
+     #   $MaxMessageSize 64k           # Increase the allowed message size to 64k (the default is much smaller).
+     #
+     #   $IMUXSockRateLimitInterval 0  # Disable rate limiting, so we are sure to get every single message logged.
+     #                                 # Note: Add it after $ModLoad imuxsock
+     #
+     class SyslogFilter < GrokFilter
+
+       def initialize(name, config={})
+         type = config.delete(:type) { 'syslog' }
+         super(type, name, [
+           %w[ %{SYSLOGTIMESTAMP:syslog_timestamp}
+               %{SYSLOGHOST:syslog_hostname}?
+               %{PROG:syslog_program}(?:\\\[%{POSINT:syslog_pid}\\\])?:
+               %{GREEDYDATA:syslog_message}
+           ].join(' ')], config
+         )
+       end
+
+       def filter(record)
+         return nil if super(record).nil?
+         record['@received_at'] = record['@timestamp']
+         record['@received_from'] = record['@source_host']
+
+         fields = record['@fields']
+
+         fields['syslog_timestamp'] += '+0000'
+         record['@timestamp'] = DateTime.strptime(fields["syslog_timestamp"], "%b %e %T%z") # eg, Apr 12 15:55:28+0000
+
+         record['@source_host'] = fields['syslog_hostname']
+         # rsyslog escapes embedded newlines as #012; undo that here.
+         record['@message'] = fields['syslog_message'].gsub(/#012/, "\n")
+         record['@tags'] << fields['syslog_program']
+         # delete_if is safer than deleting keys while iterating with each_key.
+         fields.delete_if { |k, v| k.start_with?('syslog_') }
+         record
+       end
+
+     end # SyslogFilter
+
+     #----
+   end
+ end
+ end