log_line_parser 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/exe/log_line_parser +0 -0
- data/lib/log_line_parser/command_line_interface.rb +19 -3
- data/lib/log_line_parser/ltsv.rb +48 -0
- data/lib/log_line_parser/moe.rb +12 -7
- data/lib/log_line_parser/query.rb +84 -0
- data/lib/log_line_parser/version.rb +1 -1
- data/lib/log_line_parser.rb +52 -9
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 98ed47b6f1624564237a5fcd95df8fa59f7e68d4
|
4
|
+
data.tar.gz: 9236e365d98e5a70e3503956327ca28285d2ff0f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2af2324f7fd1b56dc88c65a68b618ec7c41a8e30e2054e59c5f96a2bf2c3db1d167998f8b8baa0bcd13f783b3ee6dfdbf0b0d4ec7f0f934434a22441cb76607e
|
7
|
+
data.tar.gz: 8301442589177b2caa33c3e8ecea442f7e8abb215dabdbd746f9020d4cfca0334d2f153a3d0194d9664826043bcfa129bc3be48ebae2b0bfcbe456f63f62c03e
|
data/README.md
CHANGED
@@ -28,7 +28,7 @@ Or install it yourself as:
|
|
28
28
|
require 'log_line_parser'
|
29
29
|
|
30
30
|
line = '192.168.3.4 - - [07/Feb/2016: ... ] ...'
|
31
|
-
LogLineParser.
|
31
|
+
LogLineParser.to_array(line)
|
32
32
|
# => ["192.168.3.4", "-", "-", "07/Feb/2016: ... ", ... ]
|
33
33
|
```
|
34
34
|
|
@@ -103,7 +103,7 @@ The command line tool `log_line_parser` can be used for two purposes:
|
|
103
103
|
1. For converting file formats
|
104
104
|
2. For picking up log records that satisfy certain criteria
|
105
105
|
|
106
|
-
For the first purpose, the tool
|
106
|
+
For the first purpose, the tool supports conversion from an Apache log format to CSV or TSV format.
|
107
107
|
And for the second purpose, criteria such as :not_found?(= :status_code_404?) or :access_by_bots? are defined, and you can combine them by writing a configuration file.
|
108
108
|
|
109
109
|
#### For converting file formats
|
data/exe/log_line_parser
CHANGED
File without changes
|
@@ -31,7 +31,8 @@ module LogLineParser
|
|
31
31
|
end
|
32
32
|
|
33
33
|
opt.on("-l [LogFormat]", "--log-format [=LogFormat]",
|
34
|
-
"Specify LogFormat
|
34
|
+
"Specify LogFormat by giving a LogFormat or one of \
|
35
|
+
formats predefined as #{predefined_options_for_log_format}") do |log_format|
|
35
36
|
options[:log_format] = log_format
|
36
37
|
end
|
37
38
|
|
@@ -41,7 +42,7 @@ module LogLineParser
|
|
41
42
|
end
|
42
43
|
|
43
44
|
opt.on("-t [format]", "--to [=format]",
|
44
|
-
"Specify a format") do |format|
|
45
|
+
"Specify a format: csv, tsv or ltsv") do |format|
|
45
46
|
options[:format] = format
|
46
47
|
end
|
47
48
|
|
@@ -79,7 +80,7 @@ module LogLineParser
|
|
79
80
|
output_log_names = collect_output_log_names(configs)
|
80
81
|
Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
|
81
82
|
queries = setup_queries_from_configs(configs, logs)
|
82
|
-
LogLineParser.each_record(
|
83
|
+
LogLineParser.each_record(parser: parser) do |line, record|
|
83
84
|
queries.each {|query| query.call(line, record) }
|
84
85
|
end
|
85
86
|
end
|
@@ -92,6 +93,9 @@ module LogLineParser
|
|
92
93
|
convert_to_csv(input, output)
|
93
94
|
when "tsv"
|
94
95
|
convert_to_tsv(input, output)
|
96
|
+
when "ltsv"
|
97
|
+
convert_to_ltsv(input, output,
|
98
|
+
choose_log_parser(options[:log_format]))
|
95
99
|
else
|
96
100
|
raise UnsupportedFormatError.new(output_format)
|
97
101
|
end
|
@@ -99,6 +103,12 @@ module LogLineParser
|
|
99
103
|
|
100
104
|
private
|
101
105
|
|
106
|
+
def self.predefined_options_for_log_format
|
107
|
+
PREDEFINED_FORMATS.keys.
|
108
|
+
map {|opt| "\"#{opt}\"" }.
|
109
|
+
join(", ")
|
110
|
+
end
|
111
|
+
|
102
112
|
def self.collect_output_log_names(configs)
|
103
113
|
configs.map do |config|
|
104
114
|
config[Query::ConfigFields::OUTPUT_LOG_NAME]
|
@@ -122,5 +132,11 @@ module LogLineParser
|
|
122
132
|
output.puts Utils.to_tsv(line.chomp)
|
123
133
|
end
|
124
134
|
end
|
135
|
+
|
136
|
+
def self.convert_to_ltsv(input, output, parser)
|
137
|
+
input.each_line do |line|
|
138
|
+
output.puts parser.to_ltsv(line.chomp)
|
139
|
+
end
|
140
|
+
end
|
125
141
|
end
|
126
142
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
module LogLineParser
|
4
|
+
|
5
|
+
module Ltsv
|
6
|
+
LABEL_SEPARATOR = ":"
|
7
|
+
TAB = "\t"
|
8
|
+
|
9
|
+
##
|
10
|
+
# Label names are borrowed from
|
11
|
+
# http://ltsv.org/
|
12
|
+
|
13
|
+
FORMAT_STRING_LABEL_TABLE = {
|
14
|
+
"%t" => "time",
|
15
|
+
"%h" => "host",
|
16
|
+
"%{X-Forwarded-For}i" => "forwardedfor",
|
17
|
+
"%l" => "ident",
|
18
|
+
"%u" => "user",
|
19
|
+
"%r" => "req",
|
20
|
+
"%m" => "method",
|
21
|
+
"%U%q" => "uri",
|
22
|
+
"%H" => "protocol",
|
23
|
+
"%>s" => "status",
|
24
|
+
"%B" => "size",
|
25
|
+
"%b" => "size",
|
26
|
+
"%I" => "reqsize",
|
27
|
+
"%{Referer}i" => "referer",
|
28
|
+
"%{User-agent}i" => "ua",
|
29
|
+
"%{Host}i" => "vhost",
|
30
|
+
"%D" => "reqtime_microsec",
|
31
|
+
"%T" => "reqtime",
|
32
|
+
"%{X-Cache}o" => "cache",
|
33
|
+
"%{X-Runtime}o" => "runtime",
|
34
|
+
# "-" => "apptime",
|
35
|
+
}
|
36
|
+
|
37
|
+
def self.format_strings_to_labels(format_strings)
|
38
|
+
format_strings.map do |string|
|
39
|
+
FORMAT_STRING_LABEL_TABLE[string]||string
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.to_ltsv(labels, values)
|
44
|
+
fields = labels.zip(values).map {|field| field.join(LABEL_SEPARATOR) }
|
45
|
+
fields.join(TAB)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/lib/log_line_parser/moe.rb
CHANGED
@@ -3,16 +3,21 @@
|
|
3
3
|
require 'log_line_parser'
|
4
4
|
require 'log_line_parser/utils'
|
5
5
|
|
6
|
-
# MoeLogParser is added from the personal needs of the original author,
|
7
|
-
# and the LogFormat for it is not a widely used format.
|
8
|
-
# You may remove this file if you don't need it.
|
9
|
-
# (MOE is the acronym of the organization's name for which the author
|
10
|
-
# is working at the time of the first release of this program.)
|
11
|
-
|
12
6
|
module LogLineParser
|
13
|
-
|
7
|
+
|
8
|
+
# MoeLogFormat and MoeLogParser are added for the personal needs of the
|
9
|
+
# original author, and the log format is not a widely used one.
|
10
|
+
# You may remove this file if you don't need it.
|
11
|
+
# (MOE is the acronym of the name of the organization for which
|
12
|
+
# the author is working at the time of the first release of this program.)
|
13
|
+
#
|
14
|
+
# MoeLogFormat = CombinedLogFormat + "%D"
|
14
15
|
MoeLogFormat = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %D"
|
16
|
+
|
17
|
+
##
|
18
|
+
# Parser of MoeLogFormat
|
15
19
|
MoeLogParser = parser(MoeLogFormat)
|
20
|
+
|
16
21
|
PREDEFINED_FORMATS['moe'] = MoeLogParser
|
17
22
|
end
|
18
23
|
|
@@ -75,6 +75,10 @@ YetiBot
|
|
75
75
|
bots_re =~ record.user_agent
|
76
76
|
end
|
77
77
|
|
78
|
+
##
|
79
|
+
# Returns true if the path+query part of the value of %{Referer}i
|
80
|
+
# matches one of the resources.
|
81
|
+
|
78
82
|
def self.referred_from_resources?(record, resources=[])
|
79
83
|
resources.include?(record.referer_resource)
|
80
84
|
end
|
@@ -183,11 +187,50 @@ YetiBot
|
|
183
187
|
bots_re =~ record.user_agent
|
184
188
|
end
|
185
189
|
|
190
|
+
##
|
191
|
+
# Returns true if the path+query part of the value of %{Referer}i
|
192
|
+
# matches one of the resources that are passed as the second
|
193
|
+
# argument when you create an instance of Query.
|
194
|
+
#
|
195
|
+
# When a given resource is a directory, you should append a "/" at the
|
196
|
+
# end of it, otherwise you would get a wrong result. For example,
|
197
|
+
# suppose you define the following queries:
|
198
|
+
#
|
199
|
+
# correct_query = Query.new("www.example.org", ["/dir/subdir/"])
|
200
|
+
# wrong_query = Query.new("www.example.org", ["/dir/subdir"])
|
201
|
+
#
|
202
|
+
# <tt>correct_query.referred_from_resources?(record)</tt> returns true
|
203
|
+
# when the value of %{Referer}i is "http://www.example.org/dir/subdir"
|
204
|
+
# or "http://www.example.org/dir/subdir/",
|
205
|
+
# but <tt>wrong_query.referred_from_resources?(record)</tt> returns
|
206
|
+
# false for "http://www.example.org/dir/subdir/".
|
207
|
+
|
186
208
|
def referred_from_resources?(record)
|
187
209
|
if_matching_domain(record) and
|
188
210
|
@normalized_resources.include?(record.referer_resource)
|
189
211
|
end
|
190
212
|
|
213
|
+
##
|
214
|
+
# Returns true if the path+query part of the value of %{Referer}i
|
215
|
+
# begins with one of the resources that are passed as the second
|
216
|
+
# argument when you create an instance of Query.
|
217
|
+
#
|
218
|
+
# When a given resource is a directory, you should append a "/" at the
|
219
|
+
# end of it, otherwise you would get a wrong result. For example,
|
220
|
+
# suppose you define the following queries:
|
221
|
+
#
|
222
|
+
# correct_query = Query.new("www.example.org", ["/dir/subdir/"])
|
223
|
+
# wrong_query = Query.new("www.example.org", ["/dir/subdir"])
|
224
|
+
#
|
225
|
+
# <tt>wrong_query.referred_from_under_resources?(record)</tt>
|
226
|
+
# returns true even when the value of %{Referer}i in record is
|
227
|
+
# "http://www.example.org/dir/subdir_for_images/a_file_name",
|
228
|
+
# while <tt>correct_query.referred_from_under_resources?(record)</tt>
|
229
|
+
# returns true when the value of %{Referer}i is
|
230
|
+
# "http://www.example.org/dir/subdir/a_filename" or
|
231
|
+
# "http://www.example.org/dir/subdir",
|
232
|
+
# and returns false for "http://www.example.org/dir/subdir_for_images".
|
233
|
+
|
191
234
|
def referred_from_under_resources?(record)
|
192
235
|
referer_resource = record.referer_resource
|
193
236
|
if_matching_domain(record) and
|
@@ -195,10 +238,51 @@ YetiBot
|
|
195
238
|
@resources.any?{|target| referer_resource.start_with?(target) }
|
196
239
|
end
|
197
240
|
|
241
|
+
##
|
242
|
+
# Returns true if the value of %U%q in +record+ matches one of the
|
243
|
+
# resources that are passed as the second argument when you create
|
244
|
+
# an instance of Query.
|
245
|
+
#
|
246
|
+
# When you give a directory as one of resources, you should append
|
247
|
+
# a "/" at the end of the directory, otherwise records whose %U%q
|
248
|
+
# value points to the same directory but without trailing "/"
|
249
|
+
# will return false.
|
250
|
+
#
|
251
|
+
# For example, when you create queries as follows,
|
252
|
+
#
|
253
|
+
# query_with_slash = Query.new("www.example.org", ["/dir/subdir/"])
|
254
|
+
# query_without_slash = Query.new("www.example.org", ["/dir/subdir"])
|
255
|
+
#
|
256
|
+
# <tt>query_with_slash.access_to_resources?(record)</tt> returns true for
|
257
|
+
# both of records whose %U%q value is "/dir/subdir/" and "/dir/subdir"
|
258
|
+
# respectively.
|
259
|
+
#
|
260
|
+
# But <tt>query_without_slash.access_to_resources?(record)</tt> returns
|
261
|
+
# false for a record whose %U%q value is "/dir/subdir/"
|
262
|
+
|
198
263
|
def access_to_resources?(record)
|
199
264
|
@normalized_resources.include?(record.resource)
|
200
265
|
end
|
201
266
|
|
267
|
+
##
|
268
|
+
# Returns true if the value of %U%q in +record+ begins with one
|
269
|
+
# of the resources that are passed as the second argument when
|
270
|
+
# you create an instance of Query.
|
271
|
+
#
|
272
|
+
# When a given resource is a directory, you should append a "/" at the
|
273
|
+
# end of it, otherwise you would get a wrong result. For example,
|
274
|
+
# suppose you define the following queries:
|
275
|
+
#
|
276
|
+
# correct_query = Query.new("www.example.org", ["/dir/subdir/"])
|
277
|
+
# wrong_query = Query.new("www.example.org", ["/dir/subdir"])
|
278
|
+
#
|
279
|
+
# <tt>wrong_query.access_to_under_resources?(record)</tt>
|
280
|
+
# returns true even when the value of %U%q in record is
|
281
|
+
# "/dir/subdir_for_images/a_file_name", while
|
282
|
+
# <tt>correct_query.access_to_under_resources?(record)</tt>
|
283
|
+
# returns true when the value of %U%q is "/dir/subdir/a_filename" or
|
284
|
+
# "/dir/subdir", and returns false for "/dir/subdir_for_images".
|
285
|
+
|
202
286
|
def access_to_under_resources?(record)
|
203
287
|
resource = record.resource
|
204
288
|
@normalized_dirs.include?(resource) or
|
data/lib/log_line_parser.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
require "log_line_parser/version"
|
4
4
|
require "log_line_parser/line_parser"
|
5
5
|
require "log_line_parser/apache"
|
6
|
+
require "log_line_parser/ltsv"
|
6
7
|
require "strscan"
|
7
8
|
require "time"
|
8
9
|
require "date"
|
@@ -65,12 +66,12 @@ module LogLineParser
|
|
65
66
|
root.subnodes.map {|node| node.to_s }
|
66
67
|
end
|
67
68
|
|
68
|
-
def to_hash(
|
69
|
-
|
69
|
+
def to_hash(parser=CombinedLogParser)
|
70
|
+
parser.to_hash(to_a)
|
70
71
|
end
|
71
72
|
|
72
|
-
def to_record(
|
73
|
-
|
73
|
+
def to_record(parser=CombinedLogParser)
|
74
|
+
parser.create(to_a)
|
74
75
|
end
|
75
76
|
end
|
76
77
|
|
@@ -82,6 +83,7 @@ module LogLineParser
|
|
82
83
|
def setup(field_names, format_strings=nil)
|
83
84
|
@field_names = field_names
|
84
85
|
@format_strings = format_strings
|
86
|
+
@ltsv_labels = Ltsv.format_strings_to_labels(format_strings)
|
85
87
|
@number_of_fields = field_names.length
|
86
88
|
@referer_defined = field_names.include?(:referer)
|
87
89
|
@parse_time_value = false
|
@@ -115,6 +117,13 @@ module LogLineParser
|
|
115
117
|
h
|
116
118
|
end
|
117
119
|
|
120
|
+
def to_ltsv(line)
|
121
|
+
values = line.kind_of?(Array) ? line : LogLineParser.parse(line).to_a
|
122
|
+
Ltsv.to_ltsv(@ltsv_labels, values)
|
123
|
+
end
|
124
|
+
|
125
|
+
private
|
126
|
+
|
118
127
|
def parse_request(h)
|
119
128
|
if first_line_of_request = h["%r".freeze]
|
120
129
|
request = first_line_of_request.split(/ /)
|
@@ -124,8 +133,6 @@ module LogLineParser
|
|
124
133
|
end
|
125
134
|
end
|
126
135
|
|
127
|
-
private
|
128
|
-
|
129
136
|
def response_size(rec)
|
130
137
|
size_str = rec.response_bytes
|
131
138
|
size_str == "-".freeze ? 0 : size_str.to_i
|
@@ -183,6 +190,17 @@ module LogLineParser
|
|
183
190
|
record_type
|
184
191
|
end
|
185
192
|
|
193
|
+
private_class_method :create_record_type
|
194
|
+
|
195
|
+
##
|
196
|
+
# Creates a parser from a LogFormat.
|
197
|
+
#
|
198
|
+
# For example,
|
199
|
+
#
|
200
|
+
# parser = LogLineParser.parser("%h %l %u %t \"%r\" %>s %b")
|
201
|
+
#
|
202
|
+
# creates the parser of Common Log Format.
|
203
|
+
|
186
204
|
def self.parser(log_format)
|
187
205
|
if log_format.kind_of? String
|
188
206
|
format_strings = Apache.parse_log_format(log_format)
|
@@ -206,24 +224,49 @@ module LogLineParser
|
|
206
224
|
# LogLineTokenizer.tokenize(line.chomp, stack)
|
207
225
|
end
|
208
226
|
|
227
|
+
##
|
228
|
+
# Turns a line of Apache access logs into an array of field values.
|
229
|
+
#
|
230
|
+
# Escaped characters such as "\\t" or "\\"" will be unescaped.
|
231
|
+
|
209
232
|
def self.to_array(line)
|
210
233
|
parse(line).to_a
|
211
234
|
end
|
212
235
|
|
236
|
+
##
|
237
|
+
# Parser of Common Log Format (CLF)
|
238
|
+
#
|
239
|
+
# ref: https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format
|
240
|
+
|
213
241
|
CommonLogParser = parser(Apache::LogFormat::COMMON)
|
242
|
+
|
243
|
+
##
|
244
|
+
# Parser of Common Log Format with Virtual Host
|
245
|
+
|
214
246
|
CommonLogWithVHParser = parser(Apache::LogFormat::COMMON_WITH_VH)
|
247
|
+
|
248
|
+
##
|
249
|
+
# Parser of NCSA extended/combined log format
|
250
|
+
|
215
251
|
CombinedLogParser = parser(Apache::LogFormat::COMBINED)
|
216
252
|
|
217
253
|
PREDEFINED_FORMATS['common'] = CommonLogParser
|
218
254
|
PREDEFINED_FORMATS['common_with_vh'] = CommonLogWithVHParser
|
219
255
|
PREDEFINED_FORMATS['combined'] = CombinedLogParser
|
220
256
|
|
221
|
-
|
257
|
+
##
|
258
|
+
# Reads each line from +input+ (Apache access log files are expected) and
|
259
|
+
# parses it, then yields the line and the parsed result (+record+) to the
|
260
|
+
# associated block.
|
261
|
+
#
|
262
|
+
# When it fails to parse a line, the line will be printed to +error_output+
|
263
|
+
|
264
|
+
def self.each_record(parser: CombinedLogParser,
|
222
265
|
input: ARGF,
|
223
|
-
error_output: STDERR)
|
266
|
+
error_output: STDERR) # :yields: line, record
|
224
267
|
input.each_line do |line|
|
225
268
|
begin
|
226
|
-
yield line,
|
269
|
+
yield line, parser.parse(line)
|
227
270
|
rescue MalFormedRecordError => e
|
228
271
|
error_output.print e.message
|
229
272
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: log_line_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- HASHIMOTO, Naoki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -62,6 +62,7 @@ files:
|
|
62
62
|
- lib/log_line_parser/apache.rb
|
63
63
|
- lib/log_line_parser/command_line_interface.rb
|
64
64
|
- lib/log_line_parser/line_parser.rb
|
65
|
+
- lib/log_line_parser/ltsv.rb
|
65
66
|
- lib/log_line_parser/moe.rb
|
66
67
|
- lib/log_line_parser/query.rb
|
67
68
|
- lib/log_line_parser/utils.rb
|