log_line_parser 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/exe/log_line_parser +0 -0
- data/lib/log_line_parser/command_line_interface.rb +19 -3
- data/lib/log_line_parser/ltsv.rb +48 -0
- data/lib/log_line_parser/moe.rb +12 -7
- data/lib/log_line_parser/query.rb +84 -0
- data/lib/log_line_parser/version.rb +1 -1
- data/lib/log_line_parser.rb +52 -9
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 98ed47b6f1624564237a5fcd95df8fa59f7e68d4
|
4
|
+
data.tar.gz: 9236e365d98e5a70e3503956327ca28285d2ff0f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2af2324f7fd1b56dc88c65a68b618ec7c41a8e30e2054e59c5f96a2bf2c3db1d167998f8b8baa0bcd13f783b3ee6dfdbf0b0d4ec7f0f934434a22441cb76607e
|
7
|
+
data.tar.gz: 8301442589177b2caa33c3e8ecea442f7e8abb215dabdbd746f9020d4cfca0334d2f153a3d0194d9664826043bcfa129bc3be48ebae2b0bfcbe456f63f62c03e
|
data/README.md
CHANGED
@@ -28,7 +28,7 @@ Or install it yourself as:
|
|
28
28
|
require 'log_line_parser'
|
29
29
|
|
30
30
|
line = '192.168.3.4 - - [07/Feb/2016: ... ] ...'
|
31
|
-
LogLineParser.
|
31
|
+
LogLineParser.to_array(line)
|
32
32
|
# => ["192.168.3.4", "-", "-", "07/Feb/2016: ... ", ... ]
|
33
33
|
```
|
34
34
|
|
@@ -103,7 +103,7 @@ The command line tool `log_line_parser` can be used for two purposes:
|
|
103
103
|
1. For converting file formats
|
104
104
|
2. For picking up log records that satisfy certain criteria
|
105
105
|
|
106
|
-
For the first purpose, the tool
|
106
|
+
For the first purpose, the tool supports conversion from an Apache log format to CSV or TSV format.
|
107
107
|
And for the second purpose, criteria such as :not_found?(= :status_code_404?) or :access_by_bots? are defined, and you can combine them by writing a configuration file.
|
108
108
|
|
109
109
|
#### For converting file formats
|
data/exe/log_line_parser
CHANGED
File without changes
|
@@ -31,7 +31,8 @@ module LogLineParser
|
|
31
31
|
end
|
32
32
|
|
33
33
|
opt.on("-l [LogFormat]", "--log-format [=LogFormat]",
|
34
|
-
"Specify LogFormat
|
34
|
+
"Specify LogFormat by giving a LogFormat or one of \
|
35
|
+
formats predefined as #{predefined_options_for_log_format}") do |log_format|
|
35
36
|
options[:log_format] = log_format
|
36
37
|
end
|
37
38
|
|
@@ -41,7 +42,7 @@ module LogLineParser
|
|
41
42
|
end
|
42
43
|
|
43
44
|
opt.on("-t [format]", "--to [=format]",
|
44
|
-
"Specify a format") do |format|
|
45
|
+
"Specify a format: csv, tsv or ltsv") do |format|
|
45
46
|
options[:format] = format
|
46
47
|
end
|
47
48
|
|
@@ -79,7 +80,7 @@ module LogLineParser
|
|
79
80
|
output_log_names = collect_output_log_names(configs)
|
80
81
|
Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
|
81
82
|
queries = setup_queries_from_configs(configs, logs)
|
82
|
-
LogLineParser.each_record(
|
83
|
+
LogLineParser.each_record(parser: parser) do |line, record|
|
83
84
|
queries.each {|query| query.call(line, record) }
|
84
85
|
end
|
85
86
|
end
|
@@ -92,6 +93,9 @@ module LogLineParser
|
|
92
93
|
convert_to_csv(input, output)
|
93
94
|
when "tsv"
|
94
95
|
convert_to_tsv(input, output)
|
96
|
+
when "ltsv"
|
97
|
+
convert_to_ltsv(input, output,
|
98
|
+
choose_log_parser(options[:log_format]))
|
95
99
|
else
|
96
100
|
raise UnsupportedFormatError.new(output_format)
|
97
101
|
end
|
@@ -99,6 +103,12 @@ module LogLineParser
|
|
99
103
|
|
100
104
|
private
|
101
105
|
|
106
|
+
def self.predefined_options_for_log_format
|
107
|
+
PREDEFINED_FORMATS.keys.
|
108
|
+
map {|opt| "\"#{opt}\"" }.
|
109
|
+
join(", ")
|
110
|
+
end
|
111
|
+
|
102
112
|
def self.collect_output_log_names(configs)
|
103
113
|
configs.map do |config|
|
104
114
|
config[Query::ConfigFields::OUTPUT_LOG_NAME]
|
@@ -122,5 +132,11 @@ module LogLineParser
|
|
122
132
|
output.puts Utils.to_tsv(line.chomp)
|
123
133
|
end
|
124
134
|
end
|
135
|
+
|
136
|
+
def self.convert_to_ltsv(input, output, parser)
|
137
|
+
input.each_line do |line|
|
138
|
+
output.puts parser.to_ltsv(line.chomp)
|
139
|
+
end
|
140
|
+
end
|
125
141
|
end
|
126
142
|
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
module LogLineParser
|
4
|
+
|
5
|
+
module Ltsv
|
6
|
+
LABEL_SEPARATOR = ":"
|
7
|
+
TAB = "\t"
|
8
|
+
|
9
|
+
##
|
10
|
+
# Label names are borrowed from
|
11
|
+
# http://ltsv.org/
|
12
|
+
|
13
|
+
FORMAT_STRING_LABEL_TABLE = {
|
14
|
+
"%t" => "time",
|
15
|
+
"%h" => "host",
|
16
|
+
"%{X-Forwarded-For}i" => "forwardedfor",
|
17
|
+
"%l" => "ident",
|
18
|
+
"%u" => "user",
|
19
|
+
"%r" => "req",
|
20
|
+
"%m" => "method",
|
21
|
+
"%U%q" => "uri",
|
22
|
+
"%H" => "protocol",
|
23
|
+
"%>s" => "status",
|
24
|
+
"%B" => "size",
|
25
|
+
"%b" => "size",
|
26
|
+
"%I" => "reqsize",
|
27
|
+
"%{Referer}i" => "referer",
|
28
|
+
"%{User-agent}i" => "ua",
|
29
|
+
"%{Host}i" => "vhost",
|
30
|
+
"%D" => "reqtime_microsec",
|
31
|
+
"%T" => "reqtime",
|
32
|
+
"%{X-Cache}o" => "cache",
|
33
|
+
"%{X-Runtime}o" => "runtime",
|
34
|
+
# "-" => "apptime",
|
35
|
+
}
|
36
|
+
|
37
|
+
def self.format_strings_to_labels(format_strings)
|
38
|
+
format_strings.map do |string|
|
39
|
+
FORMAT_STRING_LABEL_TABLE[string]||string
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.to_ltsv(labels, values)
|
44
|
+
fields = labels.zip(values).map {|field| field.join(LABEL_SEPARATOR) }
|
45
|
+
fields.join(TAB)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
data/lib/log_line_parser/moe.rb
CHANGED
@@ -3,16 +3,21 @@
|
|
3
3
|
require 'log_line_parser'
|
4
4
|
require 'log_line_parser/utils'
|
5
5
|
|
6
|
-
# MoeLogParser is added from the personal needs of the original author,
|
7
|
-
# and the LogFormat for it is not a widely used format.
|
8
|
-
# You may remove this file if you don't need it.
|
9
|
-
# (MOE is the acronym of the organization's name for which the author
|
10
|
-
# is working at the time of the first release of this program.)
|
11
|
-
|
12
6
|
module LogLineParser
|
13
|
-
|
7
|
+
|
8
|
+
# MoeLogFormat and MoeLogParser are added from the personal needs of the
|
9
|
+
# original author, and the log format is not a widely used one.
|
10
|
+
# You may remove this file if you don't need it.
|
11
|
+
# (MOE is the acronym of the name of the organization for which
|
12
|
+
# the author is working at the time of the first release of this program.)
|
13
|
+
#
|
14
|
+
# MoeLogFormat = CombinedLogFormat + "%D"
|
14
15
|
MoeLogFormat = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %D"
|
16
|
+
|
17
|
+
##
|
18
|
+
# Parser of MoeLogFormat
|
15
19
|
MoeLogParser = parser(MoeLogFormat)
|
20
|
+
|
16
21
|
PREDEFINED_FORMATS['moe'] = MoeLogParser
|
17
22
|
end
|
18
23
|
|
@@ -75,6 +75,10 @@ YetiBot
|
|
75
75
|
bots_re =~ record.user_agent
|
76
76
|
end
|
77
77
|
|
78
|
+
##
|
79
|
+
# Returns true if the path+query part of the value of %{Referer}i
|
80
|
+
# matches one of the resources.
|
81
|
+
|
78
82
|
def self.referred_from_resources?(record, resources=[])
|
79
83
|
resources.include?(record.referer_resource)
|
80
84
|
end
|
@@ -183,11 +187,50 @@ YetiBot
|
|
183
187
|
bots_re =~ record.user_agent
|
184
188
|
end
|
185
189
|
|
190
|
+
##
|
191
|
+
# Returns true if the path+query part of the value of %{Referer}i
|
192
|
+
# matches one of the resources that are passed as the second
|
193
|
+
# argument when you create an instance of Query.
|
194
|
+
#
|
195
|
+
# When a given resource is a directory, you should append a "/" at the
|
196
|
+
# end of it, otherwise you would get a wrong result. For example,
|
197
|
+
# suppose you define the following queries:
|
198
|
+
#
|
199
|
+
# correct_query = Query.new("www.example.org", ["/dir/subdir/"])
|
200
|
+
# wrong_query = Query.new("www.example.org", ["/dir/subdir"])
|
201
|
+
#
|
202
|
+
# <tt>correct_query.referred_from_resources?(record)</tt> returns true
|
203
|
+
# when the value of %{Referer}i is "http://www.example.org/dir/subdir"
|
204
|
+
# or "http://www.example.org/dir/subdir/",
|
205
|
+
# but <tt>wrong_query.referred_from_resources?(record)</tt> returns
|
206
|
+
# false for "http://www.example.org/dir/subdir/"
|
207
|
+
|
186
208
|
def referred_from_resources?(record)
|
187
209
|
if_matching_domain(record) and
|
188
210
|
@normalized_resources.include?(record.referer_resource)
|
189
211
|
end
|
190
212
|
|
213
|
+
##
|
214
|
+
# Returns true if the path+query part of the value of %{Referer}i
|
215
|
+
# begins with one of the resources that are passed as the second
|
216
|
+
# argument when you create an instance of Query.
|
217
|
+
#
|
218
|
+
# When a given resource is a directory, you should append a "/" at the
|
219
|
+
# end of it, otherwise you would get a wrong result. For example,
|
220
|
+
# suppose you define the following queries:
|
221
|
+
#
|
222
|
+
# correct_query = Query.new("www.example.org", ["/dir/subdir/"])
|
223
|
+
# wrong_query = Query.new("www.example.org", ["/dir/subdir"])
|
224
|
+
#
|
225
|
+
# <tt>wrong_query.referred_from_under_resources?(record)</tt>
|
226
|
+
# returns true even when the value of %{Referer}i in record is
|
227
|
+
# "http://www.example.org/dir/subdir_for_images/a_file_name",
|
228
|
+
# while <tt>correct_query.referred_from_under_resources?(record)</tt>
|
229
|
+
# returns true when the value of %{Referer}i is
|
230
|
+
# "http://www.example.org/dir/subdir/a_filename" or
|
231
|
+
# "http://www.example.org/dir/subdir",
|
232
|
+
# and returns false for "http://www.example.org/dir/subdir_for_images".
|
233
|
+
|
191
234
|
def referred_from_under_resources?(record)
|
192
235
|
referer_resource = record.referer_resource
|
193
236
|
if_matching_domain(record) and
|
@@ -195,10 +238,51 @@ YetiBot
|
|
195
238
|
@resources.any?{|target| referer_resource.start_with?(target) }
|
196
239
|
end
|
197
240
|
|
241
|
+
##
|
242
|
+
# Returns true if the value of %U%q in +record+ matches one of the
|
243
|
+
# resources that are passed as the second argument when you create
|
244
|
+
# an instance of Query.
|
245
|
+
#
|
246
|
+
# When you give a directory as one of resources, you should append
|
247
|
+
# a "/" at the end of the directory, otherwise records whose %U%q
|
248
|
+
# value points to the same directory but without trailing "/"
|
249
|
+
# will return false.
|
250
|
+
#
|
251
|
+
# For example, when you create queries as follows,
|
252
|
+
#
|
253
|
+
# query_with_slash = Query.new("www.example.org", ["/dir/subdir/"])
|
254
|
+
# query_without_slash = Query.new("www.example.org", ["/dir/subdir"])
|
255
|
+
#
|
256
|
+
# <tt>query_with_slash.access_to_resources?(record)</tt> returns true for
|
257
|
+
# both of records whose %U%q value is "/dir/subdir/" and "/dir/subdir"
|
258
|
+
# respectively.
|
259
|
+
#
|
260
|
+
# But <tt>query_without_slash.access_to_resources?(record)</tt> returns
|
261
|
+
# false for a record whose %U%q value is "/dir/subdir/"
|
262
|
+
|
198
263
|
def access_to_resources?(record)
|
199
264
|
@normalized_resources.include?(record.resource)
|
200
265
|
end
|
201
266
|
|
267
|
+
##
|
268
|
+
# Returns true if the value of %U%q in +record+ begins with one
|
269
|
+
# of the resources that are passed as the second argument when
|
270
|
+
# you create an instance of Query.
|
271
|
+
#
|
272
|
+
# When a given resource is a directory, you should append a "/" at the
|
273
|
+
# end of it, otherwise you would get a wrong result. For example,
|
274
|
+
# suppose you define the following queries:
|
275
|
+
#
|
276
|
+
# correct_query = Query.new("www.example.org", ["/dir/subdir/"])
|
277
|
+
# wrong_query = Query.new("www.example.org", ["/dir/subdir"])
|
278
|
+
#
|
279
|
+
# <tt>wrong_query.access_to_under_resources?(record)</tt>
|
280
|
+
# returns true even when the value of %U%q in record is
|
281
|
+
# "/dir/subdir_for_images/a_file_name", while
|
282
|
+
# <tt>correct_query.access_to_under_resources?(record)</tt>
|
283
|
+
# returns true when the value of %U%q is "/dir/subdir/a_filename" or
|
284
|
+
# "/dir/subdir", and returns false for "/dir/subdir_for_images".
|
285
|
+
|
202
286
|
def access_to_under_resources?(record)
|
203
287
|
resource = record.resource
|
204
288
|
@normalized_dirs.include?(resource) or
|
data/lib/log_line_parser.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
require "log_line_parser/version"
|
4
4
|
require "log_line_parser/line_parser"
|
5
5
|
require "log_line_parser/apache"
|
6
|
+
require "log_line_parser/ltsv"
|
6
7
|
require "strscan"
|
7
8
|
require "time"
|
8
9
|
require "date"
|
@@ -65,12 +66,12 @@ module LogLineParser
|
|
65
66
|
root.subnodes.map {|node| node.to_s }
|
66
67
|
end
|
67
68
|
|
68
|
-
def to_hash(
|
69
|
-
|
69
|
+
def to_hash(parser=CombinedLogParser)
|
70
|
+
parser.to_hash(to_a)
|
70
71
|
end
|
71
72
|
|
72
|
-
def to_record(
|
73
|
-
|
73
|
+
def to_record(parser=CombinedLogParser)
|
74
|
+
parser.create(to_a)
|
74
75
|
end
|
75
76
|
end
|
76
77
|
|
@@ -82,6 +83,7 @@ module LogLineParser
|
|
82
83
|
def setup(field_names, format_strings=nil)
|
83
84
|
@field_names = field_names
|
84
85
|
@format_strings = format_strings
|
86
|
+
@ltsv_labels = Ltsv.format_strings_to_labels(format_strings)
|
85
87
|
@number_of_fields = field_names.length
|
86
88
|
@referer_defined = field_names.include?(:referer)
|
87
89
|
@parse_time_value = false
|
@@ -115,6 +117,13 @@ module LogLineParser
|
|
115
117
|
h
|
116
118
|
end
|
117
119
|
|
120
|
+
def to_ltsv(line)
|
121
|
+
values = line.kind_of?(Array) ? line : LogLineParser.parse(line).to_a
|
122
|
+
Ltsv.to_ltsv(@ltsv_labels, values)
|
123
|
+
end
|
124
|
+
|
125
|
+
private
|
126
|
+
|
118
127
|
def parse_request(h)
|
119
128
|
if first_line_of_request = h["%r".freeze]
|
120
129
|
request = first_line_of_request.split(/ /)
|
@@ -124,8 +133,6 @@ module LogLineParser
|
|
124
133
|
end
|
125
134
|
end
|
126
135
|
|
127
|
-
private
|
128
|
-
|
129
136
|
def response_size(rec)
|
130
137
|
size_str = rec.response_bytes
|
131
138
|
size_str == "-".freeze ? 0 : size_str.to_i
|
@@ -183,6 +190,17 @@ module LogLineParser
|
|
183
190
|
record_type
|
184
191
|
end
|
185
192
|
|
193
|
+
private_class_method :create_record_type
|
194
|
+
|
195
|
+
##
|
196
|
+
# Creates a parser from a LogFormat.
|
197
|
+
#
|
198
|
+
# For example,
|
199
|
+
#
|
200
|
+
# parser = LogLineParser.parser("%h %l %u %t \"%r\" %>s %b")
|
201
|
+
#
|
202
|
+
# creates the parser of Common Log Format.
|
203
|
+
|
186
204
|
def self.parser(log_format)
|
187
205
|
if log_format.kind_of? String
|
188
206
|
format_strings = Apache.parse_log_format(log_format)
|
@@ -206,24 +224,49 @@ module LogLineParser
|
|
206
224
|
# LogLineTokenizer.tokenize(line.chomp, stack)
|
207
225
|
end
|
208
226
|
|
227
|
+
##
|
228
|
+
# Turns a line of Apache access logs into an array of field values.
|
229
|
+
#
|
230
|
+
# Escaped characters such as "\\t" or "\\"" will be unescaped.
|
231
|
+
|
209
232
|
def self.to_array(line)
|
210
233
|
parse(line).to_a
|
211
234
|
end
|
212
235
|
|
236
|
+
##
|
237
|
+
# Parser of Common Log Format (CLF)
|
238
|
+
#
|
239
|
+
# ref: https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format
|
240
|
+
|
213
241
|
CommonLogParser = parser(Apache::LogFormat::COMMON)
|
242
|
+
|
243
|
+
##
|
244
|
+
# Parser of Common Log Format with Virtual Host
|
245
|
+
|
214
246
|
CommonLogWithVHParser = parser(Apache::LogFormat::COMMON_WITH_VH)
|
247
|
+
|
248
|
+
##
|
249
|
+
# Parser of NCSA extended/combined log format
|
250
|
+
|
215
251
|
CombinedLogParser = parser(Apache::LogFormat::COMBINED)
|
216
252
|
|
217
253
|
PREDEFINED_FORMATS['common'] = CommonLogParser
|
218
254
|
PREDEFINED_FORMATS['common_with_vh'] = CommonLogWithVHParser
|
219
255
|
PREDEFINED_FORMATS['combined'] = CombinedLogParser
|
220
256
|
|
221
|
-
|
257
|
+
##
|
258
|
+
# Reads each line from +input+ (Apache access log files are expected) and
|
259
|
+
# parses it, then yields the line and the parsed result (+record+) to the
|
260
|
+
# associated block.
|
261
|
+
#
|
262
|
+
# When it fails to parse a line, the line will be printed to +error_output+
|
263
|
+
|
264
|
+
def self.each_record(parser: CombinedLogParser,
|
222
265
|
input: ARGF,
|
223
|
-
error_output: STDERR)
|
266
|
+
error_output: STDERR) # :yields: line, record
|
224
267
|
input.each_line do |line|
|
225
268
|
begin
|
226
|
-
yield line,
|
269
|
+
yield line, parser.parse(line)
|
227
270
|
rescue MalFormedRecordError => e
|
228
271
|
error_output.print e.message
|
229
272
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: log_line_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- HASHIMOTO, Naoki
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -62,6 +62,7 @@ files:
|
|
62
62
|
- lib/log_line_parser/apache.rb
|
63
63
|
- lib/log_line_parser/command_line_interface.rb
|
64
64
|
- lib/log_line_parser/line_parser.rb
|
65
|
+
- lib/log_line_parser/ltsv.rb
|
65
66
|
- lib/log_line_parser/moe.rb
|
66
67
|
- lib/log_line_parser/query.rb
|
67
68
|
- lib/log_line_parser/utils.rb
|