log_line_parser 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 86f95f21fb4df9bd1a358a43e988c4c78bdfada6
4
- data.tar.gz: a14d81562c43a2f80525a436138d263816d60617
3
+ metadata.gz: 98ed47b6f1624564237a5fcd95df8fa59f7e68d4
4
+ data.tar.gz: 9236e365d98e5a70e3503956327ca28285d2ff0f
5
5
  SHA512:
6
- metadata.gz: a59e7d1a346527f9a7cb32760c80d65b7a58d6b24b5b7ca28a9205028856ece88f717983261862c65c54b7afca16e02832357d846a1f9e272c576972ad7799b5
7
- data.tar.gz: b6153aaec48fb5340a7445b488c73c24cb776ffb7c0f1ab937f8d48fb47b9aa87fce661b31b0c17eab4590f33bb4b48ed1e34d6d1a37499d85bf228218e47eab
6
+ metadata.gz: 2af2324f7fd1b56dc88c65a68b618ec7c41a8e30e2054e59c5f96a2bf2c3db1d167998f8b8baa0bcd13f783b3ee6dfdbf0b0d4ec7f0f934434a22441cb76607e
7
+ data.tar.gz: 8301442589177b2caa33c3e8ecea442f7e8abb215dabdbd746f9020d4cfca0334d2f153a3d0194d9664826043bcfa129bc3be48ebae2b0bfcbe456f63f62c03e
data/README.md CHANGED
@@ -28,7 +28,7 @@ Or install it yourself as:
28
28
  require 'log_line_parser'
29
29
 
30
30
  line = '192.168.3.4 - - [07/Feb/2016: ... ] ...'
31
- LogLineParser.parse(line).to_a
31
+ LogLineParser.to_array(line)
32
32
  # => ["192.168.3.4", "-", "-", "07/Feb/2016: ... ", ... ]
33
33
  ```
34
34
 
@@ -103,7 +103,7 @@ The command line tool `log_line_parser` can be used for two purposes:
103
103
  1. For converting file formats
104
104
  2. For picking up log records that satisfy certain criteria
105
105
 
106
- For the first purpose, the tool support conversion from an Apache log format to CSV or TSV format.
106
+ For the first purpose, the tool supports conversion from an Apache log format to CSV or TSV format.
107
107
  And for the second purpose, criteria such as :not_found?(= :status_code_404?) or :access_by_bots? are defined, and you can combine them by writing a configuration file.
108
108
 
109
109
  #### For converting file formats
data/exe/log_line_parser CHANGED
File without changes
@@ -31,7 +31,8 @@ module LogLineParser
31
31
  end
32
32
 
33
33
  opt.on("-l [LogFormat]", "--log-format [=LogFormat]",
34
- "Specify LogFormat") do |log_format|
34
+ "Specify LogFormat by giving a LogFormat or one of \
35
+ formats predefined as #{predefined_options_for_log_format}") do |log_format|
35
36
  options[:log_format] = log_format
36
37
  end
37
38
 
@@ -41,7 +42,7 @@ module LogLineParser
41
42
  end
42
43
 
43
44
  opt.on("-t [format]", "--to [=format]",
44
- "Specify a format") do |format|
45
+ "Specify a format: csv, tsv or ltsv") do |format|
45
46
  options[:format] = format
46
47
  end
47
48
 
@@ -79,7 +80,7 @@ module LogLineParser
79
80
  output_log_names = collect_output_log_names(configs)
80
81
  Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
81
82
  queries = setup_queries_from_configs(configs, logs)
82
- LogLineParser.each_record(record_type: parser) do |line, record|
83
+ LogLineParser.each_record(parser: parser) do |line, record|
83
84
  queries.each {|query| query.call(line, record) }
84
85
  end
85
86
  end
@@ -92,6 +93,9 @@ module LogLineParser
92
93
  convert_to_csv(input, output)
93
94
  when "tsv"
94
95
  convert_to_tsv(input, output)
96
+ when "ltsv"
97
+ convert_to_ltsv(input, output,
98
+ choose_log_parser(options[:log_format]))
95
99
  else
96
100
  raise UnsupportedFormatError.new(output_format)
97
101
  end
@@ -99,6 +103,12 @@ module LogLineParser
99
103
 
100
104
  private
101
105
 
106
+ def self.predefined_options_for_log_format
107
+ PREDEFINED_FORMATS.keys.
108
+ map {|opt| "\"#{opt}\"" }.
109
+ join(", ")
110
+ end
111
+
102
112
  def self.collect_output_log_names(configs)
103
113
  configs.map do |config|
104
114
  config[Query::ConfigFields::OUTPUT_LOG_NAME]
@@ -122,5 +132,11 @@ module LogLineParser
122
132
  output.puts Utils.to_tsv(line.chomp)
123
133
  end
124
134
  end
135
+
136
+ def self.convert_to_ltsv(input, output, parser)
137
+ input.each_line do |line|
138
+ output.puts parser.to_ltsv(line.chomp)
139
+ end
140
+ end
125
141
  end
126
142
  end
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module LogLineParser
4
+
5
+ module Ltsv
6
+ LABEL_SEPARATOR = ":"
7
+ TAB = "\t"
8
+
9
+ ##
10
+ # Label names are borrowed from
11
+ # http://ltsv.org/
12
+
13
+ FORMAT_STRING_LABEL_TABLE = {
14
+ "%t" => "time",
15
+ "%h" => "host",
16
+ "%{X-Forwarded-For}i" => "forwardedfor",
17
+ "%l" => "ident",
18
+ "%u" => "user",
19
+ "%r" => "req",
20
+ "%m" => "method",
21
+ "%U%q" => "uri",
22
+ "%H" => "protocol",
23
+ "%>s" => "status",
24
+ "%B" => "size",
25
+ "%b" => "size",
26
+ "%I" => "reqsize",
27
+ "%{Referer}i" => "referer",
28
+ "%{User-agent}i" => "ua",
29
+ "%{Host}i" => "vhost",
30
+ "%D" => "reqtime_microsec",
31
+ "%T" => "reqtime",
32
+ "%{X-Cache}o" => "cache",
33
+ "%{X-Runtime}o" => "runtime",
34
+ # "-" => "apptime",
35
+ }
36
+
37
+ def self.format_strings_to_labels(format_strings)
38
+ format_strings.map do |string|
39
+ FORMAT_STRING_LABEL_TABLE[string]||string
40
+ end
41
+ end
42
+
43
+ def self.to_ltsv(labels, values)
44
+ fields = labels.zip(values).map {|field| field.join(LABEL_SEPARATOR) }
45
+ fields.join(TAB)
46
+ end
47
+ end
48
+ end
@@ -3,16 +3,21 @@
3
3
  require 'log_line_parser'
4
4
  require 'log_line_parser/utils'
5
5
 
6
- # MoeLogParser is added from the personal needs of the original author,
7
- # and the LogFormat for it is not a widely used format.
8
- # You may remove this file if you don't need it.
9
- # (MOE is the acronym of the organization's name for which the author
10
- # is working at the time of the first release of this program.)
11
-
12
6
  module LogLineParser
13
- # CombinedLogFormat + "%D"
7
+
8
+ # MoeLogFormat and MoeLogParser are added from the personal needs of the
9
+ # original author, and the log format is not a widely used one.
10
+ # You may remove this file if you don't need it.
11
+ # (MOE is the acronym of the name of the organization for which
12
+ # the author is working at the time of the first release of this program.)
13
+ #
14
+ # MoeLogFormat = CombinedLogFormat + "%D"
14
15
  MoeLogFormat = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %D"
16
+
17
+ ##
18
+ # Parser of MoeLogFormat
15
19
  MoeLogParser = parser(MoeLogFormat)
20
+
16
21
  PREDEFINED_FORMATS['moe'] = MoeLogParser
17
22
  end
18
23
 
@@ -75,6 +75,10 @@ YetiBot
75
75
  bots_re =~ record.user_agent
76
76
  end
77
77
 
78
+ ##
79
+ # Returns true if the path+query part of the value of %{Referer}i
80
+ # matches one of the resources.
81
+
78
82
  def self.referred_from_resources?(record, resources=[])
79
83
  resources.include?(record.referer_resource)
80
84
  end
@@ -183,11 +187,50 @@ YetiBot
183
187
  bots_re =~ record.user_agent
184
188
  end
185
189
 
190
+ ##
191
+ # Returns true if the path+query part of the value of %{Referer}i
192
+ # matches one of the resources that are passed as the second
193
+ # argument when you create an instance of Query.
194
+ #
195
+ # When a given resource is a directory, you should append a "/" at the
196
+ # end of it, otherwise you would get a wrong result. For example,
197
+ # suppose you define the following queries:
198
+ #
199
+ # correct_query = Query.new("www.example.org", ["/dir/subdir/"])
200
+ # wrong_query = Query.new("www.example.org", ["/dir/subdir"])
201
+ #
202
+ # <tt>correct_query.referred_from_resources?(record)</tt> returns true
203
+ # when the value of %{Referer}i is "http://www.example.org/dir/subdir"
204
+ # or "http://www.example.org/dir/subdir/",
205
+ # but <tt>wrong_query.referred_from_resources?(record)</tt> returns
206
+ # false for "http://www.example.org/dir/subdir/"
207
+
186
208
  def referred_from_resources?(record)
187
209
  if_matching_domain(record) and
188
210
  @normalized_resources.include?(record.referer_resource)
189
211
  end
190
212
 
213
+ ##
214
+ # Returns true if the path+query part of the value of %{Referer}i
215
+ # begins with one of the resources that are passed as the second
216
+ # argument when you create an instance of Query.
217
+ #
218
+ # When a given resource is a directory, you should append a "/" at the
219
+ # end of it, otherwise you would get a wrong result. For example,
220
+ # suppose you define the following queries:
221
+ #
222
+ # correct_query = Query.new("www.example.org", ["/dir/subdir/"])
223
+ # wrong_query = Query.new("www.example.org", ["/dir/subdir"])
224
+ #
225
+ # <tt>wrong_query.referred_from_under_resources?(record)</tt>
226
+ # returns true even when the value of %{Referer}i in record is
227
+ # "http://www.example.org/dir/subdir_for_images/a_file_name",
228
+ # while <tt>correct_query.referred_from_under_resources?(record)</tt>
229
+ # returns true when the value of %{Referer}i is
230
+ # "http://www.example.org/dir/subdir/a_filename" or
231
+ # "http://www.example.org/dir/subdir",
232
+ # and returns false for "http://www.example.org/dir/subdir_for_images".
233
+
191
234
  def referred_from_under_resources?(record)
192
235
  referer_resource = record.referer_resource
193
236
  if_matching_domain(record) and
@@ -195,10 +238,51 @@ YetiBot
195
238
  @resources.any?{|target| referer_resource.start_with?(target) }
196
239
  end
197
240
 
241
+ ##
242
+ # Returns true if the value of %U%q in +record+ matches one of the
243
+ # resources that are passed as the second argument when you create
244
+ # an instance of Query.
245
+ #
246
+ # When you give a directory as one of resources, you should append
247
+ # a "/" at the end of the directory, otherwise records whose %U%q
248
+ # value points to the same directory but without trailing "/"
249
+ # will return false.
250
+ #
251
+ # For example, when you create queries as follows,
252
+ #
253
+ # query_with_slash = Query.new("www.example.org", ["/dir/subdir/"])
254
+ # query_without_slash = Query.new("www.example.org", ["/dir/subdir"])
255
+ #
256
+ # <tt>query_with_slash.access_to_resources?(record)</tt> returns true for
257
+ # both of records whose %U%q value is "/dir/subdir/" and "/dir/subdir"
258
+ # respectively.
259
+ #
260
+ # But <tt>query_without_slash.access_to_resources?(record)</tt> returns
261
+ # false for a record whose %U%q value is "/dir/subdir/"
262
+
198
263
  def access_to_resources?(record)
199
264
  @normalized_resources.include?(record.resource)
200
265
  end
201
266
 
267
+ ##
268
+ # Returns true if the value of %U%q in +record+ begins with one
269
+ # of the resources that are passed as the second argument when
270
+ # you create an instance of Query.
271
+ #
272
+ # When a given resource is a directory, you should append a "/" at the
273
+ # end of it, otherwise you would get a wrong result. For example,
274
+ # suppose you define the following queries:
275
+ #
276
+ # correct_query = Query.new("www.example.org", ["/dir/subdir/"])
277
+ # wrong_query = Query.new("www.example.org", ["/dir/subdir"])
278
+ #
279
+ # <tt>wrong_query.access_to_under_resources?(record)</tt>
280
+ # returns true even when the value of %U%q in record is
280
+ # "/dir/subdir_for_images/a_file_name", while
281
+ # <tt>correct_query.access_to_under_resources?(record)</tt>
282
+ # returns true when the value of %U%q is "/dir/subdir/a_filename" or
283
+ # "/dir/subdir", and returns false for "/dir/subdir_for_images".
285
+
202
286
  def access_to_under_resources?(record)
203
287
  resource = record.resource
204
288
  @normalized_dirs.include?(resource) or
@@ -1,3 +1,3 @@
1
1
  module LogLineParser
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -3,6 +3,7 @@
3
3
  require "log_line_parser/version"
4
4
  require "log_line_parser/line_parser"
5
5
  require "log_line_parser/apache"
6
+ require "log_line_parser/ltsv"
6
7
  require "strscan"
7
8
  require "time"
8
9
  require "date"
@@ -65,12 +66,12 @@ module LogLineParser
65
66
  root.subnodes.map {|node| node.to_s }
66
67
  end
67
68
 
68
- def to_hash(record_type=CombinedLogParser)
69
- record_type.to_hash(to_a)
69
+ def to_hash(parser=CombinedLogParser)
70
+ parser.to_hash(to_a)
70
71
  end
71
72
 
72
- def to_record(record_type=CombinedLogParser)
73
- record_type.create(to_a)
73
+ def to_record(parser=CombinedLogParser)
74
+ parser.create(to_a)
74
75
  end
75
76
  end
76
77
 
@@ -82,6 +83,7 @@ module LogLineParser
82
83
  def setup(field_names, format_strings=nil)
83
84
  @field_names = field_names
84
85
  @format_strings = format_strings
86
+ @ltsv_labels = Ltsv.format_strings_to_labels(format_strings)
85
87
  @number_of_fields = field_names.length
86
88
  @referer_defined = field_names.include?(:referer)
87
89
  @parse_time_value = false
@@ -115,6 +117,13 @@ module LogLineParser
115
117
  h
116
118
  end
117
119
 
120
+ def to_ltsv(line)
121
+ values = line.kind_of?(Array) ? line : LogLineParser.parse(line).to_a
122
+ Ltsv.to_ltsv(@ltsv_labels, values)
123
+ end
124
+
125
+ private
126
+
118
127
  def parse_request(h)
119
128
  if first_line_of_request = h["%r".freeze]
120
129
  request = first_line_of_request.split(/ /)
@@ -124,8 +133,6 @@ module LogLineParser
124
133
  end
125
134
  end
126
135
 
127
- private
128
-
129
136
  def response_size(rec)
130
137
  size_str = rec.response_bytes
131
138
  size_str == "-".freeze ? 0 : size_str.to_i
@@ -183,6 +190,17 @@ module LogLineParser
183
190
  record_type
184
191
  end
185
192
 
193
+ private_class_method :create_record_type
194
+
195
+ ##
196
+ # Creates a parser from a LogFormat.
197
+ #
198
+ # For example,
199
+ #
200
+ # parser = LogLineParser.parser("%h %l %u %t \"%r\" %>s %b")
201
+ #
202
+ # creates the parser of Common Log Format.
203
+
186
204
  def self.parser(log_format)
187
205
  if log_format.kind_of? String
188
206
  format_strings = Apache.parse_log_format(log_format)
@@ -206,24 +224,49 @@ module LogLineParser
206
224
  # LogLineTokenizer.tokenize(line.chomp, stack)
207
225
  end
208
226
 
227
+ ##
228
+ # Turns a line of Apache access logs into an array of field values.
229
+ #
230
+ # Escaped characters such as "\\t" or "\\"" will be unescaped.
231
+
209
232
  def self.to_array(line)
210
233
  parse(line).to_a
211
234
  end
212
235
 
236
+ ##
237
+ # Parser of Common Log Format (CLF)
238
+ #
239
+ # ref: https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format
240
+
213
241
  CommonLogParser = parser(Apache::LogFormat::COMMON)
242
+
243
+ ##
244
+ # Parser of Common Log Format with Virtual Host
245
+
214
246
  CommonLogWithVHParser = parser(Apache::LogFormat::COMMON_WITH_VH)
247
+
248
+ ##
249
+ # Parser of NCSA extended/combined log format
250
+
215
251
  CombinedLogParser = parser(Apache::LogFormat::COMBINED)
216
252
 
217
253
  PREDEFINED_FORMATS['common'] = CommonLogParser
218
254
  PREDEFINED_FORMATS['common_with_vh'] = CommonLogWithVHParser
219
255
  PREDEFINED_FORMATS['combined'] = CombinedLogParser
220
256
 
221
- def self.each_record(record_type: CommonLogParser,
257
+ ##
258
+ # Reads each line from +input+ (Apache access log files are expected) and
259
+ # parses it, then yields the line and the parsed result (+record+) to the
260
+ # associated block.
261
+ #
262
+ # When it fails to parse a line, the line will be printed to +error_output+
263
+
264
+ def self.each_record(parser: CombinedLogParser,
222
265
  input: ARGF,
223
- error_output: STDERR)
266
+ error_output: STDERR) # :yields: line, record
224
267
  input.each_line do |line|
225
268
  begin
226
- yield line, record_type.parse(line)
269
+ yield line, parser.parse(line)
227
270
  rescue MalFormedRecordError => e
228
271
  error_output.print e.message
229
272
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: log_line_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - HASHIMOTO, Naoki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-03-06 00:00:00.000000000 Z
11
+ date: 2016-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -62,6 +62,7 @@ files:
62
62
  - lib/log_line_parser/apache.rb
63
63
  - lib/log_line_parser/command_line_interface.rb
64
64
  - lib/log_line_parser/line_parser.rb
65
+ - lib/log_line_parser/ltsv.rb
65
66
  - lib/log_line_parser/moe.rb
66
67
  - lib/log_line_parser/query.rb
67
68
  - lib/log_line_parser/utils.rb