log_line_parser 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 86f95f21fb4df9bd1a358a43e988c4c78bdfada6
4
- data.tar.gz: a14d81562c43a2f80525a436138d263816d60617
3
+ metadata.gz: 98ed47b6f1624564237a5fcd95df8fa59f7e68d4
4
+ data.tar.gz: 9236e365d98e5a70e3503956327ca28285d2ff0f
5
5
  SHA512:
6
- metadata.gz: a59e7d1a346527f9a7cb32760c80d65b7a58d6b24b5b7ca28a9205028856ece88f717983261862c65c54b7afca16e02832357d846a1f9e272c576972ad7799b5
7
- data.tar.gz: b6153aaec48fb5340a7445b488c73c24cb776ffb7c0f1ab937f8d48fb47b9aa87fce661b31b0c17eab4590f33bb4b48ed1e34d6d1a37499d85bf228218e47eab
6
+ metadata.gz: 2af2324f7fd1b56dc88c65a68b618ec7c41a8e30e2054e59c5f96a2bf2c3db1d167998f8b8baa0bcd13f783b3ee6dfdbf0b0d4ec7f0f934434a22441cb76607e
7
+ data.tar.gz: 8301442589177b2caa33c3e8ecea442f7e8abb215dabdbd746f9020d4cfca0334d2f153a3d0194d9664826043bcfa129bc3be48ebae2b0bfcbe456f63f62c03e
data/README.md CHANGED
@@ -28,7 +28,7 @@ Or install it yourself as:
28
28
  require 'log_line_parser'
29
29
 
30
30
  line = '192.168.3.4 - - [07/Feb/2016: ... ] ...'
31
- LogLineParser.parse(line).to_a
31
+ LogLineParser.to_array(line)
32
32
  # => ["192.168.3.4", "-", "-", "07/Feb/2016: ... ", ... ]
33
33
  ```
34
34
 
@@ -103,7 +103,7 @@ The command line tool `log_line_parser` can be used for two purposes:
103
103
  1. For converting file formats
104
104
  2. For picking up log records that satisfy certain criteria
105
105
 
106
- For the first purpose, the tool support conversion from an Apache log format to CSV or TSV format.
106
+ For the first purpose, the tool supports conversion from an Apache log format to CSV or TSV format.
107
107
  And for the second purpose, criteria such as :not_found?(= :status_code_404?) or :access_by_bots? are defined, and you can combine them by writing a configuration file.
108
108
 
109
109
  #### For converting file formats
data/exe/log_line_parser CHANGED
File without changes
@@ -31,7 +31,8 @@ module LogLineParser
31
31
  end
32
32
 
33
33
  opt.on("-l [LogFormat]", "--log-format [=LogFormat]",
34
- "Specify LogFormat") do |log_format|
34
+ "Specify LogFormat by giving a LogFormat or one of \
35
+ formats predefined as #{predefined_options_for_log_format}") do |log_format|
35
36
  options[:log_format] = log_format
36
37
  end
37
38
 
@@ -41,7 +42,7 @@ module LogLineParser
41
42
  end
42
43
 
43
44
  opt.on("-t [format]", "--to [=format]",
44
- "Specify a format") do |format|
45
+ "Specify a format: csv, tsv or ltsv") do |format|
45
46
  options[:format] = format
46
47
  end
47
48
 
@@ -79,7 +80,7 @@ module LogLineParser
79
80
  output_log_names = collect_output_log_names(configs)
80
81
  Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
81
82
  queries = setup_queries_from_configs(configs, logs)
82
- LogLineParser.each_record(record_type: parser) do |line, record|
83
+ LogLineParser.each_record(parser: parser) do |line, record|
83
84
  queries.each {|query| query.call(line, record) }
84
85
  end
85
86
  end
@@ -92,6 +93,9 @@ module LogLineParser
92
93
  convert_to_csv(input, output)
93
94
  when "tsv"
94
95
  convert_to_tsv(input, output)
96
+ when "ltsv"
97
+ convert_to_ltsv(input, output,
98
+ choose_log_parser(options[:log_format]))
95
99
  else
96
100
  raise UnsupportedFormatError.new(output_format)
97
101
  end
@@ -99,6 +103,12 @@ module LogLineParser
99
103
 
100
104
  private
101
105
 
106
+ def self.predefined_options_for_log_format
107
+ PREDEFINED_FORMATS.keys.
108
+ map {|opt| "\"#{opt}\"" }.
109
+ join(", ")
110
+ end
111
+
102
112
  def self.collect_output_log_names(configs)
103
113
  configs.map do |config|
104
114
  config[Query::ConfigFields::OUTPUT_LOG_NAME]
@@ -122,5 +132,11 @@ module LogLineParser
122
132
  output.puts Utils.to_tsv(line.chomp)
123
133
  end
124
134
  end
135
+
136
+ def self.convert_to_ltsv(input, output, parser)
137
+ input.each_line do |line|
138
+ output.puts parser.to_ltsv(line.chomp)
139
+ end
140
+ end
125
141
  end
126
142
  end
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module LogLineParser
4
+
5
+ module Ltsv
6
+ LABEL_SEPARATOR = ":"
7
+ TAB = "\t"
8
+
9
+ ##
10
+ # Label names are borrowed from
11
+ # http://ltsv.org/
12
+
13
+ FORMAT_STRING_LABEL_TABLE = {
14
+ "%t" => "time",
15
+ "%h" => "host",
16
+ "%{X-Forwarded-For}i" => "forwardedfor",
17
+ "%l" => "ident",
18
+ "%u" => "user",
19
+ "%r" => "req",
20
+ "%m" => "method",
21
+ "%U%q" => "uri",
22
+ "%H" => "protocol",
23
+ "%>s" => "status",
24
+ "%B" => "size",
25
+ "%b" => "size",
26
+ "%I" => "reqsize",
27
+ "%{Referer}i" => "referer",
28
+ "%{User-agent}i" => "ua",
29
+ "%{Host}i" => "vhost",
30
+ "%D" => "reqtime_microsec",
31
+ "%T" => "reqtime",
32
+ "%{X-Cache}o" => "cache",
33
+ "%{X-Runtime}o" => "runtime",
34
+ # "-" => "apptime",
35
+ }
36
+
37
+ def self.format_strings_to_labels(format_strings)
38
+ format_strings.map do |string|
39
+ FORMAT_STRING_LABEL_TABLE[string]||string
40
+ end
41
+ end
42
+
43
+ def self.to_ltsv(labels, values)
44
+ fields = labels.zip(values).map {|field| field.join(LABEL_SEPARATOR) }
45
+ fields.join(TAB)
46
+ end
47
+ end
48
+ end
@@ -3,16 +3,21 @@
3
3
  require 'log_line_parser'
4
4
  require 'log_line_parser/utils'
5
5
 
6
- # MoeLogParser is added from the personal needs of the original author,
7
- # and the LogFormat for it is not a widely used format.
8
- # You may remove this file if you don't need it.
9
- # (MOE is the acronym of the organization's name for which the author
10
- # is working at the time of the first release of this program.)
11
-
12
6
  module LogLineParser
13
- # CombinedLogFormat + "%D"
7
+
8
+ # MoeLogFormat and MoeLogParser are added from the personal needs of the
9
+ # original author, and the log format is not a widely used one.
10
+ # You may remove this file if you don't need it.
11
+ # (MOE is the acronym of the name of the organization for which
12
+ # the author is working at the time of the first release of this program.)
13
+ #
14
+ # MoeLogFormat = CombinedLogFormat + "%D"
14
15
  MoeLogFormat = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %D"
16
+
17
+ ##
18
+ # Parser of MoeLogFormat
15
19
  MoeLogParser = parser(MoeLogFormat)
20
+
16
21
  PREDEFINED_FORMATS['moe'] = MoeLogParser
17
22
  end
18
23
 
@@ -75,6 +75,10 @@ YetiBot
75
75
  bots_re =~ record.user_agent
76
76
  end
77
77
 
78
+ ##
79
+ # Returns true if the path+query part of the value of %{Referer}i
80
+ # matches one of the resources.
81
+
78
82
  def self.referred_from_resources?(record, resources=[])
79
83
  resources.include?(record.referer_resource)
80
84
  end
@@ -183,11 +187,50 @@ YetiBot
183
187
  bots_re =~ record.user_agent
184
188
  end
185
189
 
190
+ ##
191
+ # Returns true if the path+query part of the value of %{Referer}i
192
+ # matches one of the resources that are passed as the second
193
+ # argument when you create an instance of Query.
194
+ #
195
+ # When a given resource is a directory, you should append a "/" at the
196
+ # end of it, otherwise you would get a wrong result. For example,
197
+ # suppose you define the following queries:
198
+ #
199
+ # correct_query = Query.new("www.example.org", ["/dir/subdir/"])
200
+ # wrong_query = Query.new("www.example.org", ["/dir/subdir"])
201
+ #
202
+ # <tt>correct_query.referred_from_resources?(record)</tt> returns true
203
+ # when the value of %{Referer}i is "http://www.example.org/dir/subdir"
203
+ # or "http://www.example.org/dir/subdir/",
204
+ # but <tt>wrong_query.referred_from_resources?(record)</tt> returns
205
+ # false for "http://www.example.org/dir/subdir/"
207
+
186
208
  def referred_from_resources?(record)
187
209
  if_matching_domain(record) and
188
210
  @normalized_resources.include?(record.referer_resource)
189
211
  end
190
212
 
213
+ ##
214
+ # Returns true if the path+query part of the value of %{Referer}i
215
+ # begins with one of the resources that are passed as the second
216
+ # argument when you create an instance of Query.
217
+ #
218
+ # When a given resource is a directory, you should append a "/" at the
219
+ # end of it, otherwise you would get a wrong result. For example,
220
+ # suppose you define the following queries:
221
+ #
222
+ # correct_query = Query.new("www.example.org", ["/dir/subdir/"])
223
+ # wrong_query = Query.new("www.example.org", ["/dir/subdir"])
224
+ #
225
+ # <tt>wrong_query.referred_from_under_resources?(record)</tt>
226
+ # returns true even when the value of %{Referer}i in record is
227
+ # "http://www.example.org/dir/subdir_for_images/a_file_name",
228
+ # while <tt>correct_query.referred_from_under_resources?(record)</tt>
229
+ # returns true when the value of %{Referer}i is
230
+ # "http://www.example.org/dir/subdir/a_filename" or
231
+ # "http://www.example.org/dir/subdir",
232
+ # and returns false for "http://www.example.org/dir/subdir_for_images".
233
+
191
234
  def referred_from_under_resources?(record)
192
235
  referer_resource = record.referer_resource
193
236
  if_matching_domain(record) and
@@ -195,10 +238,51 @@ YetiBot
195
238
  @resources.any?{|target| referer_resource.start_with?(target) }
196
239
  end
197
240
 
241
+ ##
242
+ # Returns true if the value of %U%q in +record+ matches one of the
243
+ # resources that are passed as the second argument when you create
244
+ # an instance of Query.
245
+ #
246
+ # When you give a directory as one of resources, you should append
247
+ # a "/" at the end of the directory, otherwise records whose %U%q
248
+ # value points to the same directory but without trailing "/"
249
+ # will return false.
250
+ #
251
+ # For example, when you create queries as follows,
252
+ #
253
+ # query_with_slash = Query.new("www.example.org", ["/dir/subdir/"])
254
+ # query_without_slash = Query.new("www.example.org", ["/dir/subdir"])
255
+ #
256
+ # <tt>query_with_slash.access_to_resources?(record)</tt> returns true for
257
+ # both of records whose %U%q value is "/dir/subdir/" and "/dir/subdir"
258
+ # respectively.
259
+ #
260
+ # But <tt>query_without_slash.access_to_resources?(record)</tt> returns
261
+ # false for a record whose %U%q value is "/dir/subdir/"
262
+
198
263
  def access_to_resources?(record)
199
264
  @normalized_resources.include?(record.resource)
200
265
  end
201
266
 
267
+ ##
268
+ # Returns true if the value of %U%q in +record+ begins with one
269
+ # of the resources that are passed as the second argument when
270
+ # you create an instance of Query.
271
+ #
272
+ # When a given resource is a directory, you should append a "/" at the
273
+ # end of it, otherwise you would get a wrong result. For example,
274
+ # suppose you define the following queries:
275
+ #
276
+ # correct_query = Query.new("www.example.org", ["/dir/subdir/"])
277
+ # wrong_query = Query.new("www.example.org", ["/dir/subdir"])
278
+ #
279
+ # <tt>wrong_query.access_to_under_resources?(record)</tt>
280
+ # returns true even when the value of %U%q in record is
280
+ # "/dir/subdir_for_images/a_file_name", while
281
+ # <tt>correct_query.access_to_under_resources?(record)</tt>
282
+ # returns true when the value of %U%q is "/dir/subdir/a_filename" or
283
+ # "/dir/subdir", and returns false for "/dir/subdir_for_images".
285
+
202
286
  def access_to_under_resources?(record)
203
287
  resource = record.resource
204
288
  @normalized_dirs.include?(resource) or
@@ -1,3 +1,3 @@
1
1
  module LogLineParser
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -3,6 +3,7 @@
3
3
  require "log_line_parser/version"
4
4
  require "log_line_parser/line_parser"
5
5
  require "log_line_parser/apache"
6
+ require "log_line_parser/ltsv"
6
7
  require "strscan"
7
8
  require "time"
8
9
  require "date"
@@ -65,12 +66,12 @@ module LogLineParser
65
66
  root.subnodes.map {|node| node.to_s }
66
67
  end
67
68
 
68
- def to_hash(record_type=CombinedLogParser)
69
- record_type.to_hash(to_a)
69
+ def to_hash(parser=CombinedLogParser)
70
+ parser.to_hash(to_a)
70
71
  end
71
72
 
72
- def to_record(record_type=CombinedLogParser)
73
- record_type.create(to_a)
73
+ def to_record(parser=CombinedLogParser)
74
+ parser.create(to_a)
74
75
  end
75
76
  end
76
77
 
@@ -82,6 +83,7 @@ module LogLineParser
82
83
  def setup(field_names, format_strings=nil)
83
84
  @field_names = field_names
84
85
  @format_strings = format_strings
86
+ @ltsv_labels = Ltsv.format_strings_to_labels(format_strings)
85
87
  @number_of_fields = field_names.length
86
88
  @referer_defined = field_names.include?(:referer)
87
89
  @parse_time_value = false
@@ -115,6 +117,13 @@ module LogLineParser
115
117
  h
116
118
  end
117
119
 
120
+ def to_ltsv(line)
121
+ values = line.kind_of?(Array) ? line : LogLineParser.parse(line).to_a
122
+ Ltsv.to_ltsv(@ltsv_labels, values)
123
+ end
124
+
125
+ private
126
+
118
127
  def parse_request(h)
119
128
  if first_line_of_request = h["%r".freeze]
120
129
  request = first_line_of_request.split(/ /)
@@ -124,8 +133,6 @@ module LogLineParser
124
133
  end
125
134
  end
126
135
 
127
- private
128
-
129
136
  def response_size(rec)
130
137
  size_str = rec.response_bytes
131
138
  size_str == "-".freeze ? 0 : size_str.to_i
@@ -183,6 +190,17 @@ module LogLineParser
183
190
  record_type
184
191
  end
185
192
 
193
+ private_class_method :create_record_type
194
+
195
+ ##
196
+ # Creates a parser from a LogFormat.
197
+ #
198
+ # For example,
199
+ #
200
+ # parser = LogLineParser.parser("%h %l %u %t \"%r\" %>s %b")
201
+ #
202
+ # creates the parser of Common Log Format.
203
+
186
204
  def self.parser(log_format)
187
205
  if log_format.kind_of? String
188
206
  format_strings = Apache.parse_log_format(log_format)
@@ -206,24 +224,49 @@ module LogLineParser
206
224
  # LogLineTokenizer.tokenize(line.chomp, stack)
207
225
  end
208
226
 
227
+ ##
228
+ # Turns a line of Apache access logs into an array of field values.
229
+ #
230
+ # Escaped characters such as "\\t" or "\\"" will be unescaped.
231
+
209
232
  def self.to_array(line)
210
233
  parse(line).to_a
211
234
  end
212
235
 
236
+ ##
237
+ # Parser of Common Log Format (CLF)
238
+ #
239
+ # ref: https://www.w3.org/Daemon/User/Config/Logging.html#common-logfile-format
240
+
213
241
  CommonLogParser = parser(Apache::LogFormat::COMMON)
242
+
243
+ ##
244
+ # Parser of Common Log Format with Virtual Host
245
+
214
246
  CommonLogWithVHParser = parser(Apache::LogFormat::COMMON_WITH_VH)
247
+
248
+ ##
249
+ # Parser of NCSA extended/combined log format
250
+
215
251
  CombinedLogParser = parser(Apache::LogFormat::COMBINED)
216
252
 
217
253
  PREDEFINED_FORMATS['common'] = CommonLogParser
218
254
  PREDEFINED_FORMATS['common_with_vh'] = CommonLogWithVHParser
219
255
  PREDEFINED_FORMATS['combined'] = CombinedLogParser
220
256
 
221
- def self.each_record(record_type: CommonLogParser,
257
+ ##
258
+ # Reads each line from +input+ (Apache access log files are expected) and
259
+ # parses it, then yields the line and the parsed result (+record+) to the
260
+ # associated block.
261
+ #
262
+ # When it fails to parse a line, the line will be printed to +error_output+
263
+
264
+ def self.each_record(parser: CombinedLogParser,
222
265
  input: ARGF,
223
- error_output: STDERR)
266
+ error_output: STDERR) # :yields: line, record
224
267
  input.each_line do |line|
225
268
  begin
226
- yield line, record_type.parse(line)
269
+ yield line, parser.parse(line)
227
270
  rescue MalFormedRecordError => e
228
271
  error_output.print e.message
229
272
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: log_line_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - HASHIMOTO, Naoki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-03-06 00:00:00.000000000 Z
11
+ date: 2016-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -62,6 +62,7 @@ files:
62
62
  - lib/log_line_parser/apache.rb
63
63
  - lib/log_line_parser/command_line_interface.rb
64
64
  - lib/log_line_parser/line_parser.rb
65
+ - lib/log_line_parser/ltsv.rb
65
66
  - lib/log_line_parser/moe.rb
66
67
  - lib/log_line_parser/query.rb
67
68
  - lib/log_line_parser/utils.rb