log_line_parser 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 98ed47b6f1624564237a5fcd95df8fa59f7e68d4
4
- data.tar.gz: 9236e365d98e5a70e3503956327ca28285d2ff0f
3
+ metadata.gz: 6ce0b6e321f311f85bc375942205567ab4d60743
4
+ data.tar.gz: ee2d0b41508c833d39cade577581426dbb06cb23
5
5
  SHA512:
6
- metadata.gz: 2af2324f7fd1b56dc88c65a68b618ec7c41a8e30e2054e59c5f96a2bf2c3db1d167998f8b8baa0bcd13f783b3ee6dfdbf0b0d4ec7f0f934434a22441cb76607e
7
- data.tar.gz: 8301442589177b2caa33c3e8ecea442f7e8abb215dabdbd746f9020d4cfca0334d2f153a3d0194d9664826043bcfa129bc3be48ebae2b0bfcbe456f63f62c03e
6
+ metadata.gz: c2b363c6be57677fda905270e36f0ee23c39f9cb0915199eaad63a5fabc566a814437e92ad1706040e874e171107c3c901e90d195c1d530ea4e0282edac0d2b6
7
+ data.tar.gz: ea2ae4f5b9c0eb962c221e605927f0cdf977c194d612fff620a0cc2bb113825620762b67c4e377abab4654dcc1877dc3134f73625d07119d7cfa70610eb9d2a7
data/README.md CHANGED
@@ -130,7 +130,7 @@ Second, run the following command if you want to pick up from [samples/sample_co
130
130
 
131
131
  Then the results are in [samples/output](https://github.com/nico-hn/LogLineParser/tree/master/samples/output/) directory.
132
132
 
133
- ##### Format of configuration
133
+ ##### Format of configuration file
134
134
 
135
135
  An example of configurations is below:
136
136
 
@@ -182,6 +182,7 @@ It contains three configurations, and each of them consists of parameters in the
182
182
  |Available criteria |Note |
183
183
  |----------------------------------------|------------------------------------------------------------------------------------------|
184
184
  |:access_by_bots? |Access by major web crawlers such as Googlebot or Bingbot. |
185
+ |:access_to_image? |The value of "%U%q" matches /\.(?:jpe?g\|png\|gif\|ico\|tiff?\|bmp\|svgz?\|webp)$/in |
185
186
  |:referred_from_resources? |The path part of the value of "%{Referer}i" matches any of the values of "resources". |
186
187
  |:referred_from_under_resources? |The path part of the value of "%{Referer}i" begins with any of the values of "resources". |
187
188
  |:access_to_resources? |The value of "%U%q" matches any of the values of "resources". |
@@ -201,6 +202,54 @@ It contains three configurations, and each of them consists of parameters in the
201
202
  |:patch_method? |The value of "%m" is PATCH. |
202
203
 
203
204
 
205
+ ##### Default web crawlers
206
+
207
+ The following web crawlers are set by default and used by Query#access_by_bots?:
208
+
209
+ * Googlebot
210
+ * Googlebot-Mobile
211
+ * Mediapartners-Google
212
+ * Bingbot
213
+ * Slurp
214
+ * Baiduspider
215
+ * BaiduImagespider
216
+ * BaiduMobaider
217
+ * YetiBot
218
+ * Applebot
219
+
220
+
221
+ ##### Format of bots configuration file
222
+
223
+ You can specify web crawlers by passing a configuration file to the `--bots-config` option. The following is an example of such a configuration file:
224
+
225
+ ```yaml
226
+ inherit_default_bots: false
227
+ bots:
228
+ - Googlebot
229
+ - Googlebot-Mobile
230
+ - Mediapartners-Google
231
+ - Bingbot
232
+ - Slurp
233
+ - Baiduspider
234
+ - BaiduImagespider
235
+ - BaiduMobaider
236
+ - YetiBot
237
+ - Applebot
238
+ bots_re:
239
+ - " bot$"
240
+ ```
241
+
242
+ |Parameters |Note |
243
+ |--------------------------------|-----------------------------------------------------------------------------------------------------------------------------|
244
+ |bots (optional) |Names of web crawlers that make Query#access_by_bots? return true when they are included in the value of "%{User-agent}i". |
245
+ |inherit_default_bots (optional) |If this option is set to true, the default names of major web crawlers are added to the names specified by the `bots` parameter. |
246
+ |bots_re (optional) |Use this parameter if you want to identify bots by regular expressions. |
247
+
248
+ The values given by the configuration file are compiled into a regular expression, and you can check the expression by invoking the tool as follows:
249
+
250
+ $ log_line_parser --show-current-settings --bots-config=bots_config.yml
251
+
252
+
204
253
  ## Development
205
254
 
206
255
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment. Run `bundle exec log_line_parser` to use the code located in this directory, ignoring other installed copies of this gem.
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module LogLineParser
4
+ module Bots
5
+ module ConfigLabels
6
+ INHERIT_DEFAULT_BOTS = "inherit_default_bots"
7
+ BOTS = "bots"
8
+ BOTS_RE = "bots_re"
9
+ end
10
+
11
+ DEFAULT_BOTS = %w(
12
+ Googlebot
13
+ Googlebot-Mobile
14
+ Mediapartners-Google
15
+ Bingbot
16
+ Slurp
17
+ Baiduspider
18
+ BaiduImagespider
19
+ BaiduMobaider
20
+ YetiBot
21
+ Applebot
22
+ )
23
+
24
+ DEFAULT_CONFIG = {
25
+ ConfigLabels::INHERIT_DEFAULT_BOTS => true,
26
+ ConfigLabels::BOTS => [],
27
+ ConfigLabels::BOTS_RE => nil
28
+ }
29
+
30
+ def self.compile_bots_re(bots_config=DEFAULT_CONFIG)
31
+ escaped_re = compile_escaped_re(bots_config)
32
+ re = compile_re(bots_config)
33
+ return Regexp.union(escaped_re, re) if escaped_re && re
34
+ escaped_re || re
35
+ end
36
+
37
+ def self.compile_escaped_re(bots_config)
38
+ bot_names = bots_config[ConfigLabels::BOTS] || []
39
+ if bots_config[ConfigLabels::INHERIT_DEFAULT_BOTS]
40
+ bot_names = (DEFAULT_BOTS + bot_names).uniq
41
+ end
42
+ return if bot_names.empty?
43
+ escaped_bots_str = bot_names.map {|name| Regexp.escape(name) }.join("|")
44
+ Regexp.compile(escaped_bots_str, Regexp::IGNORECASE, "n")
45
+ end
46
+
47
+ def self.compile_re(bots_config)
48
+ bots_pats = bots_config[ConfigLabels::BOTS_RE]
49
+ Regexp.compile(bots_pats.join("|"), nil, "n") if bots_pats
50
+ end
51
+
52
+ private_class_method :compile_escaped_re, :compile_re
53
+
54
+ DEFAULT_RE = compile_bots_re
55
+ end
56
+ end
@@ -12,10 +12,6 @@ module LogLineParser
12
12
 
13
13
  DEFAULT_FORMAT = "csv"
14
14
 
15
- def self.read_configs(config)
16
- YAML.load_stream(config).to_a
17
- end
18
-
19
15
  def self.parse_options
20
16
  options = {}
21
17
 
@@ -25,6 +21,17 @@ module LogLineParser
25
21
  options[:config_file] = config_file
26
22
  end
27
23
 
24
+ opt.on("-b [bots_config_file]", "--bots-config [=bots_config_file]",
25
+ "Give a configuration file in yaml format. \
26
+ Default bots: #{Bots::DEFAULT_BOTS.join(', ')}") do |config_file|
27
+ options[:bots_config_file] = config_file
28
+ end
29
+
30
+ opt.on("-s", "--show-current-settings",
31
+ "Show the detail of the current settings") do
32
+ options[:show_settings] = true
33
+ end
34
+
28
35
  opt.on("-f", "--filter-mode",
29
36
  "Mode for choosing log records that satisfy certain criteria") do
30
37
  options[:filter_mode] = true
@@ -52,12 +59,6 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
52
59
  options
53
60
  end
54
61
 
55
- def self.load_config_file(config_file)
56
- open(File.expand_path(config_file)) do |f|
57
- read_configs(f.read)
58
- end
59
- end
60
-
61
62
  def self.choose_log_parser(log_format)
62
63
  return LogLineParser::CombinedLogParser unless log_format
63
64
  parser = LogLineParser::PREDEFINED_FORMATS[log_format]
@@ -66,24 +67,28 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
66
67
 
67
68
  def self.execute
68
69
  options = parse_options
69
- if options[:filter_mode]
70
+ if options[:show_settings]
71
+ show_settings(options)
72
+ elsif options[:filter_mode]
70
73
  execute_as_filter(options)
71
74
  else
72
75
  execute_as_converter(options)
73
76
  end
74
77
  end
75
78
 
79
+ def self.show_settings(options)
80
+ bots_re = compile_bots_re_from_config_file(options[:bots_config_file])
81
+ parser = choose_log_parser(options[:log_format])
82
+ puts "The regular expression for bots: #{bots_re}"
83
+ puts "LogFormat: #{parser.format_strings}"
84
+ end
85
+
76
86
  def self.execute_as_filter(options)
77
- configs = load_config_file(options[:config_file])
87
+ configs = Utils.load_config_file(options[:config_file])
78
88
  parser = choose_log_parser(options[:log_format])
79
89
  output_dir = options[:output_dir]
80
- output_log_names = collect_output_log_names(configs)
81
- Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
82
- queries = setup_queries_from_configs(configs, logs)
83
- LogLineParser.each_record(parser: parser) do |line, record|
84
- queries.each {|query| query.call(line, record) }
85
- end
86
- end
90
+ bots_re = compile_bots_re_from_config_file(options[:bots_config_file])
91
+ execute_queries(configs, parser, output_dir, bots_re)
87
92
  end
88
93
 
89
94
  def self.execute_as_converter(options, output=STDOUT, input=ARGF)
@@ -101,7 +106,7 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
101
106
  end
102
107
  end
103
108
 
104
- private
109
+ # private class methods
105
110
 
106
111
  def self.predefined_options_for_log_format
107
112
  PREDEFINED_FORMATS.keys.
@@ -109,15 +114,31 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
109
114
  join(", ")
110
115
  end
111
116
 
117
+ def self.compile_bots_re_from_config_file(bots_config_file)
118
+ return Bots::DEFAULT_RE unless bots_config_file
119
+ configs = Utils.load_config_file(bots_config_file)[0]
120
+ Bots.compile_bots_re(configs)
121
+ end
122
+
112
123
  def self.collect_output_log_names(configs)
113
124
  configs.map do |config|
114
125
  config[Query::ConfigFields::OUTPUT_LOG_NAME]
115
126
  end
116
127
  end
117
128
 
118
- def self.setup_queries_from_configs(configs, logs)
129
+ def self.execute_queries(configs, parser, output_dir, bots_re)
130
+ output_log_names = collect_output_log_names(configs)
131
+ Utils.open_multiple_output_files(output_log_names, output_dir) do |logs|
132
+ queries = setup_queries_from_configs(configs, logs, bots_re)
133
+ LogLineParser.each_record(parser: parser) do |line, record|
134
+ queries.each {|query| query.call(line, record) }
135
+ end
136
+ end
137
+ end
138
+
139
+ def self.setup_queries_from_configs(configs, logs, bots_re)
119
140
  configs.map do |config|
120
- Query.register_query_to_log(config, logs)
141
+ Query.register_query_to_log(config, logs, bots_re)
121
142
  end
122
143
  end
123
144
 
@@ -138,5 +159,14 @@ formats predefined as #{predefined_options_for_log_format}") do |log_format|
138
159
  output.puts parser.to_ltsv(line.chomp)
139
160
  end
140
161
  end
162
+
163
+ private_class_method(:predefined_options_for_log_format,
164
+ :compile_bots_re_from_config_file,
165
+ :collect_output_log_names,
166
+ :execute_queries,
167
+ :setup_queries_from_configs,
168
+ :convert_to_csv,
169
+ :convert_to_tsv,
170
+ :convert_to_ltsv)
141
171
  end
142
172
  end
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'log_line_parser/bots'
4
+
3
5
  module LogLineParser
4
6
  class Query
5
7
  class NotAllowableMethodError < StandardError; end
@@ -18,20 +20,12 @@ module LogLineParser
18
20
 
19
21
  TAIL_SLASH_RE = /\/$/
20
22
  SLASH = '/'
21
- DEFAULT_BOTS = %w(
22
- Googlebot
23
- Googlebot-Mobile
24
- Mediapartners-Google
25
- Bingbot
26
- Slurp
27
- Baiduspider
28
- BaiduImagespider
29
- BaiduMobaider
30
- YetiBot
31
- )
23
+
24
+ IMAGE_FILE_RE = /\.(?:jpe?g|png|gif|ico|tiff?|bmp|svgz?|webp)$/in
32
25
 
33
26
  ALLOWABLE_METHODS = [
34
27
  :access_by_bots?,
28
+ :access_to_image?,
35
29
  :referred_from_resources?,
36
30
  :referred_from_under_resources?,
37
31
  :access_to_resources?,
@@ -64,15 +58,12 @@ YetiBot
64
58
  MATCH_TYPE = "match_type" # The value should be "all" or "any".
65
59
  end
66
60
 
67
- def self.compile_bots_re(bot_names=DEFAULT_BOTS)
68
- bots_str = bot_names.map {|name| Regexp.escape(name) }.join("|")
69
- Regexp.compile(bots_str, Regexp::IGNORECASE)
61
+ def self.access_by_bots?(record, bots_re=Bots::DEFAULT_RE)
62
+ bots_re =~ record.user_agent
70
63
  end
71
64
 
72
- DEFAULT_BOTS_RE = compile_bots_re
73
-
74
- def self.access_by_bots?(record, bots_re=DEFAULT_BOTS_RE)
75
- bots_re =~ record.user_agent
65
+ def self.access_to_image?(record)
66
+ IMAGE_FILE_RE =~ record.resource
76
67
  end
77
68
 
78
69
  ##
@@ -95,10 +86,15 @@ YetiBot
95
86
  record.resource.start_with?(path)
96
87
  end
97
88
 
89
+ def self.referred_from_host?(record, host_name)
90
+ record.referer_host == host_name
91
+ end
92
+
98
93
  class << self
99
- def register_query_to_log(option, logs)
94
+ def register_query_to_log(option, logs, bots_re=Bots::DEFAULT_RE)
100
95
  query = Query.new(domain: option[ConfigFields::HOST_NAME],
101
- resources: option[ConfigFields::RESOURCES])
96
+ resources: option[ConfigFields::RESOURCES],
97
+ bots_re: bots_re)
102
98
  queries = option[ConfigFields::MATCH]
103
99
  reject_unacceptable_queries(queries)
104
100
  log = logs[option[ConfigFields::OUTPUT_LOG_NAME]]
@@ -176,15 +172,20 @@ YetiBot
176
172
  end
177
173
  end
178
174
 
179
- def initialize(domain: nil, resources: [])
175
+ def initialize(domain: nil, resources: [], bots_re: Bots::DEFAULT_RE)
180
176
  @domain = domain
181
177
  @resources = normalize_resources(resources)
178
+ @bots_re = bots_re
182
179
  @normalized_resources = normalize_resources(resources)
183
180
  @normalized_dirs = @normalized_resources - @resources
184
181
  end
185
182
 
186
- def access_by_bots?(record, bots_re=DEFAULT_BOTS_RE)
187
- bots_re =~ record.user_agent
183
+ def access_by_bots?(record)
184
+ @bots_re =~ record.user_agent
185
+ end
186
+
187
+ def access_to_image?(record)
188
+ IMAGE_FILE_RE =~ record.resource
188
189
  end
189
190
 
190
191
  ##
@@ -3,6 +3,7 @@
3
3
  require 'log_line_parser'
4
4
  require 'log_line_parser/query'
5
5
  require 'csv'
6
+ require 'yaml'
6
7
 
7
8
  module LogLineParser
8
9
  module Utils
@@ -15,10 +16,6 @@ module LogLineParser
15
16
  }
16
17
  SPECIAL_CHARS_RE = Regexp.compile(SPECIAL_CHARS.keys.join("|"))
17
18
 
18
- def self.access_by_bots?(record, bots_re=Query::DEFAULT_BOTS_RE)
19
- Query.access_by_bots?(record, bots_re)
20
- end
21
-
22
19
  def self.open_multiple_output_files(base_names, dir=nil, ext="log")
23
20
  logs = {}
24
21
  filepath = dir ? File.join(dir, "%s.#{ext}") : "%s.#{ext}"
@@ -32,6 +29,16 @@ module LogLineParser
32
29
  end
33
30
  end
34
31
 
32
+ def self.read_configs(config)
33
+ YAML.load_stream(config).to_a
34
+ end
35
+
36
+ def self.load_config_file(config_file)
37
+ open(File.expand_path(config_file)) do |f|
38
+ read_configs(f.read)
39
+ end
40
+ end
41
+
35
42
  def self.to_tsv(line, escape=true)
36
43
  LogLineParser.parse(line).to_a.map do |field|
37
44
  escape ? escape_special_chars(field) : field
@@ -42,12 +49,14 @@ module LogLineParser
42
49
  LogLineParser.parse(line).to_a.to_csv
43
50
  end
44
51
 
45
- private
52
+ # private class methods
46
53
 
47
54
  def self.escape_special_chars(field)
48
55
  field.gsub(SPECIAL_CHARS_RE) do |char|
49
56
  SPECIAL_CHARS[char]
50
57
  end
51
58
  end
59
+
60
+ private_class_method :escape_special_chars
52
61
  end
53
62
  end
@@ -1,3 +1,3 @@
1
1
  module LogLineParser
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -197,7 +197,7 @@ module LogLineParser
197
197
  #
198
198
  # For example,
199
199
  #
200
- # parser = LogLineParse.parser("%h %l %u %t \"%r\" %>s %b")
200
+ # parser = LogLineParser.parser("%h %l %u %t \"%r\" %>s %b")
201
201
  #
202
202
  # creates the parser of Common Log Format.
203
203
 
@@ -1 +1,2 @@
1
1
  192.168.3.4 - - [07/Feb/2016:07:39:42 +0900] "GET /index.html HTTP/1.1" 200 432 "http://www.example.org" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
2
+ 192.168.3.4 - - [07/Feb/2016:07:39:42 +0900] "GET /index.html HTTP/1.1" 200 432 "http://www.example.org" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Applebot/0.1)"
@@ -10,3 +10,4 @@
10
10
  192.168.3.4 - quidam [07/Feb/2016:07:39:42 +0900] "GET /index.html HTTP/1.1" 200 432 "http://www.example.net/external.html" "Mozilla/5.0 (X11; U; Linux i686; ja-JP; rv:1.7.5) Gecko/20041108 Firefox/1.0"
11
11
  192.168.3.4 - quidam [07/Feb/2016:07:39:42 +0900] "GET /index.html HTTP/1.1" 200 432 "http://www.example.net/external2.html" "Mozilla/5.0 (X11; U; Linux i686; ja-JP; rv:1.7.5) Gecko/20041108 Firefox/1.0"
12
12
  192.168.3.4 - - [07/Feb/2016:07:39:42 +0900] "GET /index.html HTTP/1.1" 200 432 "http://www.example.org" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
13
+ 192.168.3.4 - - [07/Feb/2016:07:39:42 +0900] "GET /index.html HTTP/1.1" 200 432 "http://www.example.org" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Applebot/0.1)"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: log_line_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - HASHIMOTO, Naoki
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-03-27 00:00:00.000000000 Z
11
+ date: 2016-04-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -60,6 +60,7 @@ files:
60
60
  - exe/log_line_parser
61
61
  - lib/log_line_parser.rb
62
62
  - lib/log_line_parser/apache.rb
63
+ - lib/log_line_parser/bots.rb
63
64
  - lib/log_line_parser/command_line_interface.rb
64
65
  - lib/log_line_parser/line_parser.rb
65
66
  - lib/log_line_parser/ltsv.rb