log_line_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,211 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module LineParser
4
# Splits a string into "special" tokens (delimiters) and the runs of ordinary
# text found between them.
#
# Configuration lives in class-level instance variables: call
# Tokenizer.setup once, then Tokenizer.tokenize as often as needed. A single
# StringScanner is created by setup and reused by every tokenize call.
class Tokenizer
  # Matches at no position of any string. Used in place of the token
  # patterns when no usable special token is configured, because an empty
  # pattern would match the empty string everywhere and tokenize would
  # never make progress.
  NEVER_MATCH_RE = /(?!)/

  class << self
    attr_reader :special_token_re, :non_special_token_re

    # Breaks +str+ into an array of tokens.
    #
    # str::    the String to split.
    # tokens:: the Array the tokens are appended to (a new one by default).
    #
    # Returns +tokens+. Text left over after the last special token is
    # appended as a final element.
    def tokenize(str, tokens=[])
      @scanner.string = str
      while (token = scan_token)
        # A zero-width match does not advance the scanner; looping on it
        # would never end, so whatever is left is handled as trailing text.
        break if token.empty?
        tokens.push token
      end

      tokens.push @scanner.rest unless @scanner.eos?
      tokens
    end

    # Configures the tokenizer.
    #
    # special_tokens::           literal strings treated as tokens; they are
    #                            regexp-escaped before use.
    # unescaped_special_tokens:: regexp source fragments used verbatim.
    def setup(special_tokens, unescaped_special_tokens=[])
      @special_tokens = special_tokens
      @unescaped_special_tokens = unescaped_special_tokens
      @scanner = StringScanner.new("".freeze)
      @special_token_re, @non_special_token_re = compose_re(@special_tokens)
    end

    private

    # Returns the next token: a special token located exactly at the scan
    # position, otherwise the text up to (not including) the next special
    # token. Returns nil when neither can be found.
    def scan_token
      @scanner.scan(@special_token_re) ||
        @scanner.scan_until(@non_special_token_re)
    end

    # Builds the alternation source for all special tokens. Longer literal
    # tokens are listed first so that they win over tokens that are their
    # prefixes. Empty tokens are dropped: an empty alternative matches
    # everywhere without consuming input.
    def compose_special_tokens_str(special_tokens)
      usable = special_tokens.reject {|token| token.empty? }
      sorted = usable.sort {|x, y| y.length <=> x.length }
      escaped = sorted.map {|token| Regexp.escape(token) }
      if @unescaped_special_tokens
        escaped.concat @unescaped_special_tokens.reject {|token| token.empty? }
      end
      escaped.join('|')
    end

    # Returns two regexps: one matching a special token, and one (a
    # lookahead) matching the position right before the next special token.
    def compose_re(special_tokens)
      tokens_str = compose_special_tokens_str(special_tokens)
      return NEVER_MATCH_RE, NEVER_MATCH_RE if tokens_str.empty?
      return Regexp.compile(tokens_str), Regexp.compile("(?=#{tokens_str})")
    end
  end
end
46
+
47
# Builds a tree of nodes from a stream of tokens.
#
# The stack is a chain of nodes linked through +node_below+; +current_node+
# is the innermost open node and +root+ the outermost one. The node classes
# to instantiate are configured per class with NodeStack.setup.
class NodeStack
  class << self
    attr_reader :root_node_class, :default_node_class

    # Registers the class of the root node and the class of the node that
    # is opened automatically when a plain token arrives at the root.
    def setup(root_node_class, default_node_class)
      @root_node_class = root_node_class
      @default_node_class = default_node_class
    end
  end

  attr_reader :current_node, :root

  def initialize
    @default_node_class = self.class.default_node_class
    @root = self.class.root_node_class.new
    @current_node = @root
  end

  # Adds +node+ as a child of the current node and makes it the new
  # current node. Returns +node+.
  def push_node(node)
    @current_node.push node
    node.node_below = @current_node
    @current_node = node
  end

  # Closes the current node: the node below it becomes current again and
  # the link from the closed node is cleared. Returns the closed node.
  def pop
    @current_node.tap do |closed|
      @current_node = closed.node_below
      closed.node_below = nil
    end
  end

  # Appends +token+ to the current node.
  def push_token(token)
    @current_node.push token
  end

  # Dispatches +token+ according to the state of the current node. The
  # checks are tried in this order: escape handling, end tag, start tag of
  # a subnode, ignorable token, ordinary token.
  def push(token)
    return push_escaped_token(token) if @current_node.kind_of?(EscapeNode)
    return pop if @current_node.end_tag?(token)

    subnode_class = @current_node.subnode_class(token)
    return push_node(subnode_class.new) if subnode_class
    return nil if @current_node.can_ignore?(token)

    # Ordinary tokens are never stored directly in the root node; a node
    # of the default class is opened first.
    push_node(@default_node_class.new) if @current_node == @root
    push_token(token)
  end

  # Handles the token that follows an escape start tag. The escapable
  # prefix of +token+ (if any) is stored in the escape node, the escape
  # node is closed, and the rest of +token+ goes to the node below. When
  # +token+ has no escapable prefix, the escape node is closed and
  # +token+ is not stored anywhere.
  def push_escaped_token(token)
    escaped = @current_node.part_to_be_escaped(token)
    if escaped
      leftover = @current_node.remove_escaped_part(token)
      push_token(escaped)
    end
    pop
    push_token(leftover) if leftover
  end
end
108
+
109
# A node of the parse tree. It collects tokens and child nodes in
# +subnodes+.
#
# Each node class carries its own configuration in class-level instance
# variables (start tag, end tag, ignorable tokens, allowed subnode
# classes), so Node.setup has to be called on every class before it is
# instantiated.
class Node
  class << self
    attr_reader :start_tag, :end_tag, :subnode_classes
    attr_reader :start_tag_to_subnode, :tokens_to_be_ignored

    # Declares which node classes may open inside this one. Each class is
    # indexed by its start tag so that subnode_class can look it up.
    def register_subnode_classes(*subnode_classes)
      @subnode_classes = subnode_classes
      subnode_classes.each {|klass| @start_tag_to_subnode[klass.start_tag] = klass }
    end

    # Sets the delimiters of this node class and resets its subnode
    # table. +to_be_ignored+ lists tokens that are skipped inside the node.
    def setup(start_tag, end_tag, to_be_ignored=[])
      @start_tag = start_tag
      @end_tag = end_tag
      @start_tag_to_subnode = {}
      @tokens_to_be_ignored = []
      @tokens_to_be_ignored.concat(to_be_ignored) if to_be_ignored
    end
  end

  # The node one level closer to the root while this node is on a stack.
  attr_accessor :node_below
  attr_reader :subnodes

  def initialize
    @self_class = self.class
    @subnodes = []
    # Remembered once so that can_ignore? can skip the lookup entirely.
    @cannot_ignore = @self_class.tokens_to_be_ignored.empty?
  end

  # Visitor entry point: hands this node and +memo+ to visitor.visit.
  def accept(visitor, memo=nil)
    visitor.visit(self, memo)
  end

  # Concatenation of the string forms of all subnodes.
  def to_s
    @subnodes.join
  end

  # The node class opened by +token+, or nil when +token+ is not the start
  # tag of a registered subnode class.
  def subnode_class(token)
    @self_class.start_tag_to_subnode[token]
  end

  # True when +token+ closes this node.
  def end_tag?(token)
    @self_class.end_tag == token
  end

  # True when +token+ is one of the tokens this node class ignores.
  def can_ignore?(token)
    !@cannot_ignore && @self_class.tokens_to_be_ignored.include?(token)
  end

  # Appends a token or child node.
  def push(token)
    @subnodes.push token
  end
end
168
+
169
# A node opened by an escape start tag. In addition to the Node
# configuration, each class keeps the list of strings that may follow the
# escape tag and a regexp matching any of them at the start of a token.
class EscapeNode < Node
  class << self
    attr_reader :to_be_escaped, :to_be_escaped_re

    # Same as Node.setup, plus +to_be_escaped+: the strings that are
    # recognized at the start of the token following the escape tag.
    def setup(start_tag, end_tag, to_be_ignored=[], to_be_escaped=[])
      super(start_tag, end_tag, to_be_ignored)
      @to_be_escaped = to_be_escaped
      @to_be_escaped_re = compile_to_be_escaped_re(to_be_escaped)
    end

    # Builds a regexp, anchored at the start of the string, that matches
    # any of the (regexp-escaped) strings in +to_be_escaped+.
    def compile_to_be_escaped_re(to_be_escaped)
      alternatives = to_be_escaped.map {|str| Regexp.escape(str) }
      Regexp.new("\\A(?:#{alternatives.join("|")})")
    end
  end

  # Returns +token+ without the escapable string at its start.
  def remove_escaped_part(token)
    prefix_re = @self_class.to_be_escaped_re
    token.sub(prefix_re, ''.freeze)
  end

  # Returns the first configured string that +token+ starts with, or nil
  # when there is none.
  def part_to_be_escaped(token)
    @self_class.to_be_escaped.find {|candidate| token.start_with?(candidate) }
  end
end
196
+
197
# Shortcuts for declaring node classes in bulk. define_nodes calls
# const_set on self, so these methods have to be available on a Module
# (for example by extending it with Helpers).
module Helpers
  # For every +name+ => +setup_values+ pair, creates a subclass of Node,
  # stores it in the constant +name+ of the receiver and passes
  # +setup_values+ to its setup method.
  def define_nodes(class_name_and_setup_values)
    class_name_and_setup_values.each do |const_name, setup_args|
      const_set(const_name, Class.new(Node)).setup(*setup_args)
    end
  end

  # For every +parent+ => +children+ pair, registers +children+ as the
  # subnode classes of +parent+.
  def define_node_nesting(parent_children={})
    parent_children.each do |parent_class, child_classes|
      parent_class.register_subnode_classes(*child_classes)
    end
  end
end
211
+ end
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'log_line_parser'
4
+ require 'log_line_parser/utils'
5
+
6
+ # MoeLogParser is added from the personal needs of the original author,
7
+ # and the LogFormat for it is not a widely used format.
8
+ # You may remove this file if you don't need it.
9
+ # (MOE is the acronym of the organization's name for which the author
10
+ # is working at the time of the first release of this program.)
11
+
12
module LogLineParser
  # The combined log format with an additional trailing "%D" field.
  MoeLogFormat = '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i" %D'

  # Parser built from MoeLogFormat, registered under the name 'moe' among
  # the predefined formats.
  MoeLogParser = parser(MoeLogFormat)
  PREDEFINED_FORMATS['moe'] = MoeLogParser
end
18
+
@@ -0,0 +1,290 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module LogLineParser
4
# Predicates over parsed access-log records, plus a factory
# (Query.register_query_to_log) that turns one configuration entry into a
# callback which copies matching lines to a log.
#
# A "record" is any object that answers the readers used by the predicate
# being called: user_agent, resource, referer_resource, referer_host,
# last_request_status and method.
class Query
  # Raised when a configuration names a query that is not listed in
  # ALLOWABLE_METHODS.
  class NotAllowableMethodError < StandardError; end

  # Request method names compared against record.method.
  module HttpMethods
    OPTIONS = "OPTIONS"
    GET = "GET"
    HEAD = "HEAD"
    POST = "POST"
    PUT = "PUT"
    DELETE = "DELETE"
    TRACE = "TRACE"
    CONNECT = "CONNECT"
    PATCH = "PATCH"
  end

  TAIL_SLASH_RE = /\/$/
  SLASH = '/'

  # Names looked for (case-insensitively) in the user agent by
  # access_by_bots?.
  DEFAULT_BOTS = %w(
    Googlebot
    Googlebot-Mobile
    Mediapartners-Google
    Bingbot
    Slurp
    Baiduspider
    BaiduImagespider
    BaiduMobaider
    YetiBot
  )

  # The only instance methods a configuration is allowed to name in its
  # "match" and "ignore_match" lists.
  ALLOWABLE_METHODS = [
    :access_by_bots?,
    :referred_from_resources?,
    :referred_from_under_resources?,
    :access_to_resources?,
    :access_to_under_resources?,
    :status_code_206?,
    :status_code_301?,
    :status_code_304?,
    :status_code_404?,
    :partial_content?,
    :moved_permanently?,
    :not_modified?,
    :not_found?,
    :options_method?,
    :get_method?,
    :head_method?,
    :post_method?,
    :put_method?,
    :delete_method?,
    :trace_method?,
    :connect_method?,
    :patch_method?,
  ]

  # Keys read from a configuration entry by register_query_to_log.
  module ConfigFields
    HOST_NAME = "host_name"
    RESOURCES = "resources"
    MATCH = "match"
    IGNORE_MATCH = "ignore_match"
    OUTPUT_LOG_NAME = "output_log_name"
    MATCH_TYPE = "match_type" # The value should be "all" or "any".
  end

  # Builds a case-insensitive regexp matching any of +bot_names+.
  def self.compile_bots_re(bot_names=DEFAULT_BOTS)
    bots_str = bot_names.map {|name| Regexp.escape(name) }.join("|")
    Regexp.compile(bots_str, Regexp::IGNORECASE)
  end

  DEFAULT_BOTS_RE = compile_bots_re

  # Truthy (the match position) when the user agent matches +bots_re+.
  def self.access_by_bots?(record, bots_re=DEFAULT_BOTS_RE)
    bots_re =~ record.user_agent
  end

  def self.referred_from_resources?(record, resources=[])
    resources.include?(record.referer_resource)
  end

  def self.referred_from_under?(record, path)
    record.referer_resource.start_with?(path)
  end

  def self.access_to_resources?(record, resources=[])
    resources.include?(record.resource)
  end

  def self.access_to_under?(record, path)
    record.resource.start_with?(path)
  end

  class << self
    # Builds a callback from one configuration entry.
    #
    # option:: a Hash keyed by the ConfigFields constants.
    # logs::   a Hash from output log names to objects responding to print.
    #
    # Returns a proc taking (line, record) that prints +line+ to the
    # selected log when +record+ satisfies the configured queries.
    # Raises NotAllowableMethodError when "match" or "ignore_match" names
    # a query outside ALLOWABLE_METHODS.
    def register_query_to_log(option, logs)
      query = Query.new(domain: option[ConfigFields::HOST_NAME],
                        resources: option[ConfigFields::RESOURCES])
      queries = option[ConfigFields::MATCH]
      reject_unacceptable_queries(queries)
      log = logs[option[ConfigFields::OUTPUT_LOG_NAME]]
      match_type = option[ConfigFields::MATCH_TYPE]
      ignore_match = option[ConfigFields::IGNORE_MATCH]
      reject_unacceptable_queries(ignore_match) if ignore_match
      compile_query(match_type, log, query, queries, ignore_match)
    end

    private

    def reject_unacceptable_queries(queries)
      unacceptable_queries = queries - ALLOWABLE_METHODS
      return if unacceptable_queries.empty?

      message = error_message_for_unacceptable_queries(unacceptable_queries)
      raise NotAllowableMethodError.new(message)
    end

    def error_message_for_unacceptable_queries(unacceptable_queries)
      query_names = unacceptable_queries.join(", ")
      if unacceptable_queries.length == 1
        "An unacceptable query is set: #{query_names}"
      else
        "Unacceptable queries are set: #{query_names}"
      end
    end

    # The query names have been checked against ALLOWABLE_METHODS, all of
    # which are public, hence public_send in the callbacks below.

    def log_if_all_match(log, query, queries)
      proc do |line, record|
        if queries.all? {|method| query.public_send(method, record) }
          log.print line
        end
      end
    end

    def log_if_any_match(log, query, queries)
      proc do |line, record|
        if queries.any? {|method| query.public_send(method, record) }
          log.print line
        end
      end
    end

    def log_if_all_match_but(log, query, queries, ignore_match)
      proc do |line, record|
        if queries.all? {|method| query.public_send(method, record) } &&
            ignore_match.none? {|method| query.public_send(method, record) }
          log.print line
        end
      end
    end

    def log_if_any_match_but(log, query, queries, ignore_match)
      proc do |line, record|
        if queries.any? {|method| query.public_send(method, record) } &&
            ignore_match.none? {|method| query.public_send(method, record) }
          log.print line
        end
      end
    end

    # Picks the callback for +match_type+: "all" requires every query to
    # hold, any other value requires at least one.
    def compile_query(match_type, log, query, queries, ignore_match)
      if match_type == "all".freeze
        if ignore_match
          return log_if_all_match_but(log, query, queries, ignore_match)
        end
        log_if_all_match(log, query, queries)
      else
        if ignore_match
          return log_if_any_match_but(log, query, queries, ignore_match)
        end
        log_if_any_match(log, query, queries)
      end
    end
  end

  # domain::    host the referer has to belong to; nil disables the check.
  # resources:: paths of interest. A path ending with a slash denotes a
  #             directory, which is also matched without the slash.
  #             nil is treated as an empty list.
  def initialize(domain: nil, resources: [])
    @domain = domain
    # Keep the paths as given: they are the prefixes used by the
    # "under resources" predicates. Using the normalized list here would
    # make "/dir" a prefix and match unrelated paths like "/directory".
    @resources = Array(resources)
    @normalized_resources = normalize_resources(@resources)
    # The slash-less directory names that normalization added.
    @normalized_dirs = @normalized_resources - @resources
  end

  def access_by_bots?(record, bots_re=DEFAULT_BOTS_RE)
    bots_re =~ record.user_agent
  end

  # True when the referer is on the configured domain and is exactly one
  # of the resources.
  def referred_from_resources?(record)
    if_matching_domain(record) &&
      @normalized_resources.include?(record.referer_resource)
  end

  # True when the referer is on the configured domain and is one of the
  # resources or lies under one of them.
  def referred_from_under_resources?(record)
    # The domain check guards both alternatives below.
    return false unless if_matching_domain(record)

    referer_resource = record.referer_resource
    @normalized_dirs.include?(referer_resource) ||
      @resources.any? {|target| referer_resource.start_with?(target) }
  end

  def access_to_resources?(record)
    @normalized_resources.include?(record.resource)
  end

  def access_to_under_resources?(record)
    resource = record.resource
    @normalized_dirs.include?(resource) ||
      @resources.any? {|target| resource.start_with?(target) }
  end

  def status_code_206?(record)
    record.last_request_status == 206
  end

  def status_code_301?(record)
    record.last_request_status == 301
  end

  def status_code_304?(record)
    record.last_request_status == 304
  end

  def status_code_404?(record)
    record.last_request_status == 404
  end

  alias :partial_content? :status_code_206?
  alias :moved_permanently? :status_code_301?
  alias :not_modified? :status_code_304?
  alias :not_found? :status_code_404?

  def options_method?(record)
    record.method == HttpMethods::OPTIONS
  end

  def get_method?(record)
    record.method == HttpMethods::GET
  end

  def head_method?(record)
    record.method == HttpMethods::HEAD
  end

  def post_method?(record)
    record.method == HttpMethods::POST
  end

  def put_method?(record)
    record.method == HttpMethods::PUT
  end

  def delete_method?(record)
    record.method == HttpMethods::DELETE
  end

  def trace_method?(record)
    record.method == HttpMethods::TRACE
  end

  def connect_method?(record)
    record.method == HttpMethods::CONNECT
  end

  def patch_method?(record)
    record.method == HttpMethods::PATCH
  end

  private

  def if_matching_domain(record)
    # When @domain is not set, it should be ignored.
    !@domain || @domain == record.referer_host
  end

  # Returns +resources+ with, for every path ending in a slash (other
  # than "/" itself), the same path without the slash inserted before it.
  def normalize_resources(resources)
    resources.each_with_object([]) do |resource, normalized|
      # record.referer_resource is expected to return '/'
      # even when the value of record.referer doesn't end
      # with a slash (e.g. 'http://www.example.org').
      # So in the normalized result, you don't have to include
      # an empty string that corresponds to the root of a given
      # domain.
      if TAIL_SLASH_RE =~ resource && SLASH != resource
        normalized.push resource.sub(TAIL_SLASH_RE, "".freeze)
      end

      normalized.push resource
    end
  end
end
290
+ end
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'log_line_parser'
4
+ require 'log_line_parser/query'
5
+ require 'csv'
6
+
7
+ module LogLineParser
8
# Convenience functions built on top of LogLineParser.parse and Query.
module Utils
  TAB = "\t"

  # Maps every character that must not appear raw in a TSV field to its
  # backslash escape sequence. Each key is a single character, because
  # that is what SPECIAL_CHARS_RE matches and what is looked up here.
  SPECIAL_CHARS = {
    "\t" => '\\t',
    "\n" => '\\n',
    "\r" => '\\r',
    "\\" => '\\\\',
  }
  SPECIAL_CHARS_RE = Regexp.union(SPECIAL_CHARS.keys)

  # Delegates to Query.access_by_bots?.
  def self.access_by_bots?(record, bots_re=Query::DEFAULT_BOTS_RE)
    Query.access_by_bots?(record, bots_re)
  end

  # Opens one file per base name for writing, yields them and closes them
  # all afterwards, even when the block raises.
  #
  # base_names:: names of the files without extension.
  # dir::        directory the files are created in (the current directory
  #              when nil).
  # ext::        file extension, without the dot.
  #
  # Yields a Hash from base name to the open File. Returns the value of
  # the block.
  def self.open_multiple_output_files(base_names, dir=nil, ext="log")
    logs = {}
    base_names.each do |base|
      # The path is assembled directly instead of through a format
      # string, so a "%" in dir or ext is taken literally. File.open,
      # unlike Kernel#open, never treats a leading "|" as a command.
      file_name = "#{base}.#{ext}"
      path = dir ? File.join(dir, file_name) : file_name
      logs[base] = File.open(path, "w")
    end
    yield logs
  ensure
    logs.each_value do |file|
      file.close unless file.closed?
    end
  end

  # Parses +line+ and returns its fields joined by tabs. Unless +escape+
  # is false, tabs, newlines, carriage returns and backslashes inside the
  # fields are backslash-escaped.
  def self.to_tsv(line, escape=true)
    LogLineParser.parse(line).to_a.map do |field|
      escape ? escape_special_chars(field) : field
    end.join(TAB)
  end

  # Parses +line+ and returns its fields as one CSV row.
  def self.to_csv(line)
    LogLineParser.parse(line).to_a.to_csv
  end

  # Replaces every character listed in SPECIAL_CHARS with its escape
  # sequence.
  #
  # This method is public: the bare `private` that used to precede it did
  # not apply to methods defined with `def self.`, so it was dropped
  # rather than suddenly hiding a method callers could reach.
  def self.escape_special_chars(field)
    field.gsub(SPECIAL_CHARS_RE, SPECIAL_CHARS)
  end
end
53
+ end
@@ -0,0 +1,3 @@
1
module LogLineParser
  # Version string of this release of the package.
  VERSION = '0.1.0'
end