log_line_parser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,211 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module LineParser
4
# Splits a string into "special" tokens (the delimiters registered
# through Tokenizer.setup) and the runs of ordinary text between them.
#
# All state is kept at class level, so Tokenizer.setup has to be called
# before Tokenizer.tokenize.  StringScanner comes from the standard
# library 'strscan', which must be loaded by the time setup runs.
class Tokenizer
  class << self
    attr_reader :special_token_re, :non_special_token_re

    # Breaks +str+ into tokens and appends them, in order, to +tokens+.
    #
    # str    - the String to tokenize.
    # tokens - the Array that receives the tokens (a new one by default).
    #
    # Returns +tokens+.
    def tokenize(str, tokens=[])
      @scanner.string = str
      while (token = scan_token)
        tokens.push token
      end

      # Whatever follows the last special token is ordinary text.
      tokens.push @scanner.rest unless @scanner.eos?
      tokens
    end

    # Installs the token patterns used by tokenize.
    #
    # special_tokens           - Strings matched literally (they are
    #                            passed through Regexp.escape).
    # unescaped_special_tokens - regexp source fragments that are added
    #                            to the pattern as they are.
    def setup(special_tokens, unescaped_special_tokens=[])
      @special_tokens = special_tokens
      @unescaped_special_tokens = unescaped_special_tokens
      @scanner = StringScanner.new("".freeze)
      @special_token_re, @non_special_token_re = compose_re(@special_tokens)
    end

    private

    # Returns the next token: a special token when one starts at the
    # current position, otherwise the text up to the next special token.
    # Returns nil when neither can be found.
    def scan_token
      token = @scanner.scan(@special_token_re) ||
              @scanner.scan_until(@non_special_token_re)
      # A zero-length match does not advance the scanner, so handing it
      # back would make tokenize loop forever (e.g. when no special
      # tokens are registered).  Treat it as "nothing more to scan".
      token unless token.nil? || token.empty?
    end

    # Builds the alternation source for the special tokens.  Literal
    # tokens are ordered longest first so that a token is never shadowed
    # by another token that is its prefix.
    def compose_special_tokens_str(special_tokens)
      literals = special_tokens.sort_by { |token| -token.length }
                               .map { |token| Regexp.escape(token) }
      (literals + Array(@unescaped_special_tokens)).join('|')
    end

    # Returns a pair of regexps: one that matches a special token, and a
    # zero-width lookahead that locates where the next special token
    # begins.
    def compose_re(special_tokens)
      tokens_str = compose_special_tokens_str(special_tokens)
      [Regexp.compile(tokens_str), Regexp.compile("(?=#{tokens_str})")]
    end
  end
end
46
+
47
# Builds a tree of nodes from a stream of tokens.
#
# The stack keeps track of the node that is currently open
# (+current_node+).  Each pushed node remembers the node it was opened
# in through +node_below+, so closing a node simply steps back to it.
#
# NodeStack.setup must be called before NodeStack.new in order to
# register the class of the root node and the class of the node that is
# opened implicitly when a plain token arrives at the root.
class NodeStack
  attr_reader :current_node, :root

  class << self
    attr_reader :root_node_class, :default_node_class

    def setup(root_node_class, default_node_class)
      @root_node_class = root_node_class
      @default_node_class = default_node_class
    end
  end

  def initialize
    @root = self.class.root_node_class.new
    @default_node_class = self.class.default_node_class
    @current_node = @root
  end

  # Adds +node+ as a child of the current node and makes it current.
  def push_node(node)
    @current_node.push node
    node.node_below = @current_node
    @current_node = node
  end

  # Closes the current node and returns it; the node it was opened in
  # becomes current again.
  def pop
    closed = @current_node
    @current_node = closed.node_below
    closed.node_below = nil
    closed
  end

  # Appends +token+ to the current node without any interpretation.
  def push_token(token)
    @current_node.push token
  end

  # Interprets +token+ in the context of the current node.  The checks
  # are made in this order:
  #
  # 1. inside an EscapeNode the token is handled by push_escaped_token;
  # 2. the end tag of the current node closes it;
  # 3. the start tag of a registered subnode opens that subnode;
  # 4. a token the current node can ignore is dropped;
  # 5. anything else is stored as a plain token (a default node is
  #    opened first when the current node is the root).
  def push(token)
    if @current_node.kind_of? EscapeNode
      push_escaped_token(token)
    elsif @current_node.end_tag?(token)
      pop
    elsif (subnode_class = @current_node.subnode_class(token))
      push_node(subnode_class.new)
    elsif @current_node.can_ignore?(token)
      nil
    else
      push_node(@default_node_class.new) if @current_node == @root
      push_token(token)
    end
  end

  # Handles the token that follows an escape start tag.  When the token
  # begins with an escapable part, that part is stored in the escape
  # node and the remainder goes to the node the escape was opened in.
  # The escape node is closed in either case.
  def push_escaped_token(token)
    escaped = @current_node.part_to_be_escaped(token)
    remainder = nil
    if escaped
      # The remainder has to be computed before pop, because it is the
      # escape node that knows how to strip the escaped part.
      remainder = @current_node.remove_escaped_part(token)
      push_token(escaped)
    end
    pop
    push_token(remainder) if remainder
  end
end
108
+
109
# A node of the parse tree.
#
# The syntax of a node type (its start and end tags, the tokens it may
# ignore and the node types that can be nested in it) is stored on the
# class, per subclass, in class-level instance variables.  An instance
# only holds the tokens and subnodes collected so far.
class Node
  attr_accessor :node_below

  class << self
    attr_reader :start_tag, :end_tag, :subnode_classes
    attr_reader :start_tag_to_subnode, :tokens_to_be_ignored

    # Declares which node classes may be opened inside this one; each is
    # looked up later by its start tag.
    #
    # Note that setup resets this table, so call setup first.
    def register_subnode_classes(*subnode_classes)
      @subnode_classes = subnode_classes
      # Tolerate being called on a class that has not been set up yet.
      @start_tag_to_subnode ||= {}
      subnode_classes.each do |subnode|
        @start_tag_to_subnode[subnode.start_tag] = subnode
      end
    end

    # Defines the syntax of this node class.
    #
    # start_tag     - the token that opens a node of this class.
    # end_tag       - the token that closes it.
    # to_be_ignored - tokens that are dropped inside the node
    #                 (nil is accepted and means "none").
    def setup(start_tag, end_tag, to_be_ignored=[])
      @start_tag_to_subnode = {}
      @tokens_to_be_ignored = []
      @start_tag = start_tag
      @end_tag = end_tag
      @tokens_to_be_ignored.concat(to_be_ignored) if to_be_ignored
    end
  end

  attr_reader :subnodes

  def initialize
    @subnodes = []
    @self_class = self.class
    # Cached so that can_ignore? can skip the lookup for node classes
    # that ignore nothing.
    @cannot_ignore = @self_class.tokens_to_be_ignored.empty?
  end

  # Hands this node over to +visitor+ by calling visitor.visit.
  def accept(visitor, memo=nil)
    visitor.visit(self, memo)
  end

  # Returns the concatenation of everything pushed into the node.
  def to_s
    @subnodes.join
  end

  # Returns the node class registered for the start tag +token+,
  # or nil when there is none.
  def subnode_class(token)
    @self_class.start_tag_to_subnode[token]
  end

  def end_tag?(token)
    @self_class.end_tag == token
  end

  def can_ignore?(token)
    !@cannot_ignore && @self_class.tokens_to_be_ignored.include?(token)
  end

  # Appends a token or a subnode.
  def push(token)
    @subnodes.push token
  end
end
168
+
169
# A node opened by an escape character.  It knows which strings may be
# escaped and how to split the token that follows the escape character
# into the escaped part and the rest.
class EscapeNode < Node
  class << self
    attr_reader :to_be_escaped, :to_be_escaped_re

    # Same as Node.setup, with one more argument:
    #
    # to_be_escaped - the Strings that may follow the escape character.
    def setup(start_tag, end_tag, to_be_ignored=[], to_be_escaped=[])
      super(start_tag, end_tag, to_be_ignored)
      @to_be_escaped = to_be_escaped
      @to_be_escaped_re = compile_to_be_escaped_re(to_be_escaped)
    end

    # Builds a regexp that matches any of the escapable strings at the
    # very beginning of a token.
    def compile_to_be_escaped_re(to_be_escaped)
      alternatives = to_be_escaped.map { |e| Regexp.escape(e) }.join("|")
      /\A(?:#{alternatives})/
    end
  end

  # Returns +token+ without its leading escapable part.
  def remove_escaped_part(token)
    token.sub(@self_class.to_be_escaped_re, ''.freeze)
  end

  # Returns the first registered escapable string that +token+ starts
  # with, or nil when there is none.
  def part_to_be_escaped(token)
    @self_class.to_be_escaped.find { |e| token.start_with?(e) }
  end
end
196
+
197
# Helpers for declaring a family of node classes.  The methods rely on
# const_set, so the module is meant to be mixed into (extended by) the
# module or class that should own the generated constants.
module Helpers
  # Creates one Node subclass per entry and sets it up.
  #
  # class_name_and_setup_values - a Hash that maps a constant name to
  #                               the Array of arguments for Node.setup.
  def define_nodes(class_name_and_setup_values)
    class_name_and_setup_values.each do |name, setup_values|
      const_set(name, Class.new(Node)).setup(*setup_values)
    end
  end

  # Registers the allowed nesting of node classes.
  #
  # parent_children - a Hash that maps a node class to the Array of
  #                   node classes that may be opened inside it.
  def define_node_nesting(parent_children={})
    parent_children.each do |parent, children|
      parent.register_subnode_classes(*children)
    end
  end
end
211
+ end
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'log_line_parser'
4
+ require 'log_line_parser/utils'
5
+
6
# MoeLogParser was added for the personal needs of the original author,
# and the log format it handles is not a widely used one.
# You may remove this file if you don't need it.
# (MOE is the acronym of the name of the organization for which the
# author was working at the time of the first release of this program.)
11
+
12
module LogLineParser
  # CombinedLogFormat + "%D"
  MoeLogFormat = "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %D"

  # The parser generated from MoeLogFormat.
  MoeLogParser = parser(MoeLogFormat)

  # Makes the parser available under the format name 'moe'.
  PREDEFINED_FORMATS['moe'] = MoeLogParser
end
18
+
@@ -0,0 +1,290 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ module LogLineParser
4
# Predicates on parsed log records.
#
# A record is any object that answers the readers used below:
# +user_agent+, +resource+, +referer_resource+, +referer_host+,
# +last_request_status+ and +method+.
#
# The class offers a few stateless predicates as class methods, and
# instances that are bound to a host name and a list of resources.
# Query.register_query_to_log turns a configuration Hash into a proc
# that prints a log line when the configured predicates match.
class Query
  class NotAllowableMethodError < StandardError; end

  module HttpMethods
    OPTIONS = "OPTIONS"
    GET = "GET"
    HEAD = "HEAD"
    POST = "POST"
    PUT = "PUT"
    DELETE = "DELETE"
    TRACE = "TRACE"
    CONNECT = "CONNECT"
    PATCH = "PATCH"
  end

  TAIL_SLASH_RE = /\/$/
  SLASH = '/'
  DEFAULT_BOTS = %w(
    Googlebot
    Googlebot-Mobile
    Mediapartners-Google
    Bingbot
    Slurp
    Baiduspider
    BaiduImagespider
    BaiduMobaider
    YetiBot
  )

  # The only method names that a configuration may list under "match"
  # and "ignore_match".
  ALLOWABLE_METHODS = [
    :access_by_bots?,
    :referred_from_resources?,
    :referred_from_under_resources?,
    :access_to_resources?,
    :access_to_under_resources?,
    :status_code_206?,
    :status_code_301?,
    :status_code_304?,
    :status_code_404?,
    :partial_content?,
    :moved_permanently?,
    :not_modified?,
    :not_found?,
    :options_method?,
    :get_method?,
    :head_method?,
    :post_method?,
    :put_method?,
    :delete_method?,
    :trace_method?,
    :connect_method?,
    :patch_method?,
  ]

  # Keys of the configuration Hash given to register_query_to_log.
  module ConfigFields
    HOST_NAME = "host_name"
    RESOURCES = "resources"
    MATCH = "match"
    IGNORE_MATCH = "ignore_match"
    OUTPUT_LOG_NAME = "output_log_name"
    MATCH_TYPE = "match_type" # The value should be "all" or "any".
  end

  # Returns a case-insensitive regexp that matches any of +bot_names+
  # literally.
  def self.compile_bots_re(bot_names=DEFAULT_BOTS)
    bots_str = bot_names.map { |name| Regexp.escape(name) }.join("|")
    Regexp.compile(bots_str, Regexp::IGNORECASE)
  end

  DEFAULT_BOTS_RE = compile_bots_re

  # Returns the position of the match (truthy) when the user agent of
  # +record+ matches +bots_re+, nil otherwise.
  def self.access_by_bots?(record, bots_re=DEFAULT_BOTS_RE)
    bots_re =~ record.user_agent
  end

  def self.referred_from_resources?(record, resources=[])
    resources.include?(record.referer_resource)
  end

  def self.referred_from_under?(record, path)
    record.referer_resource.start_with?(path)
  end

  def self.access_to_resources?(record, resources=[])
    resources.include?(record.resource)
  end

  def self.access_to_under?(record, path)
    record.resource.start_with?(path)
  end

  class << self
    # Builds a proc from the configuration Hash +option+.
    #
    # option - a Hash with the keys listed in ConfigFields.
    # logs   - a Hash that maps an output log name to an object that
    #          responds to +print+.
    #
    # Returns a proc that takes (line, record) and prints +line+ to the
    # configured log when the record satisfies the "match" queries (all
    # of them when "match_type" is "all", any of them otherwise) and
    # none of the "ignore_match" queries.
    #
    # Raises NotAllowableMethodError when a query is not listed in
    # ALLOWABLE_METHODS.
    def register_query_to_log(option, logs)
      query = Query.new(domain: option[ConfigFields::HOST_NAME],
                        resources: option[ConfigFields::RESOURCES])
      queries = option[ConfigFields::MATCH]
      reject_unacceptable_queries(queries)
      log = logs[option[ConfigFields::OUTPUT_LOG_NAME]]
      match_type = option[ConfigFields::MATCH_TYPE]
      ignore_match = option[ConfigFields::IGNORE_MATCH]
      reject_unacceptable_queries(ignore_match) if ignore_match
      compile_query(match_type, log, query, queries, ignore_match)
    end

    private

    def reject_unacceptable_queries(queries)
      unacceptable_queries = queries - ALLOWABLE_METHODS
      return if unacceptable_queries.empty?

      raise NotAllowableMethodError,
            error_message_for_unacceptable_queries(unacceptable_queries)
    end

    def error_message_for_unacceptable_queries(unacceptable_queries)
      query_names = unacceptable_queries.join(", ")
      if unacceptable_queries.length == 1
        "An unacceptable query is set: #{query_names}"
      else
        "Unacceptable queries are set: #{query_names}"
      end
    end

    # The query names have been checked against ALLOWABLE_METHODS by
    # the time the procs below run, and all of those methods are
    # public, so public_send is sufficient.

    def log_if_all_match(log, query, queries)
      proc do |line, record|
        if queries.all? { |method| query.public_send(method, record) }
          log.print line
        end
      end
    end

    def log_if_any_match(log, query, queries)
      proc do |line, record|
        if queries.any? { |method| query.public_send(method, record) }
          log.print line
        end
      end
    end

    def log_if_all_match_but(log, query, queries, ignore_match)
      proc do |line, record|
        if queries.all? { |method| query.public_send(method, record) } &&
           ignore_match.none? { |method| query.public_send(method, record) }
          log.print line
        end
      end
    end

    def log_if_any_match_but(log, query, queries, ignore_match)
      proc do |line, record|
        if queries.any? { |method| query.public_send(method, record) } &&
           ignore_match.none? { |method| query.public_send(method, record) }
          log.print line
        end
      end
    end

    # Picks the proc that corresponds to +match_type+ ("all", or
    # anything else for "any") and to the presence of +ignore_match+.
    def compile_query(match_type, log, query, queries, ignore_match)
      if match_type == "all".freeze
        if ignore_match
          log_if_all_match_but(log, query, queries, ignore_match)
        else
          log_if_all_match(log, query, queries)
        end
      elsif ignore_match
        log_if_any_match_but(log, query, queries, ignore_match)
      else
        log_if_any_match(log, query, queries)
      end
    end
  end

  # domain    - the host name that referers have to come from; nil
  #             disables the check.
  # resources - the Array of paths the query is about.  A path that ends
  #             with a slash denotes a directory.  nil is treated as an
  #             empty list.
  def initialize(domain: nil, resources: [])
    @domain = domain
    # Keep the list as given: it is used for prefix matching, where the
    # trailing slash of a directory matters ('/dir/' must not match
    # '/directory/index.html').
    @resources = Array(resources)
    # The given paths plus, for each directory, the same path without
    # its trailing slash; used for exact matching.
    @normalized_resources = normalize_resources(@resources)
    # Only the slash-less variants of the directories.
    @normalized_dirs = @normalized_resources - @resources
  end

  def access_by_bots?(record, bots_re=DEFAULT_BOTS_RE)
    bots_re =~ record.user_agent
  end

  # True when the referer is one of the resources (and belongs to the
  # configured domain, if any).
  def referred_from_resources?(record)
    if_matching_domain(record) &&
      @normalized_resources.include?(record.referer_resource)
  end

  # True when the referer is one of the resources or lies under one of
  # them (and belongs to the configured domain, if any).
  def referred_from_under_resources?(record)
    referer_resource = record.referer_resource
    # The domain check has to guard both alternatives.
    if_matching_domain(record) &&
      (@normalized_dirs.include?(referer_resource) ||
       @resources.any? { |target| referer_resource.start_with?(target) })
  end

  def access_to_resources?(record)
    @normalized_resources.include?(record.resource)
  end

  def access_to_under_resources?(record)
    resource = record.resource
    @normalized_dirs.include?(resource) ||
      @resources.any? { |target| resource.start_with?(target) }
  end

  def status_code_206?(record)
    record.last_request_status == 206
  end

  def status_code_301?(record)
    record.last_request_status == 301
  end

  def status_code_304?(record)
    record.last_request_status == 304
  end

  def status_code_404?(record)
    record.last_request_status == 404
  end

  alias :partial_content? :status_code_206?
  alias :moved_permanently? :status_code_301?
  alias :not_modified? :status_code_304?
  alias :not_found? :status_code_404?

  def options_method?(record)
    record.method == HttpMethods::OPTIONS
  end

  def get_method?(record)
    record.method == HttpMethods::GET
  end

  def head_method?(record)
    record.method == HttpMethods::HEAD
  end

  def post_method?(record)
    record.method == HttpMethods::POST
  end

  def put_method?(record)
    record.method == HttpMethods::PUT
  end

  def delete_method?(record)
    record.method == HttpMethods::DELETE
  end

  def trace_method?(record)
    record.method == HttpMethods::TRACE
  end

  def connect_method?(record)
    record.method == HttpMethods::CONNECT
  end

  def patch_method?(record)
    record.method == HttpMethods::PATCH
  end

  private

  def if_matching_domain(record)
    # When @domain is not set, it should be ignored.
    !@domain || @domain == record.referer_host
  end

  # Returns +resources+ with, in front of every path that ends with a
  # slash, the same path without that slash.
  def normalize_resources(resources)
    resources.each_with_object([]) do |resource, normalized|
      # record.referer_resource is expected to return '/'
      # even when the value of record.referer doesn't end
      # with a slash (e.g. 'http://www.example.org').
      # So in the normalized result, you don't have to include
      # an empty string that corresponds to the root of a given
      # domain.
      if TAIL_SLASH_RE =~ resource && SLASH != resource
        normalized.push resource.sub(TAIL_SLASH_RE, "".freeze)
      end

      normalized.push resource
    end
  end
end
290
+ end
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'log_line_parser'
4
+ require 'log_line_parser/query'
5
+ require 'csv'
6
+
7
+ module LogLineParser
8
# Utilities for converting log lines and for writing to several log
# files at once.
module Utils
  TAB = "\t"

  # Characters that have to be escaped in TSV output, mapped to their
  # escaped representation.  The key for the backslash is a single
  # backslash, i.e. exactly the text the regexp below matches.
  SPECIAL_CHARS = {
    "\t" => '\\t',
    "\n" => '\\n',
    "\r" => '\\r',
    "\\" => '\\\\',
  }
  # Regexp.union escapes the keys, so each one is matched literally.
  SPECIAL_CHARS_RE = Regexp.union(SPECIAL_CHARS.keys)

  def self.access_by_bots?(record, bots_re=Query::DEFAULT_BOTS_RE)
    Query.access_by_bots?(record, bots_re)
  end

  # Opens one file per base name for writing, yields them, and closes
  # all of them when the block returns or raises.
  #
  # base_names - the file names without extension.
  # dir        - the directory for the files (nil means a path relative
  #              to the current directory).
  # ext        - the extension appended to each base name.
  #
  # Yields a Hash that maps each base name to its open File.
  def self.open_multiple_output_files(base_names, dir=nil, ext="log")
    logs = {}
    base_names.each do |base|
      file_name = "#{base}.#{ext}"
      # File.open, unlike Kernel#open, never interprets the name as a
      # command to run.
      logs[base] = File.open(dir ? File.join(dir, file_name) : file_name, "w")
    end
    yield logs
  ensure
    # Also reached when one of the files could not be opened; the ones
    # opened before it are closed here.
    logs.each_value(&:close)
  end

  # Converts a log line into tab separated values.  Tabs, newlines,
  # carriage returns and backslashes in the fields are escaped unless
  # +escape+ is false.
  def self.to_tsv(line, escape=true)
    LogLineParser.parse(line).to_a.map do |field|
      escape ? escape_special_chars(field) : field
    end.join(TAB)
  end

  # Converts a log line into a CSV row (relies on Array#to_csv from the
  # 'csv' library).
  def self.to_csv(line)
    LogLineParser.parse(line).to_a.to_csv
  end

  # Returns +field+ with every special character replaced by its
  # escaped representation.
  #
  # This is a helper of to_tsv.  It has always been callable from
  # outside (a bare `private` does not apply to `def self.` methods),
  # so it is kept public for compatibility.
  def self.escape_special_chars(field)
    field.gsub(SPECIAL_CHARS_RE) { |char| SPECIAL_CHARS[char] }
  end
end
+ end
@@ -0,0 +1,3 @@
1
module LogLineParser
  # The version of the log_line_parser gem.
  VERSION = "0.1.0"
end