apachecrunch 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +1 -0
- data/bin/apachecrunch +70 -0
- data/lib/apachecrunch.rb +316 -0
- data/lib/log_element.rb +297 -0
- data/lib/procedure_dsl.rb +308 -0
- data/lib/progress.rb +65 -0
- metadata +74 -0
data/LICENSE
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
http://creativecommons.org/licenses/by-sa/3.0/
|
data/bin/apachecrunch
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# For development while inside the apachecrunch dir:
|
4
|
+
$: << ".."
|
5
|
+
$: << "./lib"
|
6
|
+
require "rubygems"
|
7
|
+
|
8
|
+
require "apachecrunch"
|
9
|
+
require "progress"
|
10
|
+
require "procedure_dsl"
|
11
|
+
|
12
|
+
|
13
|
+
# Prints the usage message and exits with the given exit code.
#
# The message is written to standard output when exit_code is 0 (the user
# asked for help) and to standard error otherwise, so that a usage error is
# not mixed into the output of a procedure.
#
# exit_code: the process exit status to terminate with
def barf_usage(exit_code)
  stream = exit_code == 0 ? $stdout : $stderr
  stream.puts %q!USAGE:
    apachecrunch <PROCEDURE> <LOG>
                 [--format <FORMAT NAME>] [--progress <METER TYPE>] [--help]

    --format:   The name of the log format to parse ("ncsa" by default).
    --progress: Gives you a progress meter as the log file is parsed. METER TYPE can be "entry",
                which prints out how many entries have been parsed so far, or "time", which prints
                out the time of the last entry parsed.
    --help:     Prints this message.!
  exit exit_code
end
|
24
|
+
|
25
|
+
|
26
|
+
# Parses the command-line arguments in ARGV.
#
# Options may be given either as "--format NAME" or as "--format=NAME" (the
# same goes for --progress). "--help" prints the usage message and exits 0.
# Exactly two positional arguments are required; anything else is a usage
# error (exit status 1), as is an option that is missing its value.
#
# Returns a hash with these keys (as symbols):
#   procedure: The path to the procedure DSL file
#   logfile: The path to the log file
#   format: The name of the log format specified ("ncsa" by default)
#   progress: The requested progress meter type (nil by default)
def parse_args
  args = ARGV.clone

  # Defaults
  options = { :format => "ncsa", :progress => nil }
  positionals = []

  until args.empty?
    arg = args.shift
    case arg
    when "--help"
      barf_usage(0)
    when /\A--(format|progress)(?:=(.*))?\z/m
      option_name = $1.to_sym
      # Value is either attached with "=" or is the next argument.
      value = $2 || args.shift
      barf_usage(1) if value.nil?
      options[option_name] = value
    else
      positionals << arg
    end
  end

  # A surplus positional used to overwrite the log file silently; treat it as
  # the mistake it almost certainly is.
  barf_usage(1) unless positionals.length == 2

  options[:procedure], options[:logfile] = positionals
  options
end
|
59
|
+
|
60
|
+
|
61
|
+
# Entry point: parse the command line, build a log parser for the requested
# format, and evaluate the procedure file against it.
options = parse_args

format_string = FormatStringFinder.new.find(options[:format])
progress_meter = ProgressMeterFactory.from_options(options)

# Arguments are positional: (format_string, path, progress_meter).
log_parser = LogParserFactory.log_parser(format_string, options[:logfile], progress_meter)

proc_env = ProcedureEnvironment.new(log_parser)

# Block form so the procedure file is closed as soon as it has been read.
procedure_source = open(options[:procedure]) { |procedure_file| procedure_file.read }
proc_env.eval_procedure(procedure_source)
|
data/lib/apachecrunch.rb
ADDED
@@ -0,0 +1,316 @@
|
|
1
|
+
require "date"
|
2
|
+
require "tempfile"
|
3
|
+
|
4
|
+
require 'log_element'
|
5
|
+
|
6
|
+
|
7
|
+
# A parsed entry from the log.
#
# Behaves like a hash: log elements (e.g. :url_path, :remote_host) are read
# with entry[name] and stored with entry[name] = value.
#
# Elements that were not stored directly may be derived on demand: the
# derivation map given to the constructor maps an element name to the class
# that knows how to derive it from one of the stored values.
class LogEntry
  # derivation_map: hash of element name => class responding to `name` and
  #                 `derive(name, value)`
  def initialize(derivation_map)
    @_derivation_map = derivation_map
    @_attributes = {}
  end

  # Stores a value under the given element name.
  def []=(name, value)
    @_attributes[name] = value
  end

  # Returns the stored value for the name if there is one; otherwise asks the
  # class registered in the derivation map to derive it. Returns nil when the
  # name is neither stored nor derivable.
  def [](name)
    if @_attributes.key?(name)
      @_attributes[name]
    else
      source_cls = @_derivation_map[name]
      source_cls.nil? ? nil : source_cls.derive(name, @_attributes[source_cls.name])
    end
  end

  # Stores every name => value pair of the given hash.
  def merge!(hsh)
    @_attributes.merge!(hsh)
  end
end
|
34
|
+
|
35
|
+
|
36
|
+
# A bare (literal) string in a log format.
#
# Exposes 'regex' for consistency with LogFormatElement, but it is only meant
# to match its characters one-to-one.
class LogFormatString
  attr_reader :regex
  attr_writer :regex

  # regex: the regex source (a string) matching this literal text
  def initialize(regex)
    self.regex = regex
  end
end
|
47
|
+
|
48
|
+
|
49
|
+
# Represents a particular Apache log format as an ordered list of tokens.
#
# A token is anything that responds to `regex`; tokens that also respond to
# `name` are treated as elements (captured values), the rest as literal text.
class LogFormat
  attr_accessor :format_string
  attr_reader :tokens

  def initialize
    @tokens = []
    @_regex = nil
  end

  # Replaces the token list, discarding the cached regex so it is rebuilt.
  def tokens=(tokens)
    @_regex = nil
    @tokens = tokens
  end

  # Appends a given token (a LogFormatElement or LogFormatString) to the tokens list
  #
  # The cached regex is discarded so that it reflects the new token.
  def append(token)
    @_regex = nil
    @tokens << token
  end

  # Returns a compiled regex to match a log line in this format
  #
  # The regex is built once and cached. NOTE: mutating the array returned by
  # `tokens` directly bypasses the cache invalidation; use `append`.
  def regex
    @_regex ||= Regexp.compile("^" + _regex_body + "$")
  end

  # Returns the list of LogFormatElements, in order, of the interpolated things in the format.
  #
  # For example, if the log format string were "%h %u %{Referer}i", this would return the
  # LogFormatElement instances for "%h", "%u", and "%{Referer}i".
  def elements
    @tokens.select { |tok| tok.respond_to?(:name) }
  end

  # Returns hash mapping names of elements to the element class from which they can be derived.
  def derivation_map
    hsh = {}
    elements.each do |tok|
      tok.derived_elements.each do |derived_element|
        hsh[derived_element.name] = tok.class
      end
    end

    hsh
  end

  # Concatenates the token regexes. Only elements are wrapped in a capture
  # group, because only their values need to be remembered.
  def _regex_body
    @tokens.map do |tok|
      tok.respond_to?(:name) ? "(" + tok.regex + ")" : tok.regex
    end.join
  end
end
|
105
|
+
|
106
|
+
|
107
|
+
# Turns a string specifying an Apache log format into a LogFormat instance
class LogFormatFactory
  def initialize
    @element_factory = LogFormatElementFactory.new
  end

  # Constructs and returns a LogFormat instance based on the given Apache log format string
  def from_format_string(f_string)
    logformat = LogFormat.new
    logformat.format_string = f_string

    remaining = f_string
    until remaining.empty?
      token, remaining = _shift_token(remaining)
      logformat.append(token)
    end

    logformat
  end

  # Finds the first token (a LogFormatElement or LogFormatString) in a format string
  #
  # Returns a list containing the token and the new format string (with the characters that
  # correspond to the token removed)
  #
  # The patterns are anchored with \A and use /m so that a format string
  # containing a newline is still consumed completely.
  def _shift_token(f_string)
    case f_string
    when /\A%%(.*)/m
      # "%%" in the format stands for a single literal "%" in the log line.
      [LogFormatString.new(_literal_regex("%")), $1]
    when /\A(%[A-Za-z])(.*)/m
      # Simple element (e.g. "%h", "%u")
      [@element_factory.from_abbrev($1), $2]
    when /\A(%\{.+?\}[Ceinor])(.*)/m
      # "Contents of" element (e.g. "%{Accept}i")
      [@element_factory.from_abbrev($1), $2]
    when /\A(.+?)(%.*|\z)/m
      # Bare string up until the next %, or up until the end of the format string
      literal, rest = $1, $2
      [LogFormatString.new(_literal_regex(literal)), rest]
    end
  end

  # Returns regex source that matches the given literal format text exactly.
  #
  # Backslash escapes are resolved first (\n, \t, and \X => X, so \" still
  # means a double quote), then every regex metacharacter is escaped so that
  # characters such as "[", "(" or "." match themselves.
  def _literal_regex(text)
    unescaped = text.gsub(/\\(.)/m) do
      case $1
      when "n" then "\n"
      when "t" then "\t"
      else $1
      end
    end

    Regexp.escape(unescaped)
  end
end
|
146
|
+
|
147
|
+
|
148
|
+
# Makes log entries based on log file text
class LogLineParser
  # Initializes the instance given a LogFormat instance and a progress meter.
  #
  # progress_meter may be nil, in which case no progress is reported.
  def initialize(log_format, progress_meter)
    @log_format = log_format
    @progress_meter = progress_meter

    @_elements = log_format.elements
    @_derivation_map = log_format.derivation_map
  end

  # Returns a LogEntry built from a line of text, or nil if the line was malformatted
  # (in which case a warning is written).
  #
  # The keys of the entry are names of LogFormatElements (e.g. "remote_host", "reqheader_referer"),
  # plus :text, which holds the unparsed line.
  def from_text(log_text)
    match = @log_format.regex.match(log_text)
    if match.nil?
      warn "Log line did not match expected format: #{log_text}"
      return nil
    end

    # Pair each element with the capture group it produced, in order.
    element_values = Hash[@_elements.zip(match.captures)]

    # Start building the return value
    entry = LogEntry.new(@_derivation_map)
    entry[:text] = log_text
    # Insert all the elements specified in the LogFormat
    entry.merge!(_elements_to_hash(element_values))

    @progress_meter.output_progress(entry) unless @progress_meter.nil?
    entry
  end

  # Returns a hash of "element name" => value pairs based on a hash of element => value pairs.
  def _elements_to_hash(element_values)
    hsh = {}
    element_values.each_pair do |element, value|
      hsh[element.name] = value
    end

    hsh
  end

  # Returns hash of derived "element name" => value pairs from a hash of element => value pairs.
  #
  # That is, we go through the elements passed and if any offers derived elements, we include
  # those in the return value.
  #
  # NOTE(review): nothing in this class calls this method, and it relies on each element
  # responding to `derived_values` -- confirm that such a method exists before using it.
  def _derived_elements(element_values)
    hsh = {}
    element_values.each_pair do |element, value|
      hsh.merge!(element.derived_values(value))
    end

    hsh
  end
end
|
207
|
+
|
208
|
+
|
209
|
+
# Parses a log file given a path and a LogLineParser
class LogParser
  # Initializes the parser with the path to a log file and a LogLineParser.
  def initialize(path, ll_parser)
    @path = path
    @ll_parser = ll_parser

    @_file = nil
  end

  # Returns the next entry in the log file, or nil if we've reached EOF.
  #
  # The file is opened lazily on the first call. Lines the line parser cannot
  # parse (it returns nil for those) are skipped.
  def next_entry
    @_file = open(@path) if @_file.nil?

    while line_text = @_file.gets
      entry = @ll_parser.from_text(line_text)
      return entry unless entry.nil?
    end

    nil
  end

  # Resets the LogParser's filehandle so we can start over.
  #
  # The current handle, if any, is closed rather than just dropped.
  def reset
    _close_file
  end

  # Makes the LogParser close its current log file and start parsing a new one instead
  #
  # `new_target` is a writable file object that the parser should start parsing, and if
  # in_place is true, we actually replace the contents of the current target with those
  # of the new target.
  def replace_target(new_target, in_place)
    new_target.close
    _close_file

    if in_place
      # Use @path rather than the open handle's path: the handle may not
      # exist (nothing read yet, or reset was called).
      _move_file(new_target.path, @path)
    else
      @path = new_target.path
    end
  end

  # Closes the current log file handle, if one is open, and forgets it.
  def _close_file
    @_file.close unless @_file.nil? || @_file.closed?
    @_file = nil
  end

  # Moves a file, falling back to copy-and-delete when source and destination
  # are on different filesystems (where rename is not possible).
  def _move_file(source, destination)
    File.rename(source, destination)
  rescue Errno::EXDEV
    File.open(source, "rb") do |input|
      File.open(destination, "wb") do |output|
        while chunk = input.read(65536)
          output.write(chunk)
        end
      end
    end
    File.unlink(source)
  end
end
|
260
|
+
|
261
|
+
# Makes a LogParser given the parameters we want to work with.
#
# This is the class that most external code should instantiate to begin using this library.
class LogParserFactory
  # Returns a new LogParser instance for the given log file, which should have the given Apache
  # log format.
  #
  # format_string:  the Apache log format string describing the file
  # path:           path to the log file
  # progress_meter: handed through to the LogLineParser
  def self.log_parser(format_string, path, progress_meter)
    # Format string -> LogFormat -> line parser -> file parser.
    log_format = LogFormatFactory.new.from_format_string(format_string)
    LogParser.new(path, LogLineParser.new(log_format, progress_meter))
  end
end
|
279
|
+
|
280
|
+
|
281
|
+
# Finds a named log format string in the built-in defaults or the configuration file(s)
class FormatStringFinder
  @@FILE_NAME = "log_formats.rb"
  @@DEFAULT_FORMATS = {
    :ncsa => %q!%h %l %u %t \"%r\" %s %b \"%{Referer}i\" \"%{User-agent}i\"!,
    :ubuntu => %q!%h %l %u %t \"%r\" %s %O \"%{Referer}i\" \"%{User-Agent}i\"!
  }

  # Finds the given format string in the configuration file(s)
  #
  # Each directory of the search path is visited in order; a readable
  # log_formats.rb there is evaluated, and as soon as the requested name is
  # known (from the defaults or from a config file read so far) its format
  # string is returned, with \" sequences turned into plain double quotes.
  #
  # Raises a RuntimeError if no directory yields the format.
  def find(format_name)
    name_as_symbol = format_name.to_sym
    search_path = _search_path

    # The config file is evaluated in this method's binding.
    # NOTE(review): presumably config files add entries to the local `formats`
    # hash, so its name must not change -- confirm against a sample config.
    formats = @@DEFAULT_FORMATS.clone
    search_path.each do |dir|
      config_path = File.join(dir, @@FILE_NAME)
      if File.readable?(config_path)
        # SECURITY: this runs arbitrary Ruby from the config file, including
        # one found in the current directory. Only run in trusted directories.
        eval File.read(config_path)
      end

      if formats.key?(name_as_symbol)
        return formats[name_as_symbol].gsub(/\\"/, '"')
      end
    end

    raise "Failed to find the format '#{format_name}' in the search path: #{search_path.inspect}"
  end

  # Returns the directories searched for log_formats.rb, in priority order.
  #
  # The per-user directory is left out when HOME is not set.
  def _search_path
    dirs = [".", "./etc"]
    home = ENV["HOME"]
    dirs << File.join(home, ".apachecrunch") unless home.nil?
    dirs << "/etc/apachecrunch"
    dirs
  end
end
|
data/lib/log_element.rb
ADDED
@@ -0,0 +1,297 @@
|
|
1
|
+
# Converts a string to an integer (via String#to_i).
class IntegerCast
  class << self
    # Returns the integer value of the given string.
    def cast(string_value)
      string_value.to_i
    end
  end
end
|
7
|
+
|
8
|
+
|
9
|
+
# Converts a CLF-formatted string to an integer
#
# "CLF-formatted" means that if the value is 0, the string will be a single hyphen instead of
# a number. Like %b, for instance.
class CLFIntegerCast
  # Returns 0 for "-", otherwise the integer value of the string.
  def self.cast(string_value)
    string_value == "-" ? 0 : string_value.to_i
  end
end
|
21
|
+
|
22
|
+
|
23
|
+
# An element in a log format. Abstract base from which all elements inherit.
#
# Exposes:
#   abbrev: The Apache abbreviation for the element (such as "%h" or "%u" or "%{Referer}i")
#   name: A short name for the element (such as "remote_host", "remote_user", or "reqhead_referer")
#   regex: A regex that should match such an element ("[A-Za-z0-9.-]+", "[^:]+", ".+")
#
# If the class-level '_caster' is not nil, it should be a class with a method called "cast"
# that transforms a string to the appropriate data type or format for consumption.
# For example, the IntegerCast class transforms "562" to 562. The correct cast
# of a string can then be performed by passing that string to this LogFormatElement
# instance's "cast" method.
#
# 'derived_elements' lists elements that can be derived from the instance's value, and the
# class method 'derive' computes them. See ReqFirstlineElement for an example.
class LogFormatElement
  @_caster = nil

  attr_accessor :abbrev, :name, :regex

  # Class-level attributes. abbrev, name and regex hold the _defaults_ that an
  # instance copies into @abbrev, @name and @regex when it is initialized;
  # _caster is read by instances when casting.
  class << self
    attr_accessor :abbrev, :name, :regex, :_caster
  end

  def initialize
    defaults = self.class
    @abbrev = defaults.abbrev
    @name = defaults.name
    @regex = defaults.regex
  end

  # Casts a string found in the log to the correct type, using the class's _caster attribute.
  #
  # Returns the string unchanged when the class has no caster.
  def cast(string_value)
    caster = self.class._caster
    return string_value if caster.nil?

    caster.cast(string_value)
  end

  # Derives the named element (e.g. "url_path") from a given value for this one.
  #
  # See ReqFirstlineElement for an example.
  def self.derive(name, our_own_value)
    raise NotImplementedError
  end

  # Returns a list of the element classes that can be derived from this one.
  #
  # See ReqFirstlineElement for an example.
  def derived_elements
    []
  end
end
|
78
|
+
|
79
|
+
|
80
|
+
# The "%h" element (remote host), exposed as :remote_host.
#
# The pattern accepts host names and IPv4 addresses, and also ":" so that
# IPv6 addresses (e.g. "::1" or "2001:db8::1") are matched.
class RemoteHostElement < LogFormatElement
  @abbrev = "%h"
  @name = :remote_host
  @regex = %q![A-Za-z0-9.:-]+!
end
|
85
|
+
|
86
|
+
|
87
|
+
# The "%l" element, exposed as :log_name. Matches any run of non-whitespace.
class LogNameElement < LogFormatElement
  @regex  = '\S+'
  @name   = :log_name
  @abbrev = "%l"
end
|
92
|
+
|
93
|
+
|
94
|
+
# The "%u" element, exposed as :remote_user. Matches anything up to a colon.
class RemoteUserElement < LogFormatElement
  @regex  = '[^:]+'
  @name   = :remote_user
  @abbrev = "%u"
end
|
99
|
+
|
100
|
+
|
101
|
+
# The "%t" element, exposed as :time.
#
# Matches a bracketed timestamp such as "[10/Oct/2000:13:55:36 -0700]".
class TimeElement < LogFormatElement
  @regex  = '\[\d\d/[A-Za-z]{3}/\d\d\d\d:\d\d:\d\d:\d\d [-+]\d\d\d\d\]'
  @name   = :time
  @abbrev = "%t"
end
|
106
|
+
|
107
|
+
|
108
|
+
# The "%r" element (first line of the request), exposed as :req_firstline.
#
# The request method, URL path, query string and protocol can be derived
# from its value.
class ReqFirstlineElement < LogFormatElement
  @abbrev = "%r"
  @name = :req_firstline
  @regex = %q![^"]+!

  # Compiled lazily by derive, once, from the regexes of the derived elements.
  @_derivation_regex = nil

  # Derives the named element (:req_method, :url_path, :query_string or
  # :protocol) from a request first line.
  #
  # Returns nil if the name is not one of those, or if the first line does
  # not have the shape "<method> <path>[?query] <protocol>".
  def self.derive(name, our_own_value)
    # A regex literal is used so that \s really means "whitespace"; inside a
    # double-quoted string "\s" is just a space character.
    @_derivation_regex ||= /^(#{ReqMethodElement.regex})\s+(#{UrlPathElement.regex})(#{QueryStringElement.regex})\s+(#{ProtocolElement.regex})$/

    return nil unless our_own_value =~ @_derivation_regex

    derived = {
      ReqMethodElement.name => $1,
      UrlPathElement.name => $2,
      QueryStringElement.name => $3,
      ProtocolElement.name => $4
    }

    derived[name]
  end

  # Returns the element classes whose values can be derived from this one.
  def derived_elements
    return [ReqMethodElement, UrlPathElement, QueryStringElement, ProtocolElement]
  end
end
|
135
|
+
|
136
|
+
|
137
|
+
# The "%s" element, exposed as :status. Matches digits or a single hyphen.
class StatusElement < LogFormatElement
  @regex  = '\d+|-'
  @name   = :status
  @abbrev = "%s"
end
|
142
|
+
|
143
|
+
|
144
|
+
# The "%b" element (bytes sent, CLF style), exposed as :bytes_sent.
#
# The value is either a number or "-" (meaning zero), so the pattern allows
# hyphens and the CLF caster turns "-" into 0.
#
# This class used to be defined twice in a row; the second definition
# silently overrode the first, so only its (effective) values are kept.
class BytesSentElement < LogFormatElement
  @abbrev = "%b"
  @name = :bytes_sent
  @regex = %q![\d-]+!

  # Must be the class-level instance variable: that is what
  # LogFormatElement#cast reads through the `_caster` accessor.
  @_caster = CLFIntegerCast
end
|
160
|
+
|
161
|
+
|
162
|
+
# The "%O" element, exposed as :bytes_sent_with_headers. Values are cast to integers.
class BytesSentWithHeadersElement < LogFormatElement
  @abbrev = "%O"
  @name = :bytes_sent_with_headers
  @regex = %q!\d+!

  # Must be the class-level instance variable: that is what
  # LogFormatElement#cast reads through the `_caster` accessor.
  @_caster = IntegerCast
end
|
169
|
+
|
170
|
+
|
171
|
+
# The "%D" element, exposed as :serve_time_micro. Values are cast to integers.
class ServeTimeMicroElement < LogFormatElement
  @abbrev = "%D"
  @name = :serve_time_micro
  @regex = %q!\d+!

  # Must be the class-level instance variable: that is what
  # LogFormatElement#cast reads through the `_caster` accessor.
  @_caster = IntegerCast
end
|
178
|
+
|
179
|
+
|
180
|
+
# The "%U" element, exposed as :url_path.
#
# Matches a slash followed by anything that is not a question mark.
class UrlPathElement < LogFormatElement
  @regex  = '/[^?]*'
  @name   = :url_path
  @abbrev = "%U"
end
|
185
|
+
|
186
|
+
|
187
|
+
# The "%q" element, exposed as :query_string.
#
# Matches an optional question mark followed by any non-whitespace.
class QueryStringElement < LogFormatElement
  @regex  = '\??\S*'
  @name   = :query_string
  @abbrev = "%q"
end
|
192
|
+
|
193
|
+
|
194
|
+
# The "%m" element, exposed as :req_method. Matches a run of capital letters.
class ReqMethodElement < LogFormatElement
  @regex  = '[A-Z]+'
  @name   = :req_method
  @abbrev = "%m"
end
|
199
|
+
|
200
|
+
|
201
|
+
# The "%H" element, exposed as :protocol. Matches any run of non-whitespace.
class ProtocolElement < LogFormatElement
  @regex  = '\S+'
  @name   = :protocol
  @abbrev = "%H"
end
|
206
|
+
|
207
|
+
|
208
|
+
# An element holding the contents of an HTTP request header.
#
# It has no class-level defaults; its abbrev, name and regex are assigned per
# instance by whoever creates it.
class ReqheaderElement < LogFormatElement; end
|
210
|
+
|
211
|
+
|
212
|
+
# An element matched by an arbitrary regex.
#
# It has no class-level defaults; its abbrev, name and regex are assigned per
# instance by whoever creates it.
class RegexElement < LogFormatElement; end
|
214
|
+
|
215
|
+
|
216
|
+
# Finds log format elements given information about them.
class ElementDictionary
  # Every element class that has a fixed abbreviation. Each class is listed once.
  @@_ELEMENTS = [
    RemoteHostElement,
    LogNameElement,
    RemoteUserElement,
    TimeElement,
    ReqFirstlineElement,
    StatusElement,
    BytesSentElement,
    BytesSentWithHeadersElement,
    ServeTimeMicroElement,
    UrlPathElement,
    QueryStringElement,
    ReqMethodElement,
    ProtocolElement
  ]

  # Returns the LogFormatElement subclass with the given format-string abbreviation.
  #
  # If none exists, returns nil.
  def self.find_by_abbrev(abbrev)
    @@_ELEMENTS.find { |element| element.abbrev == abbrev }
  end
end
|
248
|
+
|
249
|
+
|
250
|
+
# Generates LogFormatElement instances.
#
# This class does the work of figuring out which LogFormatElement subclass to make and makes it.
class LogFormatElementFactory
  # Takes an Apache log format abbreviation and returns a corresponding LogFormatElement
  #
  # The ElementDictionary is consulted first; failing that, "%{Header}i"
  # yields a request-header element and "%{name:regex}r" an arbitrary-regex
  # element. Raises a RuntimeError for anything else.
  def from_abbrev(abbrev)
    known_cls = ElementDictionary.find_by_abbrev(abbrev)
    return known_cls.new if known_cls

    case abbrev
    when /^%\{([A-Za-z0-9-]+)\}i/
      # HTTP request header
      _reqheader_element(abbrev, $1)
    when /^%\{(.*?):([^}]+)\}r/
      # Arbitrary regex
      _regex_element(abbrev, $1, $2)
    else
      raise "Unknown element format '#{abbrev}'"
    end
  end

  # Returns a format element based on an HTTP header
  def _reqheader_element(abbrev, header_name)
    header_element = ReqheaderElement.new
    header_element.name = _header_name_to_element_name(header_name)
    header_element.regex = '[^"]*'
    header_element.abbrev = abbrev
    header_element
  end

  # Returns a format element based on an arbitrary regex
  def _regex_element(abbrev, regex_name, regex)
    custom_element = RegexElement.new
    custom_element.name = "regex_#{regex_name}".to_sym
    custom_element.regex = regex
    custom_element.abbrev = abbrev
    custom_element
  end

  # Lowercases header name and turns hyphens into underscores
  def _header_name_to_element_name(header_name)
    "reqheader_#{header_name.downcase.gsub("-", "_")}".to_sym
  end
end
|
@@ -0,0 +1,308 @@
|
|
1
|
+
# Abstract base for a procedure routine.
#
# A routine walks the entries of a log parser and evaluates a block against
# each of them (via instance_eval in the subclasses), so that the block can
# refer to log elements as if they were methods.
class ProcedureRoutine
  def initialize(log_parser)
    @_log_parser = log_parser
    @_current_entry = nil
  end

  # Allows blocks passed to a DSL routine to access parameters from the current log entry
  #
  # Any unknown method name is looked up in the current entry. When there is
  # no current entry, the call is a genuine missing method and is reported
  # as such (NoMethodError naming the method) instead of failing on nil.
  def method_missing(sym, *args)
    return super if @_current_entry.nil?

    @_current_entry[sym]
  end

  # Executes the DSL routine using the given block
  #
  # Abstract method
  def execute(&blk)
    raise "Not implemented"
  end

  # Anything that needs to happen after the routine completes but before it returns its
  # result can go in here.
  def finish
    @_log_parser.reset
  end
end
|
26
|
+
|
27
|
+
|
28
|
+
# DSL routine that returns the number of log entries where the block evaluates to true
class CountWhere < ProcedureRoutine
  # Evaluates the block against every entry and returns how many times the
  # result was truthy.
  def execute(&blk)
    matches = 0
    while (@_current_entry = @_log_parser.next_entry)
      matches += 1 if instance_eval(&blk)
    end
    matches
  end
end
|
40
|
+
|
41
|
+
|
42
|
+
# DSL routine that executes the block for every log entry
class Each < ProcedureRoutine
  # Evaluates the block against each entry in turn; returns nil.
  def execute(&blk)
    instance_eval(&blk) while (@_current_entry = @_log_parser.next_entry)
  end
end
|
50
|
+
|
51
|
+
|
52
|
+
# DSL routine(s) that filter(s) for entries for which the given block evaluates to true
#
# This can be called as 'filter()', which means the filtering happens in a temporary file, or
# as 'filter(path)', which means the filtering happens in the given file. It can also be called
# as 'filter!()', which means the filtering happens in place, clobbering what's in apachecrunch's
# target file.
class Filter < ProcedureRoutine
  # Writes the raw text of every entry for which the block is truthy to the
  # results file.
  #
  # path:     where to write the results (nil for a temporary file)
  # in_place: remembered for `finish`, which passes it to the log parser
  def execute(path=nil, in_place=false, &blk)
    @_in_place = in_place
    @_results_file = _make_results_file(path, in_place)

    while (@_current_entry = @_log_parser.next_entry)
      next unless instance_eval(&blk)

      @_results_file.write(@_current_entry[:text])
    end
  end

  # Hands the results file over to the log parser as its new target.
  def finish
    @_log_parser.replace_target(@_results_file, @_in_place)
  end

  # Returns a writable file object to which the results of the filter should be written.
  #
  # With no path (this includes the in-place case) that is a temp file;
  # otherwise the file at the given path, opened for writing.
  def _make_results_file(path, in_place)
    return open(path, "w") unless path.nil?

    Tempfile.new("apachecrunch")
  end
end
|
85
|
+
|
86
|
+
# DSL routine that returns the count of entries with each found value of the given block
#
# You might for instance run this with the block { status }, and you'd get back something like
# {"200" => 941, "301" => 41, "404" => 2}
class CountBy < ProcedureRoutine
  # Returns a hash mapping each value the block produced to the number of
  # entries that produced it.
  def execute(&blk)
    tallies = {}
    while (@_current_entry = @_log_parser.next_entry)
      found = instance_eval(&blk)
      tallies[found] = tallies.fetch(found, 0) + 1
    end
    tallies
  end
end
|
104
|
+
|
105
|
+
|
106
|
+
# DSL routine that finds the distribution of (numeric) values to which the given block evaluates
#
# For example,
#
#   distribution 100 do
#     bytes_sent
#   end
#
# would return a hash with keys from 0 up by multiples of 100, the value of each being the number
# of entries for which bytes_sent is between that key and the next key.
class Distribution < ProcedureRoutine
  # bucket_width: the width of each bucket
  #
  # Returns the bucket => count hash, which is empty when there were no entries.
  def execute(bucket_width, &blk)
    dist = {}
    while (@_current_entry = @_log_parser.next_entry)
      k = _key_for(instance_eval(&blk), bucket_width)
      dist[k] = dist.fetch(k, 0) + 1
    end

    # With no entries there is no largest key to backfill up to.
    return dist if dist.empty?

    # Backfill keys for which we didn't find a value
    0.step(dist.keys.max, bucket_width) do |k|
      dist[k] = 0 unless dist.key?(k)
    end

    dist
  end

  # Determines the key for the distribution hash given the value and step
  def _key_for(val, bucket_width)
    (val.to_i / bucket_width) * bucket_width
  end
end
|
142
|
+
|
143
|
+
|
144
|
+
# Same as Distribution, but the buckets get exponentially wider
#
# Each key is a power of width_base, and counts the entries whose value lies
# between that key and the next power. Values that are zero or negative have
# no logarithm; they are counted under the key 0.
class LogDistribution < ProcedureRoutine
  # width_base: the logarithmic base for the bucket width; must be greater than 1
  #
  # Returns the bucket => count hash, which is empty when there were no entries.
  # Raises ArgumentError if width_base is not greater than 1.
  def execute(width_base, &blk)
    raise ArgumentError, "width_base must be greater than 1" unless width_base > 1

    dist = {}
    while (@_current_entry = @_log_parser.next_entry)
      k = _key_for(instance_eval(&blk), width_base)
      dist[k] = dist.fetch(k, 0) + 1
    end

    # Backfill keys for which we didn't find a value. The 0 bucket is left
    # out of the walk: multiplying it by the base would never advance.
    positive_keys = dist.keys.select { |key| key > 0 }
    unless positive_keys.empty?
      k = positive_keys.min
      max_key = positive_keys.max
      while (k *= width_base) < max_key
        dist[k] = 0 unless dist.key?(k)
      end
    end

    dist
  end

  # Determines the key for the distribution hash given the value and logarithmic base for
  # the bucket width
  #
  # Strings are converted with to_f first (the same leniency Distribution
  # gets from to_i).
  def _key_for(val, width_base)
    val = val.to_f if val.respond_to?(:to_str)
    return 0 if val <= 0

    exp = (Math.log(val) / Math.log(width_base)).to_i
    if val >= 1
      # The floating-point quotient can be off by one around exact powers
      # (log(1000)/log(10) is 2.999...), so settle the exponent exactly.
      exp += 1 while width_base ** (exp + 1) <= val
      exp -= 1 while exp > 0 && width_base ** exp > val
    end

    width_base ** exp
  end
end
|
175
|
+
|
176
|
+
|
177
|
+
# DSL routine that determines a confidence interval for the values to which the block evaluates
#
# For example,
#
#   confidence_interval 95 do
#     time_to_serve
#   end
#
# would return two numbers, the lower and upper bound of a 95% confidence interval for the values
# of time_to_serve.
#
# The values are ordered with Array#sort, so the block should return numbers
# (strings would be ordered alphabetically).
class ConfidenceInterval < ProcedureRoutine
  # confidence: the percentage of values that should lie inside the interval
  #
  # Returns [lower, upper], or [nil, nil] when there were no entries.
  def execute(confidence, &blk)
    # Build a list of all the values found
    values = []
    while (@_current_entry = @_log_parser.next_entry)
      values << instance_eval(&blk)
    end
    return nil, nil if values.empty?

    values.sort!

    # Determine how many values are outside the bounds of the CI
    count_outside = (values.length * (1.0 - confidence/100.0)).to_i

    # Drop the same number of values from each end, but never so many that
    # the two bounds would cross.
    trim = [count_outside / 2, (values.length - 1) / 2].min

    # Find the bounds of the confidence interval
    return values[trim], values[-(trim + 1)]
  end
end
|
203
|
+
|
204
|
+
|
205
|
+
# DSL routine that finds the most common n values for the given block.
#
# Returns a list of lists, each of which is [value, count]. This list is sorted by count,
# most common first, and holds at most n items.
class MostCommon < ProcedureRoutine
  def execute(n, &blk)
    counts = CountBy.new(@_log_parser).execute(&blk)

    # Sort the block values by count, descending
    sorted_vals = counts.keys.sort do |val_a, val_b|
      counts[val_b] <=> counts[val_a]
    end

    # first(n) keeps exactly n values (a 0..n range would keep n + 1).
    sorted_vals.first(n).map do |val|
      [val, counts[val]]
    end
  end
end
|
222
|
+
|
223
|
+
|
224
|
+
# The environment in which a procedure file is evaluated.
#
# A procedure file is some ruby code that uses our DSL. Every DSL routine
# below builds the matching routine object around the log parser, executes
# it with the caller's block, lets it finish, and returns its result (or nil
# for the routines that have no result).
class ProcedureEnvironment
  def initialize(log_parser)
    @_log_parser = log_parser
  end

  # Evaluates the given string as a procedure in our DSL
  #
  # The string is run as Ruby code with this object as self.
  def eval_procedure(proc_string)
    eval(proc_string)
  end

  # DSL routine 'count_where'
  def count_where(&blk)
    _run(CountWhere, [], &blk)
  end

  # DSL routine 'filter!'
  def filter!(&blk)
    _run(Filter, [nil, true], &blk)
    nil
  end

  # DSL routine 'filter'
  def filter(target_path=nil, &blk)
    _run(Filter, [target_path], &blk)
    nil
  end

  # DSL routine 'each'
  def each(&blk)
    _run(Each, [], &blk)
    nil
  end

  # DSL routine 'count_by'
  def count_by(&blk)
    _run(CountBy, [], &blk)
  end

  # DSL routine 'distribution'
  def distribution(bucket_width, &blk)
    _run(Distribution, [bucket_width], &blk)
  end

  # DSL routine 'log_distribution'
  def log_distribution(width_base, &blk)
    _run(LogDistribution, [width_base], &blk)
  end

  # DSL routine 'confidence_interval'
  def confidence_interval(confidence, &blk)
    _run(ConfidenceInterval, [confidence], &blk)
  end

  # DSL routine 'most_common'
  def most_common(n, &blk)
    _run(MostCommon, [n], &blk)
  end

  # Instantiates the routine class for our log parser, executes it with the
  # given arguments and block, finishes it, and returns what execute returned.
  def _run(routine_cls, args, &blk)
    routine = routine_cls.new(@_log_parser)
    result = routine.execute(*args, &blk)
    routine.finish
    result
  end
end
|
data/lib/progress.rb
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
# Common base for the progress meters: owns the running count of entries seen, which the
# concrete meters update from their own output_progress methods.
class ProgressMeter
    # Starts the meter with no entries counted.
    def initialize
        @_entry_count = 0
    end
end
|
6
|
+
|
7
|
+
|
8
|
+
# Progress meter that prints the number of entries parsed every (n) lines.
|
9
|
+
class EntryCountProgressMeter < ProgressMeter
    # period - how many entries we wait between printing output. With the default of 10 000,
    #          we'll print output every 10 000 entries.
    #
    # Raises ArgumentError if period is not positive (a zero period would otherwise make
    # output_progress divide by zero).
    def initialize(period = 10000)
        raise ArgumentError, "period must be positive" unless period > 0
        @_period = period
        # Explicit empty parentheses: a bare 'super' would forward 'period' to
        # ProgressMeter#initialize, which takes no arguments.
        super()
    end

    # Counts one more parsed entry and, every 'period' entries, prints how many entries have
    # been parsed so far.
    #
    # 'entry' should be the latest log entry to be parsed, in hash form. This meter does not
    # read it.
    def output_progress(entry)
        @_entry_count += 1
        puts "Processed %d entries" % [@_entry_count] if @_entry_count % @_period == 0
    end
end
|
27
|
+
|
28
|
+
# Progress meter that prints the time of the latest parsed entry every (n) entries.
class TimeProgressMeter < ProgressMeter
    # period - how many entries we wait between printing output. With the default of 10 000,
    #          we'll print output every 10 000 entries.
    #
    # Raises ArgumentError if period is not positive (a zero period would otherwise make
    # output_progress divide by zero).
    def initialize(period = 10000)
        raise ArgumentError, "period must be positive" unless period > 0
        @_period = period
        # Explicit empty parentheses: a bare 'super' would forward 'period' to
        # ProgressMeter#initialize, which takes no arguments.
        super()
    end

    # Counts one more parsed entry and, every 'period' entries, prints the "time" value of the
    # entry just parsed.
    #
    # 'entry' should be the latest log entry to be parsed, in hash form; its "time" key is what
    # gets printed.
    def output_progress(entry)
        @_entry_count += 1
        puts "Processed through %s" % [entry["time"]] if @_entry_count % @_period == 0
    end
end
|
46
|
+
|
47
|
+
# Progress meter that never prints anything. ProgressMeterFactory uses it as the fallback
# when no known meter type was requested.
class NullProgressMeter < ProgressMeter
    # Accepts the latest parsed entry and deliberately ignores it; always returns nil.
    def output_progress(entry)
        nil
    end
end
|
51
|
+
|
52
|
+
|
53
|
+
# Constructs progress meters that output progress info to the user.
|
54
|
+
class ProgressMeterFactory
    # Builds a progress meter from the hash of options passed on the command line.
    #
    # options[:progress] selects the meter: "entry" gives an EntryCountProgressMeter, "time" a
    # TimeProgressMeter, and anything else (including nil) a NullProgressMeter.
    #
    # Returns a new instance of the selected meter class.
    def self.from_options(options)
        meter_classes = {
            "entry" => EntryCountProgressMeter,
            "time" => TimeProgressMeter
        }

        meter_classes.fetch(options[:progress], NullProgressMeter).new
    end
end
|
metadata
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: apachecrunch
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 9
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: "0.1"
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Dan Slimmon
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-07-09 00:00:00 -04:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: |-
|
22
|
+
Apache Crunch is an analysis tool for Apache logs. You write little scripts
|
23
|
+
to do the analysis, using our DSL to make the procedure as simple and readable
|
24
|
+
as possible. See our homepage for more details.
|
25
|
+
email: dan@danslimmon.com
|
26
|
+
executables:
|
27
|
+
- apachecrunch
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files: []
|
31
|
+
|
32
|
+
files:
|
33
|
+
- lib/apachecrunch.rb
|
34
|
+
- lib/log_element.rb
|
35
|
+
- lib/procedure_dsl.rb
|
36
|
+
- lib/progress.rb
|
37
|
+
- bin/apachecrunch
|
38
|
+
- LICENSE
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: https://github.com/danslimmon/apachecrunch/
|
41
|
+
licenses:
|
42
|
+
- Creative Commons Share-Alike
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
hash: 3
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
hash: 3
|
63
|
+
segments:
|
64
|
+
- 0
|
65
|
+
version: "0"
|
66
|
+
requirements: []
|
67
|
+
|
68
|
+
rubyforge_project:
|
69
|
+
rubygems_version: 1.6.2
|
70
|
+
signing_key:
|
71
|
+
specification_version: 3
|
72
|
+
summary: Apache log analysis tool designed for ease of use
|
73
|
+
test_files: []
|
74
|
+
|