apachecrunch 0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1 @@
1
+ http://creativecommons.org/licenses/by-sa/3.0/
data/bin/apachecrunch ADDED
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # For development while inside the apachecrunch dir:
4
+ $: << ".."
5
+ $: << "./lib"
6
+ require "rubygems"
7
+
8
+ require "apachecrunch"
9
+ require "progress"
10
+ require "procedure_dsl"
11
+
12
+
13
+ # Prints the usage message and exits with the given exit code
14
# Prints the usage message to standard output and terminates the process.
#
# exit_code: Integer status passed to Kernel#exit after the message is printed.
def barf_usage(exit_code)
  # parse_args takes an option's value from the argument that follows it, so the
  # options are shown space-separated. The script is installed as "apachecrunch".
  puts %q!USAGE:
    apachecrunch <PROCEDURE> <LOG>
        [--format <FORMAT NAME>] [--progress <METER TYPE>]

    --progress: Gives you a progress meter as the log file is parsed. METER TYPE can be "entry",
                which prints out how many entries have been parsed so far, or "time", which prints
                out the time of the last entry parsed.!
  exit exit_code
end
24
+
25
+
26
+ # Parses arguments
27
+ #
28
+ # Returns a hash with these keys (as symbols):
29
+ # procedure: The path to the procedure DSL file
30
+ # logfile: The path to the log file
31
+ # format: The name of the log format specified ("ncsa" by default)
32
# Parses the command line found in ARGV (ARGV itself is left untouched).
#
# Returns a hash with these keys (as symbols):
#   procedure: The path to the procedure DSL file (first positional argument)
#   logfile:   The path to the log file (second positional argument)
#   format:    The name of the log format specified ("ncsa" by default)
#   progress:  The progress meter type, or nil when none was requested
#
# Option values may be given either as the following argument ("--format ncsa")
# or inline ("--format=ncsa"). Prints usage and exits 0 on "--help"; prints usage
# and exits 1 when a positional argument or the format name is missing.
def parse_args
  args = ARGV.clone
  options = { :format => "ncsa", :progress => nil }

  while (arg = args.shift)
    case arg
    when "--help"
      barf_usage(0)
    when "--format"
      # A nil format name would only blow up later during the format lookup.
      options[:format] = args.shift || barf_usage(1)
    when /\A--format=(.*)\z/m
      value = Regexp.last_match(1)
      options[:format] = value.empty? ? barf_usage(1) : value
    when "--progress"
      options[:progress] = args.shift
    when /\A--progress=(.*)\z/m
      options[:progress] = Regexp.last_match(1)
    else
      # Positional arguments: the procedure comes first, anything after it is the log.
      if options.key?(:procedure)
        options[:logfile] = arg
      else
        options[:procedure] = arg
      end
    end
  end

  barf_usage(1) unless options.key?(:procedure) && options.key?(:logfile)

  options
end
59
+
60
+
61
# Script entry point: resolve the log format, build the parser, then run the procedure.
options = parse_args

format_string = FormatStringFinder.new.find(options[:format])
progress_meter = ProgressMeterFactory.from_options(options)
# LogParserFactory.log_parser takes plain positional arguments: format, path, meter.
log_parser = LogParserFactory.log_parser(format_string, options[:logfile], progress_meter)

proc_env = ProcedureEnvironment.new(log_parser)
# File.read closes the handle and, unlike Kernel#open, never treats a path that
# starts with "|" as a command to run.
proc_env.eval_procedure(File.read(options[:procedure]))
data/lib/apachecrunch.rb ADDED
@@ -0,0 +1,316 @@
1
+ require "date"
2
+ require "tempfile"
3
+
4
+ require 'log_element'
5
+
6
+
7
+ # A parsed entry from the log.
8
+ #
9
+ # Acts like a hash, in that you get at the log elements (e.g. "url_path", "remote_host") by
10
+ # as entry[name].
11
class LogEntry
  # derivation_map: hash of element name => class able to derive that element.
  def initialize(derivation_map)
    @_attributes = {}
    @_derivation_map = derivation_map
  end

  # Stores a value under the given element name.
  def []=(name, value)
    @_attributes[name] = value
  end

  # Looks up an element by name.
  #
  # Stored values win. Otherwise the derivation map is asked which class can
  # derive the element, and that class computes it from the stored value of its
  # own element. Returns nil when the name is neither stored nor derivable.
  def [](name)
    return @_attributes[name] if @_attributes.key?(name)

    source_cls = @_derivation_map[name]
    source_cls.nil? ? nil : source_cls.derive(name, @_attributes[source_cls.name])
  end

  # Copies every pair of the given hash into this entry, overwriting existing names.
  def merge!(hsh)
    @_attributes.update(hsh)
  end
end
34
+
35
+
36
+ # A bare string in a log format
37
+ #
38
+ # Exposes 'regex' for consistency with LogFormatElement, but there shouldn't be anything other
39
+ # than one-to-one character matching in there.
40
class LogFormatString
  # The pattern text for this literal piece of the format (readable and writable).
  attr_reader :regex
  attr_writer :regex

  # regex: the pattern text to expose through #regex.
  def initialize(regex)
    self.regex = regex
  end
end
47
+
48
+
49
+ # Represents a particular Apache log format
50
class LogFormat
  attr_accessor :format_string
  attr_reader :tokens

  def initialize
    @tokens = []
    @_regex = nil
  end

  # Replaces the token list and drops the cached regex built from the old one.
  def tokens=(tokens)
    @_regex = nil
    @tokens = tokens
  end

  # Appends a given token (a LogFormatElement or LogFormatString) to the tokens list.
  #
  # The cached regex is discarded so a later call to #regex includes the new token.
  # Pushing onto #tokens directly bypasses this; use #append.
  def append(token)
    @_regex = nil
    @tokens << token
  end

  # Returns a compiled regex to match a log line in this format.
  #
  # The regex is built once and cached until the token list changes.
  def regex
    @_regex ||= Regexp.compile("^" + @tokens.map { |tok| _pattern_for(tok) }.join + "$")
  end

  # Returns the list of LogFormatElements, in order, of the interpolated things in the format.
  #
  # For example, if the log format string were "%h %u %{Referer}i", this would return the
  # LogFormatElement instances for "%h", "%u", and "%{Referer}i".
  def elements
    @tokens.select { |tok| tok.respond_to?(:name) }
  end

  # Returns hash mapping names of elements to the element class from which they can be derived.
  def derivation_map
    hsh = {}
    elements.each do |tok|
      tok.derived_elements.each do |derived_element|
        hsh[derived_element.name] = tok.class
      end
    end

    hsh
  end

  # Pattern text for one token. Only tokens with a name (the elements) get a capture
  # group; bare strings are matched but not remembered.
  def _pattern_for(tok)
    tok.respond_to?(:name) ? "(" + tok.regex + ")" : tok.regex
  end
end
105
+
106
+
107
+ # Turns a string specifying an Apache log format into a LogFormat instance
108
class LogFormatFactory
  def initialize
    @element_factory = LogFormatElementFactory.new
  end

  # Constructs and returns a LogFormat instance based on the given Apache log format string.
  #
  # The string is consumed one token at a time from the left; the original string
  # object is not modified.
  def from_format_string(f_string)
    logformat = LogFormat.new
    logformat.format_string = f_string

    until f_string.empty?
      token, f_string = _shift_token(f_string)
      logformat.append(token)
    end

    logformat
  end

  # Finds the first token (a LogFormatElement or LogFormatString) in a format string.
  #
  # Returns a list containing the token and the new format string (with the characters that
  # correspond to the token removed). Raises a RuntimeError if no token can be taken, so
  # the caller's loop can never spin on an unchanged string.
  def _shift_token(f_string)
    case f_string
    when /\A%%(.*)/m
      # "%%" in the format means one literal "%" in the log line.
      [LogFormatString.new(Regexp.escape("%")), Regexp.last_match(1)]
    when /\A(%[A-Za-z])(.*)/m
      # Simple element (e.g. "%h", "%u")
      [@element_factory.from_abbrev(Regexp.last_match(1)), Regexp.last_match(2)]
    when /\A(%\{.+?\}[Ceinor])(.*)/m
      # "Contents of" element (e.g. "%{Accept}i")
      [@element_factory.from_abbrev(Regexp.last_match(1)), Regexp.last_match(2)]
    when /\A(.+?)(%.*|\z)/m
      # Bare string up until the next %, or up until the end of the format string.
      # It is escaped because it must match character for character: "[", "(" or "."
      # in the format are literal text, not regex syntax.
      [LogFormatString.new(Regexp.escape(Regexp.last_match(1))), Regexp.last_match(2)]
    else
      raise "Could not parse log format at '#{f_string}'"
    end
  end
end
146
+
147
+
148
+ # Makes log line hashes based on log file text
149
+ class LogLineParser
150
+ # Initializes the instance given a LogFormat instance
151
+ def initialize(log_format, progress_meter)
152
+ @log_format = log_format
153
+ @progress_meter = progress_meter
154
+
155
+ @_elements = log_format.elements
156
+ @_derivation_map = log_format.derivation_map
157
+ end
158
+
159
+ # Returns a log line hash built from a line of text, or nil if the line was malformatted
160
+ #
161
+ # The keys of the hash are names of LogFormatElements (e.g. "remote_host", "reqheader_referer")
162
+ def from_text(log_text)
163
+ match = (log_text =~ @log_format.regex)
164
+ if match.nil?
165
+ warn "Log line did not match expected format: #{log_text}"
166
+ return nil
167
+ end
168
+
169
+ # Make a hash mapping all parsed elements to their values in the entry
170
+ match_groups = Regexp.last_match.to_a
171
+ match_groups.shift # First value is the whole matched string, which we do not want
172
+ element_values = Hash[*@_elements.zip(match_groups).flatten]
173
+
174
+ # Start building the return value
175
+ entry = LogEntry.new(@_derivation_map)
176
+ entry[:text] = log_text
177
+ # Insert all the elements specified in the LogFormat
178
+ entry.merge!(_elements_to_hash(element_values))
179
+
180
+ @progress_meter.output_progress(entry)
181
+ entry
182
+ end
183
+
184
+ # Returns a hash of "element name" => value pairs based on a hash of element => value pairs.
185
+ def _elements_to_hash(element_values)
186
+ hsh = {}
187
+ element_values.each_pair do |element, value|
188
+ hsh[element.name] = value
189
+ end
190
+
191
+ hsh
192
+ end
193
+
194
+ # Returns hash of derived "element name" => value pairs from a hash of element => value pairs.
195
+ #
196
+ # That is, we go through the elements passed and if any offers derived elements, we include
197
+ # those in the return value.
198
+ def _derived_elements(element_values)
199
+ hsh = {}
200
+ element_values.each_pair do |element, value|
201
+ hsh.merge!(element.derived_values(value))
202
+ end
203
+
204
+ hsh
205
+ end
206
+ end
207
+
208
+
209
+ # Parses a log file given a path and a LogFormat instance
210
require "fileutils"

class LogParser
  # Initializes the parser with the path to a log file and a LogLineParser.
  def initialize(path, ll_parser)
    @path = path
    @ll_parser = ll_parser

    @_file = nil
    # Keeps a replacement target alive; see replace_target.
    @_target = nil
  end

  # Returns the next entry in the log file, or nil if we've reached EOF.
  #
  # The file is opened lazily on the first call. Lines the LogLineParser rejects
  # (it returns nil and writes a warning) are skipped.
  def next_entry
    # File.open rather than Kernel#open: a path starting with "|" must not run a command.
    @_file = File.open(@path) if @_file.nil?

    while (line_text = @_file.gets)
      entry = @ll_parser.from_text(line_text)
      return entry unless entry.nil?
    end

    nil
  end

  # Resets the LogParser's filehandle so we can start over.
  #
  # The handle is closed, not just dropped, so repeated routines do not pile up
  # open descriptors.
  def reset
    _close_file
  end

  # Makes the LogParser close its current log file and start parsing a new one instead
  #
  # `new_target` is a writable file object that the parser should start parsing, and if
  # in_place is true, we actually replace the contents of the current target with those
  # of the new target.
  def replace_target(new_target, in_place)
    new_target.close
    _close_file

    if in_place
      # @path is valid even if the log was never opened, and FileUtils.mv falls back
      # to copy-and-delete when the temp file is on a different filesystem.
      FileUtils.mv(new_target.path, @path)
    else
      @path = new_target.path
      # Hold a reference: a Tempfile removes its file once the object is collected.
      @_target = new_target
    end

    nil
  end

  # Closes the current handle, if any, and forgets it.
  def _close_file
    @_file.close unless @_file.nil? || @_file.closed?
    @_file = nil
  end
end
260
+
261
+ # Makes a LogParser given the parameters we want to work with.
262
+ #
263
+ # This is the class that most external code should instantiate to begin using this library.
264
class LogParserFactory
  # Builds a LogParser for the log file at `path`, whose lines follow the Apache
  # log format described by `format_string`.
  #
  # The format string becomes a LogFormat, that format (with the progress meter)
  # becomes a LogLineParser, and the line parser is wrapped in a LogParser.
  def self.log_parser(format_string, path, progress_meter)
    log_format = LogFormatFactory.new.from_format_string(format_string)
    line_parser = LogLineParser.new(log_format, progress_meter)

    LogParser.new(path, line_parser)
  end
end
279
+
280
+
281
+ # Finds a named log format string in the configuration file(s)
282
class FormatStringFinder
  @@FILE_NAME = "log_formats.rb"
  @@DEFAULT_FORMATS = {
    :ncsa => %q!%h %l %u %t \"%r\" %s %b \"%{Referer}i\" \"%{User-agent}i\"!,
    :ubuntu => %q!%h %l %u %t \"%r\" %s %O \"%{Referer}i\" \"%{User-Agent}i\"!
  }

  # Finds the given format string in the built-in defaults or the configuration file(s).
  #
  # format_name: the format's name, as a String or Symbol.
  #
  # Directories are visited in _search_path order. A readable config file found in a
  # directory is evaluated, and the search stops as soon as the name is known.
  # Returns the format string with \" unescaped to ".
  # Raises a RuntimeError if the name is not found anywhere on the search path.
  def find(format_name)
    name_as_symbol = format_name.to_sym

    # Config files are evaluated in this method's binding and add to the `formats`
    # local, so that variable name is part of the config file contract.
    formats = @@DEFAULT_FORMATS.clone
    _search_path.each do |dir|
      config_path = File.join(dir, @@FILE_NAME)
      if File.readable?(config_path)
        # SECURITY NOTE: this runs arbitrary Ruby from the config file, including one
        # in the current working directory. Only run apachecrunch where those files
        # are trusted. File.read closes the handle; the path is passed along so
        # errors in the config name the right file.
        eval(File.read(config_path), binding, config_path)
      end

      if formats.key?(name_as_symbol)
        return formats[name_as_symbol].gsub(/\\"/, '"')
      end
    end

    raise "Failed to find the format '#{format_name}' in the search path: #{_search_path.inspect}"
  end

  # Directories searched for the config file, in priority order.
  #
  # The per-user directory is left out when HOME is not set instead of raising.
  def _search_path
    dirs = [".", "./etc"]
    home = ENV["HOME"]
    dirs << File.join(home, ".apachecrunch") unless home.nil? || home.empty?
    dirs << "/etc/apachecrunch"
    dirs
  end
end
data/lib/log_element.rb ADDED
@@ -0,0 +1,297 @@
1
+ # Converts a string to an integer
2
class IntegerCast
  class << self
    # Returns string_value.to_i (so text with no leading digits becomes 0).
    def cast(string_value)
      string_value.to_i
    end
  end
end
7
+
8
+
9
+ # Converts a CLF-formatted string to an integer
10
+ #
11
+ # "CLF-formatted" means that if the value is 0, the string will be a single hyphen instead of
12
+ # a number. Like %b, for instance.
13
class CLFIntegerCast
  # Returns 0 for a lone hyphen (how CLF writes zero); otherwise string_value.to_i.
  def self.cast(string_value)
    string_value == "-" ? 0 : string_value.to_i
  end
end
21
+
22
+
23
+ # An element in a log format. Abstract from which all elements inherit.
24
+ #
25
+ # Exposes:
26
+ # abbrev: The Apache abbreviation for the element (such as "%h" or "%u" or "%{Referer}i")
27
+ # name: A short name for the element (such as "remote_host", "remote_user", or "reqhead_referer")
28
+ # regex: A regex that should match such an element ("[A-Za-z0-9.-]+", "[^:]+", ".+")
29
+ #
30
+ # If '_caster' is not nil, it should be a class with a method called "cast" that
31
+ # transforms a string to the appropriate data type or format for consumption.
32
+ # For example, the IntegerCast class transforms "562" to 562. The correct cast
33
+ # of a string can then be performed by passing that string to this LogFormatElement
34
+ # instance's "cast" method.
35
+ #
36
+ # 'derive_elements' manages elements that can be derived from the instance's value. See
37
+ # ReqFirstlineElement for an example.
38
class LogFormatElement
  @_caster = nil

  attr_accessor :abbrev, :name, :regex

  class << self
    # Class-level values that determine the _default_ for abbrev, name, and regex in an
    # instance. That is, an instance will initialize with these values for the instance
    # variables @abbrev, @name, and @regex.
    attr_accessor :abbrev, :name, :regex

    # Additionally we need to access the caster from within the instance.
    attr_writer :_caster

    # Returns the caster class for this element class, or nil if it has none.
    #
    # A caster assigned as the class-level @_caster wins. Subclasses that declare it
    # as the class variable @@_caster are honoured as well; otherwise their caster
    # would be silently ignored and #cast would always return the raw string.
    def _caster
      return @_caster if instance_variable_defined?(:@_caster) && !@_caster.nil?

      class_variable_get(:@@_caster) if class_variable_defined?(:@@_caster)
    end
  end

  def initialize
    @abbrev = self.class.abbrev
    @name = self.class.name
    @regex = self.class.regex
  end

  # Casts a string found in the log to the correct type using the class's caster.
  #
  # Returns the string unchanged when the class has no caster.
  def cast(string_value)
    caster = self.class._caster
    caster.nil? ? string_value : caster.cast(string_value)
  end

  # Derives the named element (e.g. "url_path") from a given value for this one.
  #
  # Abstract: raises NotImplementedError here. See ReqFirstlineElement for an example.
  def self.derive(name, our_own_value)
    raise NotImplementedError
  end

  # Returns a list of the element classes that can be derived from this one.
  #
  # Empty by default. See ReqFirstlineElement for an example.
  def derived_elements
    []
  end
end
78
+
79
+
80
# The "%h" element, exposed to procedures as remote_host.
class RemoteHostElement < LogFormatElement
  @abbrev = "%h"
  @name = :remote_host
  # Host names and IPv4 addresses, plus ":" so IPv6 addresses (e.g. "::1") match too.
  @regex = %q![A-Za-z0-9.:-]+!
end
85
+
86
+
87
# The "%l" element, exposed to procedures as log_name: one run of non-whitespace.
class LogNameElement < LogFormatElement
  self.abbrev = "%l"
  self.name   = :log_name
  self.regex  = '\S+'
end
92
+
93
+
94
# The "%u" element, exposed to procedures as remote_user: any run of characters
# that contains no colon.
class RemoteUserElement < LogFormatElement
  self.abbrev = "%u"
  self.name   = :remote_user
  self.regex  = '[^:]+'
end
99
+
100
+
101
# The "%t" element, exposed to procedures as time. Matches a bracketed timestamp
# of the shape [dd/Mon/yyyy:hh:mm:ss +zzzz].
class TimeElement < LogFormatElement
  self.abbrev = "%t"
  self.name   = :time
  self.regex  = '\[\d\d/[A-Za-z]{3}/\d\d\d\d:\d\d:\d\d:\d\d [-+]\d\d\d\d\]'
end
106
+
107
+
108
# The "%r" element (request first line), exposed to procedures as req_firstline.
#
# The request method, URL path, query string and protocol can be derived from it.
class ReqFirstlineElement < LogFormatElement
  @abbrev = "%r"
  @name = :req_firstline
  @regex = %q![^"]+!

  # Compiled lazily by derive and reused afterwards.
  @_derivation_regex = nil

  # Derives the named element (one of the names of derived_elements) from the
  # request first line given as our_own_value.
  #
  # Returns the derived string, or nil when the first line does not have the
  # "<method> <path>[<query>] <protocol>" shape or the name is not derivable.
  def self.derive(name, our_own_value)
    # "\\s" keeps the backslash: in a double-quoted string a bare "\s" is just a
    # space character, which would stop the pattern matching other whitespace.
    @_derivation_regex ||= Regexp.compile(
      "^(#{ReqMethodElement.regex})\\s+(#{UrlPathElement.regex})" \
      "(#{QueryStringElement.regex})\\s+(#{ProtocolElement.regex})$"
    )

    match = @_derivation_regex.match(our_own_value.to_s)
    return nil if match.nil?

    derived = {
      ReqMethodElement.name => match[1],
      UrlPathElement.name => match[2],
      QueryStringElement.name => match[3],
      ProtocolElement.name => match[4]
    }
    derived[name]
  end

  # Returns the element classes whose values derive can compute.
  def derived_elements
    [ReqMethodElement, UrlPathElement, QueryStringElement, ProtocolElement]
  end
end
135
+
136
+
137
# The "%s" element, exposed to procedures as status: digits, or a lone hyphen.
class StatusElement < LogFormatElement
  self.abbrev = "%s"
  self.name   = :status
  self.regex  = '\d+|-'
end
142
+
143
+
144
# The "%b" element, exposed to procedures as bytes_sent.
#
# NOTE(review): this class is reopened further down in the file with a different
# regex and caster, which replaces the values set here. This first definition may
# have been meant for a different abbreviation — confirm.
class BytesSentElement < LogFormatElement
  @abbrev = "%b"
  @name = :bytes_sent
  @regex = %q!\d+!

  # Class-level instance variable, which is what LogFormatElement#cast reads
  # through the class's _caster accessor.
  @_caster = IntegerCast
end
151
+
152
+
153
# The "%b" element, exposed to procedures as bytes_sent. The value is
# CLF-formatted: a lone hyphen stands for zero, hence the hyphen in the regex.
class BytesSentElement < LogFormatElement
  @abbrev = "%b"
  @name = :bytes_sent
  @regex = %q![\d-]+!

  # Class-level instance variable, which is what LogFormatElement#cast reads
  # through the class's _caster accessor.
  @_caster = CLFIntegerCast
end
160
+
161
+
162
# The "%O" element, exposed to procedures as bytes_sent_with_headers.
class BytesSentWithHeadersElement < LogFormatElement
  @abbrev = "%O"
  @name = :bytes_sent_with_headers
  @regex = %q!\d+!

  # Class-level instance variable, which is what LogFormatElement#cast reads
  # through the class's _caster accessor.
  @_caster = IntegerCast
end
169
+
170
+
171
# The "%D" element, exposed to procedures as serve_time_micro.
class ServeTimeMicroElement < LogFormatElement
  @abbrev = "%D"
  @name = :serve_time_micro
  @regex = %q!\d+!

  # Class-level instance variable, which is what LogFormatElement#cast reads
  # through the class's _caster accessor.
  @_caster = IntegerCast
end
178
+
179
+
180
# The "%U" element, exposed to procedures as url_path: a slash followed by
# anything up to (not including) a question mark.
class UrlPathElement < LogFormatElement
  self.abbrev = "%U"
  self.name   = :url_path
  self.regex  = '/[^?]*'
end
185
+
186
+
187
# The "%q" element, exposed to procedures as query_string: an optional question
# mark followed by any run of non-whitespace (possibly empty).
class QueryStringElement < LogFormatElement
  self.abbrev = "%q"
  self.name   = :query_string
  self.regex  = '\??\S*'
end
192
+
193
+
194
# The "%m" element, exposed to procedures as req_method: one or more capital letters.
class ReqMethodElement < LogFormatElement
  self.abbrev = "%m"
  self.name   = :req_method
  self.regex  = '[A-Z]+'
end
199
+
200
+
201
# The "%H" element, exposed to procedures as protocol: one run of non-whitespace.
class ProtocolElement < LogFormatElement
  self.abbrev = "%H"
  self.name   = :protocol
  self.regex  = '\S+'
end
206
+
207
+
208
# Element for a request header. It has no class-level defaults; instances get their
# abbrev, name and regex assigned after construction.
class ReqheaderElement < LogFormatElement; end
210
+
211
+
212
# Element matched by an arbitrary regex. It has no class-level defaults; instances
# get their abbrev, name and regex assigned after construction.
class RegexElement < LogFormatElement; end
214
+
215
+
216
+ # Finds log format elements given information about them.
217
class ElementDictionary
  # Every element class that can be looked up by a fixed abbreviation, listed once each.
  @@_ELEMENTS = [
    RemoteHostElement,
    LogNameElement,
    RemoteUserElement,
    TimeElement,
    ReqFirstlineElement,
    StatusElement,
    BytesSentElement,
    BytesSentWithHeadersElement,
    ServeTimeMicroElement,
    UrlPathElement,
    QueryStringElement,
    ReqMethodElement,
    ProtocolElement
  ]

  # Returns the LogFormatElement subclass with the given format-string abbreviation.
  #
  # If none exists, returns nil.
  def self.find_by_abbrev(abbrev)
    @@_ELEMENTS.find { |element| element.abbrev == abbrev }
  end
end
248
+
249
+
250
+ # Generates LogFormatElement instances.
251
+ #
252
+ # This class does the work of figuring out which LogFormatElement subclass to make and makes it.
253
class LogFormatElementFactory
  # Turns an Apache log format abbreviation into a LogFormatElement instance.
  #
  # Abbreviations known to the ElementDictionary produce an instance of the class
  # found there. "%{Header}i" produces a request-header element and "%{name:regex}r"
  # an arbitrary-regex element. Anything else raises a RuntimeError.
  def from_abbrev(abbrev)
    known_cls = ElementDictionary.find_by_abbrev(abbrev)
    return known_cls.new if known_cls

    if abbrev =~ /^%\{([A-Za-z0-9-]+)\}i/
      _reqheader_element(abbrev, $1)
    elsif abbrev =~ /^%\{(.*?):([^}]+)\}r/
      _regex_element(abbrev, $1, $2)
    else
      raise "Unknown element format '#{abbrev}'"
    end
  end

  # Builds the element for an HTTP request header: it matches any run of
  # characters other than a double quote.
  def _reqheader_element(abbrev, header_name)
    header = ReqheaderElement.new
    header.name = _header_name_to_element_name(header_name)
    header.regex = %q![^"]*!
    header.abbrev = abbrev
    header
  end

  # Builds the element for an arbitrary regex; it is named "regex_<regex_name>".
  def _regex_element(abbrev, regex_name, regex)
    custom = RegexElement.new
    custom.name = "regex_#{regex_name}".to_sym
    custom.regex = regex
    custom.abbrev = abbrev
    custom
  end

  # Lowercases the header name, turns hyphens into underscores and prefixes
  # "reqheader_"; the result is a symbol.
  def _header_name_to_element_name(header_name)
    "reqheader_#{header_name.downcase.tr('-', '_')}".to_sym
  end
end
data/lib/procedure_dsl.rb ADDED
@@ -0,0 +1,308 @@
1
+ # Abstract for a procedure routine.
2
class ProcedureRoutine
  # log_parser: the parser whose entries the routine consumes.
  def initialize(log_parser)
    @_log_parser = log_parser
    @_current_entry = nil
  end

  # Allows blocks passed to a DSL routine to access parameters from the current log entry
  #
  # A bare name such as `status` is looked up in the current entry (nil if the entry
  # has no such element). Calls made with arguments, or while no entry is loaded,
  # are not entry lookups; they go to the default handler so the NoMethodError names
  # the method that was actually called.
  def method_missing(sym, *args)
    return super if @_current_entry.nil? || !args.empty?

    @_current_entry[sym]
  end

  # Executes the DSL routine using the given block
  #
  # Abstract method
  def execute(&blk)
    raise "Not implemented"
  end

  # Anything that needs to happen after the routine completes but before it returns its
  # result can go in here. By default the parser is reset for the next routine.
  def finish
    @_log_parser.reset
  end
end
26
+
27
+
28
+ # DSL routine that returns the number of log entries where the block evaluates to true
29
class CountWhere < ProcedureRoutine
  # Reads every entry from the parser, evaluates the block in this routine's context
  # (so entry elements are reachable by name) and returns how many times it was truthy.
  def execute(&blk)
    matches = 0
    while (@_current_entry = @_log_parser.next_entry)
      matches += 1 if instance_eval(&blk)
    end
    matches
  end
end
40
+
41
+
42
+ # DSL routine that executes the block for every log entry
43
class Each < ProcedureRoutine
  # Evaluates the block once per log entry, in this routine's context so entry
  # elements are reachable by name. The block's results are discarded.
  def execute(&blk)
    instance_eval(&blk) while (@_current_entry = @_log_parser.next_entry)
  end
end
50
+
51
+
52
+ # DSL routine(s) that filter(s) for entries for which the given block evaluates to true
53
+ #
54
+ # This can be called as 'filter()', which means the filtering happens in a temporary file, or
55
+ # as 'filter(path)', which means the filtering happens in the given file. It can also be called
56
+ # as 'filter!()', which means the filtering happens in place, clobbering what's in apachecrunch's
57
+ # target file.
58
class Filter < ProcedureRoutine
  # Writes the raw text of every entry for which the block is truthy to the results file.
  #
  # path:     where to write the results; nil means a temporary file.
  # in_place: remembered for finish, which hands it to the parser.
  def execute(path=nil, in_place=false, &blk)
    @_in_place = in_place
    @_results_file = _make_results_file(path, in_place)

    while (@_current_entry = @_log_parser.next_entry)
      next unless instance_eval(&blk)

      @_results_file.write(@_current_entry[:text])
    end
  end

  # Hands the results file to the parser as its new target instead of resetting it.
  def finish
    @_log_parser.replace_target(@_results_file, @_in_place)
  end

  # Returns a writable file object to which the results of the filter should be written.
  #
  # With no path (which includes the in-place case) that is a temp file; otherwise
  # the given path is opened for writing.
  def _make_results_file(path, in_place)
    path.nil? ? Tempfile.new("apachecrunch") : open(path, "w")
  end
end
85
+
86
+ # DSL routine that returns the count of entries with each found value of the given block
87
+ #
88
+ # You might for instance run this with the block { status }, and you'd get back something like
89
+ # {"200" => 941, "301" => 41, "404" => 2, "500" => 0}
90
class CountBy < ProcedureRoutine
  # Evaluates the block for every entry and returns a plain hash mapping each
  # distinct block value to the number of entries that produced it.
  def execute(&blk)
    tally = {}
    while (@_current_entry = @_log_parser.next_entry)
      bucket = instance_eval(&blk)
      tally[bucket] = tally.fetch(bucket, 0) + 1
    end
    tally
  end
end
104
+
105
+
106
+ # DSL routine that finds the distribution of (numeric) values to which the given block evaluates
107
+ #
108
+ # For example,
109
+ #
110
+ # distribution 100 do
111
+ # bytes_sent
112
+ # end
113
+ #
114
+ # would return a hash with keys from 0 up by multiples of 100, the value of each being the number
115
+ # of entries for which bytes_sent is between that key and the next key.
116
class Distribution < ProcedureRoutine
  # Counts entries per fixed-width bucket of the block's value.
  #
  # bucket_width: width of each bucket; keys are multiples of it.
  #
  # Returns a hash of bucket start => entry count. Buckets between 0 and the highest
  # one seen that received no entries are present with a count of 0. A log with no
  # entries gives an empty hash.
  def execute(bucket_width, &blk)
    dist = {}
    while (@_current_entry = @_log_parser.next_entry)
      key = _key_for(instance_eval(&blk), bucket_width)
      dist[key] = dist.fetch(key, 0) + 1
    end

    # Nothing to backfill; without this guard the step below would get nil as its limit.
    return dist if dist.empty?

    # Backfill keys for which we didn't find a value
    0.step(dist.keys.max, bucket_width) do |k|
      dist[k] = 0 unless dist.key?(k)
    end

    dist
  end

  # Determines the key for the distribution hash given the value and step
  def _key_for(val, bucket_width)
    (val.to_i / bucket_width) * bucket_width
  end
end
142
+
143
+
144
+ # Same as Distribution, but the buckets get exponentially wider
145
class LogDistribution < ProcedureRoutine
  # Counts entries per bucket, where each bucket starts at a power of width_base.
  #
  # width_base: the base of the powers; must be greater than 1.
  #
  # Returns a hash of bucket start => entry count. Values below 1 are counted under
  # key 0. Powers between the lowest and highest bucket seen that received no entries
  # are present with a count of 0. A log with no entries gives an empty hash.
  def execute(width_base, &blk)
    dist = {}
    while (@_current_entry = @_log_parser.next_entry)
      key = _key_for(instance_eval(&blk), width_base)
      dist[key] = dist.fetch(key, 0) + 1
    end

    return dist if dist.empty?

    # Backfill keys for which we didn't find a value. Key 0 cannot be multiplied up
    # to the next bucket, so the walk starts at 1 in that case.
    lowest = dist.keys.min
    highest = dist.keys.max
    k = lowest < 1 ? 1 : lowest * width_base
    while k < highest
      dist[k] = 0 unless dist.key?(k)
      k *= width_base
    end

    dist
  end

  # Determines the key for the distribution hash given the value and logarithmic base for
  # the bucket width
  #
  # The key is the largest power of width_base that does not exceed the value. It is
  # found by repeated multiplication rather than Math.log, whose rounding can put an
  # exact power (e.g. 1000 with base 10) into the bucket below. The value is converted
  # with to_f first because entry values arrive as strings.
  def _key_for(val, width_base)
    unless width_base > 1
      raise ArgumentError, "width_base must be greater than 1 (got #{width_base.inspect})"
    end

    magnitude = val.to_f
    return 0 unless magnitude.finite? && magnitude >= 1

    key = 1
    key *= width_base while key * width_base <= magnitude
    key
  end
end
175
+
176
+
177
+ # DSL routine that determines a confidence interval for the values to which the block evaluates
178
+ #
179
+ # For example,
180
+ #
181
+ # confidence_interval 95 do
182
+ # time_to_serve
183
+ # end
184
+ #
185
+ # would return two numbers, the lower and upper bound of a 95% confidence interval for the values
186
+ # of time_to_serve.
187
class ConfidenceInterval < ProcedureRoutine
  # Collects the block's value for every entry and returns the bounds of the
  # central `confidence` percent of them.
  #
  # confidence: percentage of values the interval should cover (e.g. 95).
  #
  # Returns two values, lower bound then upper bound; both are nil when there were
  # no entries. NOTE(review): values are sorted exactly as the block returns them —
  # confirm whether callers convert string values to numbers first.
  def execute(confidence, &blk)
    # Build a list of all the values found
    values = []
    while (@_current_entry = @_log_parser.next_entry)
      values << instance_eval(&blk)
    end
    values.sort!

    # Determine how many values are outside the bounds of the CI. Subtracting before
    # dividing keeps whole-number confidences free of float rounding error.
    count_outside = (values.length * (100 - confidence) / 100.0).to_i

    # Drop the same number of values from each end. Counting the upper bound from
    # the end as -(dropped + 1) also yields the last element when nothing is dropped,
    # where a plain negated index would wrap around to the first.
    dropped_per_side = count_outside / 2
    return values[dropped_per_side], values[-(dropped_per_side + 1)]
  end
end
203
+
204
+
205
+ # DSL routine that finds the most common n values for the given block.
206
+ #
207
+ # Returns a list of lists, each of which is [value, count]. This list is sorted by count.
208
class MostCommon < ProcedureRoutine
  # Finds the n most frequent values of the block.
  #
  # n: maximum number of values to return.
  #
  # Returns a list of at most n [value, count] pairs, highest count first.
  def execute(n, &blk)
    counts = CountBy.new(@_log_parser).execute(&blk)

    # Sort the block values by descending count
    sorted_vals = counts.keys.sort_by { |val| -counts[val] }

    # first(n), not [0..n]: the inclusive range would return n + 1 values.
    sorted_vals.first(n).map do |val|
      [val, counts[val]]
    end
  end
end
222
+
223
+
224
+ # The environment in which a procedure file is evaluated.
225
+ #
226
+ # A procedure file is some ruby code that uses our DSL.
227
class ProcedureEnvironment
  # log_parser: the parser handed to every routine the procedure runs.
  def initialize(log_parser)
    @_log_parser = log_parser
  end

  # Evaluates the given string as a procedure in our DSL.
  #
  # The code runs with this object as self, so the DSL routines below are callable
  # by bare name.
  def eval_procedure(proc_string)
    eval(proc_string)
  end

  # DSL routine 'count_where': returns the routine's result.
  def count_where(&blk)
    _run_routine(CountWhere, &blk)
  end

  # DSL routine 'filter!': filters in place (no path, in_place = true); returns nil.
  def filter!(&blk)
    _run_routine(Filter, nil, true, &blk)
    nil
  end

  # DSL routine 'filter': filters into target_path (nil when omitted); returns nil.
  def filter(target_path=nil, &blk)
    _run_routine(Filter, target_path, &blk)
    nil
  end

  # DSL routine 'each': returns nil.
  def each(&blk)
    _run_routine(Each, &blk)
    nil
  end

  # DSL routine 'count_by': returns the routine's result.
  def count_by(&blk)
    _run_routine(CountBy, &blk)
  end

  # DSL routine 'distribution': returns the routine's result.
  def distribution(bucket_width, &blk)
    _run_routine(Distribution, bucket_width, &blk)
  end

  # DSL routine 'log_distribution': returns the routine's result.
  def log_distribution(width_base, &blk)
    _run_routine(LogDistribution, width_base, &blk)
  end

  # DSL routine 'confidence_interval': returns the routine's result.
  def confidence_interval(confidence, &blk)
    _run_routine(ConfidenceInterval, confidence, &blk)
  end

  # DSL routine 'most_common': returns the routine's result.
  def most_common(n, &blk)
    _run_routine(MostCommon, n, &blk)
  end

  # Shared life cycle of every routine: build it around the parser, execute it with
  # the given arguments and block, call finish, and return what execute returned.
  def _run_routine(routine_cls, *args, &blk)
    routine = routine_cls.new(@_log_parser)
    result = routine.execute(*args, &blk)
    routine.finish
    result
  end
end
data/lib/progress.rb ADDED
@@ -0,0 +1,65 @@
1
# Base class for progress meters: starts the count of processed entries at zero.
class ProgressMeter
  def initialize
    @_entry_count = 0
  end
end
6
+
7
+
8
+ # Progress meter that prints the number of entries parsed every (n) lines.
9
class EntryCountProgressMeter < ProgressMeter
  def initialize
    # Number of entries between two progress lines: with 10 000, a line is printed
    # every 10 000 entries.
    @_period = 10000
    super
  end

  # Counts one more parsed entry and, every @_period entries, prints how many have
  # been processed so far.
  #
  # 'entry' should be the latest log entry to be parsed; it is not inspected here.
  def output_progress(entry)
    @_entry_count += 1
    return unless (@_entry_count % @_period).zero?

    puts "Processed %d entries" % [@_entry_count]
  end
end
27
+
28
# Progress meter that prints the time of the latest entry parsed every (n) lines.
class TimeProgressMeter < ProgressMeter
  def initialize
    # 'period' is how many entries we wait between printing output. So if 'period' is 10 000,
    # we'll print output every 10 000 lines.
    @_period = 10000
    super
  end

  # Outputs the time of the latest entry parsed (every once in a while).
  #
  # 'entry' should be the latest log entry to be parsed.
  def output_progress(entry)
    @_entry_count += 1
    if @_entry_count % @_period == 0
      # Element names are symbols (TimeElement is named :time), so the string key
      # "time" never finds anything.
      puts "Processed through %s" % [entry[:time]]
    end
  end
end
46
+
47
# Progress meter that reports nothing.
class NullProgressMeter < ProgressMeter
  # Accepts the latest parsed entry and does nothing with it.
  def output_progress(entry); end
end
51
+
52
+
53
+ # Constructs progress meters that output progress info to the user.
54
class ProgressMeterFactory
  # Builds the progress meter selected by options[:progress].
  #
  # "entry" gives an EntryCountProgressMeter and "time" a TimeProgressMeter; any
  # other value, including nil, gives a NullProgressMeter.
  def self.from_options(options)
    meter_cls = case options[:progress]
                when "entry" then EntryCountProgressMeter
                when "time" then TimeProgressMeter
                else NullProgressMeter
                end

    meter_cls.new
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: apachecrunch
3
+ version: !ruby/object:Gem::Version
4
+ hash: 9
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ version: "0.1"
10
+ platform: ruby
11
+ authors:
12
+ - Dan Slimmon
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-07-09 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: |-
22
+ Apache Crunch is an analysis tool for Apache logs. You write little scripts
23
+ to do the analysis, using our DSL to make the procedure as simple and readable
24
+ as possible. See our homepage for more details.
25
+ email: dan@danslimmon.com
26
+ executables:
27
+ - apachecrunch
28
+ extensions: []
29
+
30
+ extra_rdoc_files: []
31
+
32
+ files:
33
+ - lib/apachecrunch.rb
34
+ - lib/log_element.rb
35
+ - lib/procedure_dsl.rb
36
+ - lib/progress.rb
37
+ - bin/apachecrunch
38
+ - LICENSE
39
+ has_rdoc: true
40
+ homepage: https://github.com/danslimmon/apachecrunch/
41
+ licenses:
42
+ - Creative Commons Share-Alike
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ hash: 3
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ requirements: []
67
+
68
+ rubyforge_project:
69
+ rubygems_version: 1.6.2
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: Apache log analysis tool designed for ease of use
73
+ test_files: []
74
+