apachecrunch 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1 @@
1
+ http://creativecommons.org/licenses/by-sa/3.0/
data/bin/apachecrunch ADDED
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # For development while inside the apachecrunch dir:
4
+ $: << ".."
5
+ $: << "./lib"
6
+ require "rubygems"
7
+
8
+ require "apachecrunch"
9
+ require "progress"
10
+ require "procedure_dsl"
11
+
12
+
13
+ # Prints the usage message and exits with the given exit code
14
def barf_usage(exit_code)
  # The syntax printed here mirrors what parse_args accepts: every option name
  # is followed by its value as a separate argument, and the executable is
  # installed as "apachecrunch" (no ".rb" suffix).
  puts %q!USAGE:
    apachecrunch <PROCEDURE> <LOG>
        [--format <FORMAT NAME>] [--progress <METER TYPE>] [--help]

    --format: The name of the log format to parse ("ncsa" by default).

    --progress: Gives you a progress meter as the log file is parsed. METER TYPE can be "entry",
                which prints out how many entries have been parsed so far, or "time", which prints
                out the time of the last entry parsed.

    --help: Prints this message and exits.!
  # exit_code is handed straight to Kernel#exit (0 for --help, non-zero for misuse).
  exit exit_code
end
24
+
25
+
26
+ # Parses arguments
27
+ #
28
+ # Returns a hash with these keys (as symbols):
29
+ # procedure: The path to the procedure DSL file
30
+ # logfile: The path to the log file
31
+ # format: The name of the log format specified ("ncsa" by default)
32
def parse_args
  # Work on a copy so ARGV itself is left untouched.
  args = ARGV.clone
  options = {:format => "ncsa", :progress => nil}

  while (arg = args.shift)
    case arg
    when "--help"
      barf_usage(0)
    when "--format", "--progress"
      # "--option VALUE": the value is the next argument.
      options[arg[2..-1].to_sym] = args.shift
    when /\A--(format|progress)=(.*)\z/m
      # "--option=VALUE": accepted as well, since the usage text has advertised it.
      options[$1.to_sym] = $2
    else
      # First positional argument is the procedure; any later one is the log file
      # (the last one wins, as before).
      key = options.key?(:procedure) ? :logfile : :procedure
      options[key] = arg
    end
  end

  # "--format" given as the very last argument leaves no format name to look up.
  barf_usage(1) if options[:format].nil?
  barf_usage(1) unless options.key?(:procedure) && options.key?(:logfile)

  options
end
59
+
60
+
61
# Entry point: resolve the log format, build the parser and run the procedure.
options = parse_args

format_string = FormatStringFinder.new.find(options[:format])
progress_meter = ProgressMeterFactory.from_options(options)
# LogParserFactory.log_parser takes positional arguments:
# (format string, log file path, progress meter).
log_parser = LogParserFactory.log_parser(format_string, options[:logfile], progress_meter)
proc_env = ProcedureEnvironment.new(log_parser)
# File.read closes the handle when done and, unlike Kernel#open, never treats a
# path beginning with "|" as a command to execute.
proc_env.eval_procedure(File.read(options[:procedure]))
@@ -0,0 +1,316 @@
1
require "date"
require "fileutils"
require "tempfile"

require 'log_element'
5
+
6
+
7
+ # A parsed entry from the log.
8
+ #
9
+ # Acts like a hash, in that you get at the log elements (e.g. "url_path", "remote_host")
10
+ # as entry[name].
11
class LogEntry
  # derivation_map: hash of derived element name => element class that can
  # compute it from its own logged value.
  def initialize(derivation_map)
    @_attributes = {}
    @_derivation_map = derivation_map
  end

  # Stores a value under the given element name.
  def []=(name, value)
    @_attributes[name] = value
  end

  # Looks up an element value. Stored values win; otherwise the value is derived
  # on demand from the element class registered for that name, or nil is
  # returned when no class can derive it.
  def [](name)
    if @_attributes.key?(name)
      @_attributes[name]
    else
      source_cls = @_derivation_map[name]
      source_cls.nil? ? nil : source_cls.derive(name, @_attributes[source_cls.name])
    end
  end

  # Bulk-inserts every pair of the given hash.
  def merge!(hsh)
    @_attributes.merge!(hsh)
  end
end
34
+
35
+
36
+ # A bare string in a log format
37
+ #
38
+ # Exposes 'regex' for consistency with LogFormatElement, but there shouldn't be anything other
39
+ # than one-to-one character matching in there.
40
class LogFormatString
  # regex: the text of this literal token as it will be spliced into the
  # pattern for a whole log line.
  def initialize(regex)
    self.regex = regex
  end

  attr_accessor :regex
end
47
+
48
+
49
+ # Represents a particular Apache log format
50
class LogFormat
  attr_accessor :format_string, :tokens

  def initialize
    @tokens = []
    @_regex = nil
  end

  # Appends a given token (a LogFormatElement or LogFormatString) to the tokens list.
  def append(token)
    @tokens << token
    # The compiled pattern is built from the token list, so a cached copy is
    # stale as soon as the list grows.
    @_regex = nil
  end

  # Returns a compiled regex to match a log line in this format.
  #
  # Only tokens that respond to :name (the LogFormatElements) are wrapped in a
  # capture group; literal strings are matched but not remembered. The result
  # is cached until the next call to append.
  def regex
    return @_regex unless @_regex.nil?

    pieces = @tokens.map do |tok|
      tok.respond_to?(:name) ? "(#{tok.regex})" : tok.regex
    end

    @_regex = Regexp.compile("^" + pieces.join + "$")
  end

  # Returns the list of LogFormatElements, in order, of the interpolated things in the format.
  #
  # For example, if the log format string were "%h %u %{Referer}i", this would return the
  # LogFormatElement instances for "%h", "%u", and "%{Referer}i".
  def elements
    @tokens.select { |tok| tok.respond_to?(:name) }
  end

  # Returns hash mapping names of elements to the element class from which they can be derived.
  def derivation_map
    hsh = {}
    elements.each do |tok|
      tok.derived_elements.each do |derived_element|
        hsh[derived_element.name] = tok.class
      end
    end

    hsh
  end
end
105
+
106
+
107
+ # Turns a string specifying an Apache log format into a LogFormat instance
108
class LogFormatFactory
  def initialize
    @element_factory = LogFormatElementFactory.new
  end

  # Constructs and returns a LogFormat instance based on the given Apache log format string.
  def from_format_string(f_string)
    logformat = LogFormat.new
    logformat.format_string = f_string

    remaining = f_string
    until remaining.empty?
      token, remaining = _shift_token(remaining)
      logformat.append(token)
    end

    logformat
  end

  # Finds the first token (a LogFormatElement or LogFormatString) in a format string.
  #
  # Returns a list containing the token and the new format string (with the characters that
  # correspond to the token removed).
  #
  # Patterns are anchored with \A so a token is only ever taken from the very
  # start of the string, even if the string contains newlines.
  def _shift_token(f_string)
    case f_string
    when /\A%%(.*)/m
      # "%%" in a format stands for ONE literal percent sign in the log line.
      [LogFormatString.new(Regexp.escape("%")), $1]
    when /\A(%[A-Za-z])(.*)/m
      # Simple element (e.g. "%h", "%u")
      [@element_factory.from_abbrev($1), $2]
    when /\A(%\{.+?\}[Ceinor])(.*)/m
      # "Contents of" element (e.g. "%{Accept}i")
      [@element_factory.from_abbrev($1), $2]
    when /\A(.+?)(%.*|\z)/m
      # Bare string up until the next %, or up until the end of the format string.
      # It is escaped because it is later spliced into a regex: an unescaped "("
      # would add a capture group and shift every element's value, and "." or
      # "[" would stop matching literally.
      [LogFormatString.new(Regexp.escape($1)), $2]
    else
      raise "Could not tokenize log format at '#{f_string}'"
    end
  end
end
146
+
147
+
148
+ # Makes log line hashes based on log file text
149
class LogLineParser
  # log_format: the LogFormat describing each line.
  # progress_meter: object whose output_progress(entry) is called per parsed entry.
  def initialize(log_format, progress_meter)
    @progress_meter = progress_meter
    @log_format = log_format

    # Looked up once here rather than once per line.
    @_derivation_map = log_format.derivation_map
    @_elements = log_format.elements
  end

  # Builds a LogEntry from one line of log text, or returns nil (after printing
  # a warning) when the line does not match the format.
  #
  # The entry is keyed by element name, plus :text holding the raw line.
  def from_text(log_text)
    matched = @log_format.regex.match(log_text)
    unless matched
      warn "Log line did not match expected format: #{log_text}"
      return nil
    end

    # Pair each element with its capture group; group 0 (the whole match) is
    # not part of captures.
    element_values = Hash[@_elements.zip(matched.captures)]

    entry = LogEntry.new(@_derivation_map)
    entry[:text] = log_text
    entry.merge!(_elements_to_hash(element_values))

    @progress_meter.output_progress(entry)
    entry
  end

  # Converts a hash of element => value into a hash of element name => value.
  def _elements_to_hash(element_values)
    element_values.inject({}) do |by_name, (element, value)|
      by_name[element.name] = value
      by_name
    end
  end

  # Merges together whatever each element's derived_values(value) returns.
  #
  # NOTE(review): no element class visible in this package defines
  # derived_values, and nothing here calls this method - confirm before use.
  def _derived_elements(element_values)
    element_values.inject({}) do |derived, (element, value)|
      derived.merge!(element.derived_values(value))
    end
  end
end
207
+
208
+
209
+ # Parses a log file given a path and a LogFormat instance
210
class LogParser
  # Initializes the parser with the path to a log file and a LogLineParser.
  def initialize(path, ll_parser)
    @path = path
    @ll_parser = ll_parser

    @_file = nil
  end

  # Returns the next entry in the log file, or nil if we've reached EOF.
  #
  # The file is opened lazily on the first call. Lines for which the line
  # parser returns nil (it warns about them itself) are skipped.
  def next_entry
    # File.open rather than Kernel#open: the latter would run a path that
    # starts with "|" as a shell command.
    @_file = File.open(@path) if @_file.nil?

    while (line_text = @_file.gets)
      logline = @ll_parser.from_text(line_text)
      return logline unless logline.nil?
    end

    nil
  end

  # Resets the LogParser's filehandle so we can start over.
  def reset
    _close_file
  end

  # Makes the LogParser close its current log file and start parsing a new one instead.
  #
  # `new_target` is a writable file object that the parser should start parsing, and if
  # in_place is true, we actually replace the contents of the current target with those
  # of the new target.
  def replace_target(new_target, in_place)
    new_target.close

    if in_place
      # Release our handle before replacing the file. @path is the path that
      # handle was opened from, and unlike @_file it is never nil.
      _close_file
      # FileUtils.mv falls back to copy-and-delete when the temp file lives on
      # a different filesystem, where a plain rename fails.
      FileUtils.mv(new_target.path, @path)
    else
      @path = new_target.path
    end

    _close_file
  end

  # Closes the current handle, if any, and forgets it so the next read reopens @path.
  def _close_file
    @_file.close unless @_file.nil? || @_file.closed?
    @_file = nil
  end
end
260
+
261
+ # Makes a LogParser given the parameters we want to work with.
262
+ #
263
+ # This is the class that most external code should instantiate to begin using this library.
264
class LogParserFactory
  # Builds a LogParser for the log file at `path`, whose lines follow the given
  # Apache log format string. `progress_meter` is handed to the line parser.
  def self.log_parser(format_string, path, progress_meter)
    # format string -> LogFormat -> LogLineParser -> LogParser
    log_format = LogFormatFactory.new.from_format_string(format_string)
    line_parser = LogLineParser.new(log_format, progress_meter)
    LogParser.new(path, line_parser)
  end
end
279
+
280
+
281
+ # Finds a named log format string in the configuration file(s)
282
class FormatStringFinder
  @@FILE_NAME = "log_formats.rb"
  @@DEFAULT_FORMATS = {
    :ncsa => %q!%h %l %u %t \"%r\" %s %b \"%{Referer}i\" \"%{User-agent}i\"!,
    :ubuntu => %q!%h %l %u %t \"%r\" %s %O \"%{Referer}i\" \"%{User-Agent}i\"!
  }

  # Finds the given format string in the built-in defaults or the configuration file(s).
  #
  # Each directory of the search path is visited in order; a readable
  # log_formats.rb found there is evaluated and may add to or change the local
  # `formats` hash. The search stops at the first directory after which the
  # name is known. Escaped quotes (\") in the stored format are returned as
  # plain quotes.
  #
  # Raises a RuntimeError if no name is given or the name is not found.
  def find(format_name)
    raise "No log format name was given" if format_name.nil?
    name_as_symbol = format_name.to_sym

    formats = @@DEFAULT_FORMATS.clone
    _search_path.each do |dir|
      config_path = File.join(dir, @@FILE_NAME)
      if File.readable?(config_path)
        # NOTE(security): the configuration file is Ruby code and is executed
        # with this process's privileges, including one found in the current
        # directory. File.read at least closes the handle.
        eval File.read(config_path)
      end

      if formats.key?(name_as_symbol)
        return formats[name_as_symbol].gsub(/\\"/, '"')
      end
    end

    raise "Failed to find the format '#{format_name}' in the search path: #{_search_path.inspect}"
  end

  # Directories searched for log_formats.rb, in priority order. The per-user
  # directory is left out when HOME is not set, since there is nothing to join
  # it onto.
  def _search_path
    dirs = [".", "./etc"]
    home = ENV["HOME"]
    dirs << File.join(home, ".apachecrunch") unless home.nil?
    dirs << "/etc/apachecrunch"
    dirs
  end
end
@@ -0,0 +1,297 @@
1
+ # Converts a string to an integer
2
class IntegerCast
  class << self
    # Returns string_value.to_i.
    def cast(string_value)
      string_value.to_i
    end
  end
end
7
+
8
+
9
+ # Converts a CLF-formatted string to an integer
10
+ #
11
+ # "CLF-formatted" means that if the value is 0, the string will be a single hyphen instead of
12
+ # a number. Like %b, for instance.
13
class CLFIntegerCast
  # A lone hyphen means zero; anything else goes through to_i.
  def self.cast(string_value)
    string_value == "-" ? 0 : string_value.to_i
  end
end
21
+
22
+
23
+ # An element in a log format. Abstract from which all elements inherit.
24
+ #
25
+ # Exposes:
26
+ # abbrev: The Apache abbreviation for the element (such as "%h" or "%u" or "%{Referer}i")
27
+ # name: A short name for the element (such as "remote_host", "remote_user", or "reqheader_referer")
28
+ # regex: A regex that should match such an element ("[A-Za-z0-9.-]+", "[^:]+", ".+")
29
+ #
30
+ # If '_caster' is not nil, it should be a class with a method called "cast" that
31
+ # transforms a string to the appropriate data type or format for consumption.
32
+ # For example, the IntegerCast class transforms "562" to 562. The correct cast
33
+ # of a string can then be performed by passing that string to this LogFormatElement
34
+ # instance's "cast" method.
35
+ #
36
+ # 'derive' and 'derived_elements' manage elements that can be derived from the instance's value. See
37
+ # ReqFirstlineElement for an example.
38
class LogFormatElement
  class << self
    # Class-level defaults for abbrev, name and regex; every new instance
    # starts out with these. Note that this `name` accessor replaces the
    # built-in Class#name for this class and its subclasses.
    attr_accessor :abbrev, :name, :regex
    # The caster class, read by instances through self.class._caster.
    attr_accessor :_caster
  end

  @_caster = nil

  attr_accessor :abbrev, :name, :regex

  def initialize
    cls = self.class
    @abbrev = cls.abbrev
    @name = cls.name
    @regex = cls.regex
  end

  # Casts a string found in the log to the correct type using the class's
  # caster, or returns the string untouched when the class has none.
  def cast(string_value)
    caster = self.class._caster
    caster.nil? ? string_value : caster.cast(string_value)
  end

  # Derives the named element (e.g. "url_path") from a given value for this one.
  # Element classes that offer derived elements override this.
  def self.derive(name, our_own_value)
    raise NotImplementedError
  end

  # Returns a list of the element classes that can be derived from this one
  # (none by default).
  def derived_elements
    []
  end
end
78
+
79
+
80
class RemoteHostElement < LogFormatElement
  # Class-level defaults picked up by each new instance.
  @abbrev = "%h"
  @name = :remote_host
  # Hostnames and IPv4 addresses, plus ":" so that IPv6 client addresses
  # (e.g. "::1", "2001:db8::1") are accepted as well.
  @regex = %q![A-Za-z0-9.:-]+!
end
85
+
86
+
87
class LogNameElement < LogFormatElement
  # "%l": any run of non-whitespace characters.
  @regex = '\S+'
  @name = :log_name
  @abbrev = "%l"
end
92
+
93
+
94
class RemoteUserElement < LogFormatElement
  # "%u": one or more characters other than a colon.
  @regex = '[^:]+'
  @name = :remote_user
  @abbrev = "%u"
end
99
+
100
+
101
class TimeElement < LogFormatElement
  # "%t": a bracketed timestamp such as [09/Jul/2011:12:34:56 -0400].
  @regex = '\[\d\d/[A-Za-z]{3}/\d\d\d\d:\d\d:\d\d:\d\d [-+]\d\d\d\d\]'
  @name = :time
  @abbrev = "%t"
end
106
+
107
+
108
class ReqFirstlineElement < LogFormatElement
  @abbrev = "%r"
  @name = :req_firstline
  @regex = %q![^"]+!

  # Compiled lazily on first use and then kept on the class.
  @_derivation_regex = nil

  # Derives one of req_method, url_path, query_string or protocol from the
  # request's first line (e.g. "GET /a/b?x=1 HTTP/1.1").
  #
  # Returns nil when the line does not have that shape or `name` is not one of
  # the derivable elements.
  def self.derive(name, our_own_value)
    # "\\s" keeps the backslash so the regex engine sees the whitespace class;
    # a bare "\s" inside a double-quoted string is just a space character.
    @_derivation_regex ||= Regexp.compile(
      "^(#{ReqMethodElement.regex})\\s+(#{UrlPathElement.regex})" \
      "(#{QueryStringElement.regex})\\s+(#{ProtocolElement.regex})$")

    parts = @_derivation_regex.match(our_own_value)
    return nil if parts.nil?

    hsh = {
      ReqMethodElement.name => parts[1],
      UrlPathElement.name => parts[2],
      QueryStringElement.name => parts[3],
      ProtocolElement.name => parts[4]
    }
    hsh[name]
  end

  # The element classes whose values can be derived from a request first line.
  def derived_elements
    [ReqMethodElement, UrlPathElement, QueryStringElement, ProtocolElement]
  end
end
135
+
136
+
137
class StatusElement < LogFormatElement
  # "%s": a run of digits, or a single hyphen.
  @regex = '\d+|-'
  @name = :status
  @abbrev = "%s"
end
142
+
143
+
144
# "%b": bytes sent, where a hyphen is logged instead of a number when nothing
# was sent.
#
# This class used to be defined twice in a row; the second definition reopened
# the first and overrode its regex and caster, so only the second one's values
# ever applied. They are merged here into the single effective definition.
class BytesSentElement < LogFormatElement
  @abbrev = "%b"
  @name = :bytes_sent
  @regex = %q![\d-]+!

  # Must be the class-level instance variable: LogFormatElement#cast reads it
  # through self.class._caster and never looks at a @@class variable.
  @_caster = CLFIntegerCast
end
160
+
161
+
162
class BytesSentWithHeadersElement < LogFormatElement
  @abbrev = "%O"
  @name = :bytes_sent_with_headers
  @regex = %q!\d+!

  # Must be the class-level instance variable: LogFormatElement#cast reads it
  # through self.class._caster and never looks at a @@class variable.
  @_caster = IntegerCast
end
169
+
170
+
171
class ServeTimeMicroElement < LogFormatElement
  @abbrev = "%D"
  @name = :serve_time_micro
  @regex = %q!\d+!

  # Must be the class-level instance variable: LogFormatElement#cast reads it
  # through self.class._caster and never looks at a @@class variable.
  @_caster = IntegerCast
end
178
+
179
+
180
class UrlPathElement < LogFormatElement
  # "%U": a slash followed by anything up to (not including) a question mark.
  @regex = '/[^?]*'
  @name = :url_path
  @abbrev = "%U"
end
185
+
186
+
187
class QueryStringElement < LogFormatElement
  # "%q": an optional question mark followed by non-whitespace; may be empty.
  @regex = '\??\S*'
  @name = :query_string
  @abbrev = "%q"
end
192
+
193
+
194
class ReqMethodElement < LogFormatElement
  # "%m": one or more upper-case ASCII letters.
  @regex = '[A-Z]+'
  @name = :req_method
  @abbrev = "%m"
end
199
+
200
+
201
class ProtocolElement < LogFormatElement
  # "%H": any run of non-whitespace characters.
  @regex = '\S+'
  @name = :protocol
  @abbrev = "%H"
end
206
+
207
+
208
# Has no class-level defaults: abbrev, name and regex are assigned on each
# instance after it is created.
class ReqheaderElement < LogFormatElement; end
210
+
211
+
212
# Has no class-level defaults: abbrev, name and regex are assigned on each
# instance after it is created.
class RegexElement < LogFormatElement; end
214
+
215
+
216
+ # Finds log format elements given information about them.
217
class ElementDictionary
  # Every element class that can be looked up by its abbreviation. Each class
  # is listed once; lookups return the first match.
  @@_ELEMENTS = [
    RemoteHostElement,
    LogNameElement,
    RemoteUserElement,
    TimeElement,
    ReqFirstlineElement,
    StatusElement,
    BytesSentElement,
    BytesSentWithHeadersElement,
    ServeTimeMicroElement,
    UrlPathElement,
    QueryStringElement,
    ReqMethodElement,
    ProtocolElement
  ]

  # Returns the LogFormatElement subclass with the given format-string abbreviation.
  #
  # If none exists, returns nil.
  def self.find_by_abbrev(abbrev)
    @@_ELEMENTS.find { |element| element.abbrev == abbrev }
  end
end
248
+
249
+
250
+ # Generates LogFormatElement instances.
251
+ #
252
+ # This class does the work of figuring out which LogFormatElement subclass to make and makes it.
253
class LogFormatElementFactory
  # Takes an Apache log format abbreviation and returns a matching element instance.
  #
  # Abbreviations known to the ElementDictionary win; otherwise "%{Header}i"
  # and "%{name:regex}r" forms are built on the fly. Anything else raises.
  def from_abbrev(abbrev)
    known_cls = ElementDictionary.find_by_abbrev(abbrev)
    return known_cls.new if known_cls

    case abbrev
    when /^%\{([A-Za-z0-9-]+)\}i/
      # HTTP request header
      _reqheader_element(abbrev, $1)
    when /^%\{(.*?):([^}]+)\}r/
      # Arbitrary regex
      _regex_element(abbrev, $1, $2)
    else
      raise "Unknown element format '#{abbrev}'"
    end
  end

  # Builds the element for an HTTP request header; its value is anything up to
  # the next double quote.
  def _reqheader_element(abbrev, header_name)
    header_element = ReqheaderElement.new
    header_element.name = _header_name_to_element_name(header_name)
    header_element.regex = '[^"]*'
    header_element.abbrev = abbrev
    header_element
  end

  # Builds the element for a caller-supplied regex, named "regex_<regex_name>".
  def _regex_element(abbrev, regex_name, regex)
    custom_element = RegexElement.new
    custom_element.name = "regex_#{regex_name}".to_sym
    custom_element.regex = regex
    custom_element.abbrev = abbrev
    custom_element
  end

  # "User-Agent" -> :reqheader_user_agent
  def _header_name_to_element_name(header_name)
    normalized = header_name.downcase.gsub("-", "_")
    "reqheader_#{normalized}".to_sym
  end
end
@@ -0,0 +1,308 @@
1
+ # Abstract for a procedure routine.
2
class ProcedureRoutine
  def initialize(log_parser)
    @_current_entry = nil
    @_log_parser = log_parser
  end

  # Any unknown method name is looked up as an element of the current log
  # entry, which is what lets a DSL block say `status` or `bytes_sent`.
  # Arguments are ignored.
  def method_missing(sym, *args)
    @_current_entry[sym]
  end

  # Runs the routine with the given block. Abstract: subclasses override this.
  def execute(&blk)
    raise "Not implemented"
  end

  # Hook run after the routine completes and before its result is returned;
  # by default it resets the log parser.
  def finish
    @_log_parser.reset
  end
end
26
+
27
+
28
+ # DSL routine that returns the number of log entries where the block evaluates to true
29
class CountWhere < ProcedureRoutine
  # Returns how many entries make the block evaluate to a truthy value.
  def execute(&blk)
    matches = 0
    while (@_current_entry = @_log_parser.next_entry)
      matches += 1 if instance_eval(&blk)
    end
    matches
  end
end
40
+
41
+
42
+ # DSL routine that executes the block for every log entry
43
class Each < ProcedureRoutine
  # Evaluates the block once per log entry, in the routine's own context.
  # Returns nil.
  def execute(&blk)
    instance_eval(&blk) while (@_current_entry = @_log_parser.next_entry)
  end
end
50
+
51
+
52
+ # DSL routine(s) that filter(s) for entries for which the given block evaluates to true
53
+ #
54
+ # This can be called as 'filter()', which means the filtering happens in a temporary file, or
55
+ # as 'filter(path)', which means the filtering happens in the given file. It can also be called
56
+ # as 'filter!()', which means the filtering happens in place, clobbering what's in apachecrunch's
57
+ # target file.
58
class Filter < ProcedureRoutine
  # Writes the raw text of every entry for which the block is truthy to the
  # results file.
  #
  # path: where to write the results, or nil for a temporary file.
  # in_place: remembered for finish, which passes it to the log parser.
  def execute(path=nil, in_place=false, &blk)
    @_in_place = in_place
    @_results_file = _make_results_file(path, in_place)

    while (@_current_entry = @_log_parser.next_entry)
      @_results_file.write(@_current_entry[:text]) if instance_eval(&blk)
    end
  end

  # Hands the results file to the log parser, which closes it and reads from
  # it (or from the original path, when filtering in place) afterwards.
  def finish
    @_log_parser.replace_target(@_results_file, @_in_place)
  end

  # Returns a writable file object to which the results of the filter should be written.
  def _make_results_file(path, in_place)
    # If no path passed (this includes the case where the filter is being performed
    # in place), we want a temp file.
    return Tempfile.new("apachecrunch") if path.nil?

    # File.open rather than Kernel#open: the latter would run a path that
    # starts with "|" as a shell command.
    File.open(path, "w")
  end
end
85
+
86
+ # DSL routine that returns the count of entries with each found value of the given block
87
+ #
88
+ # You might for instance run this with the block { status }, and you'd get back something like
89
+ # {"200" => 941, "301" => 41, "404" => 2, "500" => 0}
90
class CountBy < ProcedureRoutine
  # Returns a hash mapping each value the block produced to the number of
  # entries that produced it.
  def execute(&blk)
    tallies = {}
    while (@_current_entry = @_log_parser.next_entry)
      key = instance_eval(&blk)
      tallies[key] = tallies.fetch(key, 0) + 1
    end
    tallies
  end
end
104
+
105
+
106
+ # DSL routine that finds the distribution of (numeric) values to which the given block evaluates
107
+ #
108
+ # For example,
109
+ #
110
+ # distribution 100 do
111
+ # bytes_sent
112
+ # end
113
+ #
114
+ # would return a hash with keys from 0 up by multiples of 100, the value of each being the number
115
+ # of entries for which bytes_sent is between that key and the next key.
116
class Distribution < ProcedureRoutine
  # Returns a hash of bucket start => number of entries whose block value falls
  # in that bucket. Every bucket from 0 up to the highest one seen is present,
  # with 0 for the empty ones.
  #
  # Returns an empty hash when the log has no entries.
  def execute(bucket_width, &blk)
    dist = {}
    while (@_current_entry = @_log_parser.next_entry)
      k = _key_for(instance_eval(&blk), bucket_width)
      dist[k] = dist.fetch(k, 0) + 1
    end

    # Nothing to backfill - and no maximum key to step up to - without entries.
    return dist if dist.empty?

    # Backfill keys for which we didn't find a value
    0.step(dist.keys.max, bucket_width) do |k|
      dist[k] = 0 unless dist.key?(k)
    end

    dist
  end

  # Determines the key for the distribution hash given the value and step:
  # the value (via to_i) rounded down to a multiple of bucket_width.
  def _key_for(val, bucket_width)
    (val.to_i / bucket_width) * bucket_width
  end
end
142
+
143
+
144
+ # Same as Distribution, but the buckets get exponentially wider
145
class LogDistribution < ProcedureRoutine
  # Returns a hash of bucket start => number of entries, where bucket starts
  # are powers of width_base. Values of zero or less are counted under key 0.
  # Empty buckets between the smallest and largest power seen are filled with 0.
  #
  # Raises ArgumentError unless width_base is greater than 1, since buckets
  # could not grow otherwise.
  def execute(width_base, &blk)
    raise ArgumentError, "width_base must be greater than 1" unless width_base > 1

    dist = {}
    while (@_current_entry = @_log_parser.next_entry)
      k = _key_for(instance_eval(&blk), width_base)
      dist[k] = dist.fetch(k, 0) + 1
    end

    # Backfill keys for which we didn't find a value. Key 0 is left out of the
    # walk: multiplying it by the base would never move it forward.
    power_keys = dist.keys.select { |key| key > 0 }
    unless power_keys.empty?
      max_key = power_keys.max
      k = power_keys.min * width_base
      while k < max_key
        dist[k] = 0 unless dist.key?(k)
        k *= width_base
      end
    end

    dist
  end

  # Determines the key for the distribution hash given the value and logarithmic base for
  # the bucket width: the largest power of width_base that is <= val.
  def _key_for(val, width_base)
    # Values read from a log entry are strings unless the caller converted them.
    val = val.to_f if val.is_a?(String)
    # The logarithm is undefined at zero and below.
    return 0 if val <= 0

    exp = (Math.log(val) / Math.log(width_base)).floor
    # The float quotient can land just under or over an exact power
    # (log(1000)/log(10) is 2.9999999999999996), so correct it exactly.
    exp += 1 while width_base ** (exp + 1) <= val
    exp -= 1 while width_base ** exp > val
    width_base ** exp
  end
end
175
+
176
+
177
+ # DSL routine that determines a confidence interval for the values to which the block evaluates
178
+ #
179
+ # For example,
180
+ #
181
+ # confidence_interval 95 do
182
+ # time_to_serve
183
+ # end
184
+ #
185
+ # would return two numbers, the lower and upper bound of a 95% confidence interval for the values
186
+ # of time_to_serve.
187
class ConfidenceInterval < ProcedureRoutine
  # Returns [lower, upper]: the bounds left after trimming an equal number of
  # the smallest and largest block values so that roughly `confidence` percent
  # of them remain. Returns [nil, nil] when the log has no entries.
  def execute(confidence, &blk)
    # Build a list of all the values found
    values = []
    while (@_current_entry = @_log_parser.next_entry)
      values << instance_eval(&blk)
    end
    values.sort!

    # Determine how many values are outside the bounds of the CI
    count_outside = (values.length * (1.0 - confidence / 100.0)).to_i

    # Half of them are cut from each end. The upper bound is counted from the
    # back: index -1 is the largest value, so dropping `trim` values from the
    # top lands on -trim - 1. (Index -trim would drop one too few, and with
    # trim == 0 would point at the smallest value.)
    trim = count_outside / 2
    return values[trim], values[-trim - 1]
  end
end
203
+
204
+
205
+ # DSL routine that finds the most common n values for the given block.
206
+ #
207
+ # Returns a list of lists, each of which is [value, count]. This list is sorted by count.
208
class MostCommon < ProcedureRoutine
  # Returns up to n [value, count] pairs for the most frequent block values,
  # ordered by descending count.
  def execute(n, &blk)
    counts = CountBy.new(@_log_parser).execute(&blk)

    # Sort the [value, count] pairs by descending count
    ranked = counts.sort_by { |_val, count| -count }

    # Exclusive range: exactly n entries. (0..n would return n + 1.)
    ranked[0...n]
  end
end
222
+
223
+
224
+ # The environment in which a procedure file is evaluated.
225
+ #
226
+ # A procedure file is some ruby code that uses our DSL.
227
class ProcedureEnvironment
  def initialize(log_parser)
    @_log_parser = log_parser
  end

  # Evaluates the given string as a procedure in our DSL. The code runs with
  # this object as self, so the routines below are callable by bare name.
  def eval_procedure(proc_string)
    eval(proc_string)
  end

  # DSL routine 'count_where'
  def count_where(&blk)
    _run_routine(CountWhere, &blk)
  end

  # DSL routine 'filter!' - no path, in place. Returns nil.
  def filter!(&blk)
    _run_routine(Filter, nil, true, &blk)
    nil
  end

  # DSL routine 'filter' - into target_path, or a temp file when nil. Returns nil.
  def filter(target_path=nil, &blk)
    _run_routine(Filter, target_path, &blk)
    nil
  end

  # DSL routine 'each'. Returns nil.
  def each(&blk)
    _run_routine(Each, &blk)
    nil
  end

  # DSL routine 'count_by'
  def count_by(&blk)
    _run_routine(CountBy, &blk)
  end

  # DSL routine 'distribution'
  def distribution(bucket_width, &blk)
    _run_routine(Distribution, bucket_width, &blk)
  end

  # DSL routine 'log_distribution'
  def log_distribution(width_base, &blk)
    _run_routine(LogDistribution, width_base, &blk)
  end

  # DSL routine 'confidence_interval'
  def confidence_interval(confidence, &blk)
    _run_routine(ConfidenceInterval, confidence, &blk)
  end

  # DSL routine 'most_common'
  def most_common(n, &blk)
    _run_routine(MostCommon, n, &blk)
  end

  # Shared life cycle of every routine: build it around the log parser, execute
  # it with the given arguments and block, let it finish, and return what
  # execute returned.
  def _run_routine(routine_cls, *args, &blk)
    routine = routine_cls.new(@_log_parser)
    result = routine.execute(*args, &blk)
    routine.finish
    result
  end
end
data/lib/progress.rb ADDED
@@ -0,0 +1,65 @@
1
# Base class for progress meters; holds the count of entries seen so far.
class ProgressMeter
  def initialize
    # Starts at zero; the counting subclasses increment it per entry.
    @_entry_count = 0
  end
end
6
+
7
+
8
+ # Progress meter that prints the number of entries parsed every (n) lines.
9
class EntryCountProgressMeter < ProgressMeter
  def initialize
    # How many entries go by between two lines of output.
    @_period = 10000
    super
  end

  # Counts the entry and, on every @_period-th one, prints the running total.
  #
  # 'entry' is the latest parsed log entry; this meter does not look at it.
  def output_progress(entry)
    @_entry_count += 1
    return unless (@_entry_count % @_period).zero?

    puts "Processed %d entries" % [@_entry_count]
  end
end
27
+
28
# Progress meter that prints the time of the latest entry parsed every (n) lines.
class TimeProgressMeter < ProgressMeter
  def initialize
    # 'period' is how many entries we wait between printing output. So if 'period' is 10 000,
    # we'll print output every 10 000 lines.
    @_period = 10000
    super
  end

  # Outputs the time of the latest entry parsed (every once in a while).
  #
  # 'entry' should be the latest log entry to be parsed.
  def output_progress(entry)
    @_entry_count += 1
    if @_entry_count % @_period == 0
      # Entries are keyed by element name, and element names are symbols
      # (:time); the string key "time" never matched anything.
      puts "Processed through %s" % [entry[:time]]
    end
  end
end
46
+
47
# Progress meter that prints nothing.
class NullProgressMeter < ProgressMeter
  # Accepts the entry and does nothing with it.
  def output_progress(entry); end
end
51
+
52
+
53
+ # Constructs progress meters that output progress info to the user.
54
class ProgressMeterFactory
  # Builds the progress meter selected by options[:progress]: "entry" and
  # "time" pick the matching meter, anything else (including nil) gets the
  # meter that prints nothing.
  def self.from_options(options)
    meter_cls = case options[:progress]
                when "entry" then EntryCountProgressMeter
                when "time" then TimeProgressMeter
                else NullProgressMeter
                end

    meter_cls.new
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: apachecrunch
3
+ version: !ruby/object:Gem::Version
4
+ hash: 9
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ version: "0.1"
10
+ platform: ruby
11
+ authors:
12
+ - Dan Slimmon
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-07-09 00:00:00 -04:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: |-
22
+ Apache Crunch is an analysis tool for Apache logs. You write little scripts
23
+ to do the analysis, using our DSL to make the procedure as simple and readable
24
+ as possible. See our homepage for more details.
25
+ email: dan@danslimmon.com
26
+ executables:
27
+ - apachecrunch
28
+ extensions: []
29
+
30
+ extra_rdoc_files: []
31
+
32
+ files:
33
+ - lib/apachecrunch.rb
34
+ - lib/log_element.rb
35
+ - lib/procedure_dsl.rb
36
+ - lib/progress.rb
37
+ - bin/apachecrunch
38
+ - LICENSE
39
+ has_rdoc: true
40
+ homepage: https://github.com/danslimmon/apachecrunch/
41
+ licenses:
42
+ - Creative Commons Share-Alike
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ hash: 3
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ requirements: []
67
+
68
+ rubyforge_project:
69
+ rubygems_version: 1.6.2
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: Apache log analysis tool designed for ease of use
73
+ test_files: []
74
+