apachecrunch 0.4 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/apachecrunch CHANGED
@@ -66,5 +66,5 @@ log_parser = ApacheCrunch::LogParserFactory.log_parser(
66
66
  format_def,
67
67
  options[:logfile],
68
68
  progress_meter)
69
- proc_env = ProcedureEnvironment.new(log_parser)
69
+ proc_env = ApacheCrunch::ProcedureEnvironment.new(log_parser)
70
70
  proc_env.eval_procedure(open(options[:procedure]).read())
data/lib/apachecrunch.rb CHANGED
@@ -2,21 +2,11 @@ require "date"
2
2
  require "tempfile"
3
3
 
4
4
  require 'config'
5
+ require 'progress'
5
6
  require 'entry'
6
7
  require 'format'
8
+ require 'derivation'
9
+ require 'format_token'
7
10
  require 'log_parser'
8
- require 'log_element'
9
-
10
- class ApacheCrunch
11
- # A bare string in a log format
12
- #
13
- # Exposes 'regex' for consistency with LogFormatElement, but there shouldn't be anything other
14
- # than one-to-one character matching in there.
15
- class LogFormatString
16
- attr_accessor :regex
17
-
18
- def initialize(regex)
19
- @regex = regex
20
- end
21
- end
22
- end
11
+ require 'element'
12
+ require 'element_value_fetcher'
data/lib/cast.rb ADDED
@@ -0,0 +1,21 @@
1
+ class ApacheCrunch
2
+ # Converts a string to an integer
3
+ class IntegerCast
4
+ def cast(string_value)
5
+ string_value.to_i
6
+ end
7
+ end
8
+
9
+ # Converts a CLF-formatted string to an integer
10
+ #
11
+ # "CLF-formatted" means that if the value is 0, the string will be a single hyphen instead of
12
+ # a number. Like %b, for instance.
13
+ class CLFIntegerCast
14
+ def cast(string_value)
15
+ if string_value == "-"
16
+ return 0
17
+ end
18
+ string_value.to_i
19
+ end
20
+ end
21
+ end
data/lib/derivation.rb ADDED
@@ -0,0 +1,113 @@
1
+ class ApacheCrunch
2
+ # Abstract for classes that define how to obtain a given element from the value of another.
3
+ class DerivationRule
4
+ # Returns the name of the element from which this rule derives values
5
+ def source_name
6
+ raise NotImplementedError
7
+ end
8
+
9
+ # Derives the given derivable element from the given element value
10
+ def derive(name, source_value)
11
+ raise NotImplementedError
12
+ end
13
+ end
14
+
15
+
16
+ # Dummy rule that doesn't derive anything
17
+ class NullDerivationRule
18
+ def source_name; nil; end
19
+ def target_names; []; end
20
+ def derive(name, source_value); nil; end
21
+ end
22
+
23
+
24
+ # Derivation rule for elements derived from TimeToken
25
+ class TimeDerivationRule < DerivationRule
26
+ def initialize
27
+ @_derivation_regex = nil
28
+ @_month_map = {"Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4,
29
+ "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8,
30
+ "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12}
31
+ end
32
+
33
+ def source_name
34
+ :time
35
+ end
36
+
37
+ def target_names
38
+ [:year, :month, :day, :hour, :minute, :second]
39
+ end
40
+
41
+ def derive(name, source_value)
42
+ if @_derivation_regex.nil?
43
+ @_derivation_regex = Regexp.compile(%q!^\[(\d\d)/([A-Za-z]{3})/(\d\d\d\d):(\d\d):(\d\d):(\d\d)!)
44
+ end
45
+
46
+ hsh = {}
47
+ if source_value =~ @_derivation_regex
48
+ hsh[:year] = $3.to_i
49
+ hsh[:month] = @_month_map[$2]
50
+ hsh[:day] = $1.to_i
51
+
52
+ hsh[:hour] = $4.to_i
53
+ hsh[:minute] = $5.to_i
54
+ hsh[:second] = $6.to_i
55
+ end
56
+
57
+ hsh[name]
58
+ end
59
+ end
60
+
61
+ class ReqFirstlineDerivationRule
62
+ def initialize
63
+ @_derivation_regex = nil
64
+ end
65
+
66
+ def source_name
67
+ :req_firstline
68
+ end
69
+
70
+ def target_names
71
+ [ReqMethodTokenDefinition.name, UrlPathTokenDefinition.name, QueryStringTokenDefinition.name, ProtocolTokenDefinition.name]
72
+ end
73
+
74
+ def derive(name, source_value)
75
+ if @_derivation_regex.nil?
76
+ @_derivation_regex = Regexp.compile("^(#{ReqMethodTokenDefinition.regex})\s+(#{UrlPathTokenDefinition.regex})(#{QueryStringTokenDefinition.regex})\s+(#{ProtocolTokenDefinition.regex})$")
77
+ end
78
+
79
+ hsh = {}
80
+ if source_value =~ @_derivation_regex
81
+ hsh[ReqMethodTokenDefinition.name] = $1
82
+ hsh[UrlPathTokenDefinition.name] = $2
83
+ hsh[QueryStringTokenDefinition.name] = $3
84
+ hsh[ProtocolTokenDefinition.name] = $4
85
+ end
86
+
87
+ hsh[name]
88
+ end
89
+ end
90
+
91
+ class DerivationRuleFinder
92
+ @_rule_map = nil
93
+ @_rules = [NullDerivationRule, TimeDerivationRule, ReqFirstlineDerivationRule]
94
+
95
+ # Returns a derivation rule that derives element with the given name
96
+ def self.find(element_name)
97
+ @_rule_map = self._build_rule_map if @_rule_map.nil?
98
+ @_rule_map[element_name]
99
+ end
100
+
101
+ def self._build_rule_map
102
+ hsh = {}
103
+ @_rules.each do |rule_cls|
104
+ r = rule_cls.new
105
+ r.target_names.each do |target_element|
106
+ hsh[target_element] = r
107
+ end
108
+ end
109
+
110
+ hsh
111
+ end
112
+ end
113
+ end
data/lib/element.rb ADDED
@@ -0,0 +1,16 @@
1
+ class ApacheCrunch
2
+ class Element
3
+ attr_accessor :token, :value
4
+
5
+ def populate!(token, value)
6
+ @token = token
7
+ @value = value
8
+ end
9
+
10
+ def name; @token.name; end
11
+
12
+ def derivation_rule
13
+ @token.derivation_rule
14
+ end
15
+ end
16
+ end
data/lib/element_value_fetcher.rb ADDED
@@ -0,0 +1,72 @@
1
+ class ApacheCrunch
2
+ # Finds a value from an Entry. Value may be directly from log or derived.
3
+ class ElementValueFetcher
4
+ def initialize
5
+ @_RawValueFetcher = RawValueFetcher
6
+ @_DerivedValueFetcher = DerivedValueFetcher
7
+ end
8
+
9
+ # Handles dependency injection
10
+ def dep_inject!(raw_value_fetcher_cls, derived_value_fetcher_cls)
11
+ @_RawValueFetcher = raw_value_fetcher_cls
12
+ @_DerivedValueFetcher = derived_value_fetcher_cls
13
+ end
14
+
15
+ # Returns the value of the element with the given name from the Entry instance.
16
+ #
17
+ # So element_name might be :minute or :reqheader_firstline for instance.
18
+ def fetch(entry, element_name)
19
+ v = @_RawValueFetcher.new.fetch(entry, element_name)
20
+ return v unless v.nil?
21
+
22
+ v = @_DerivedValueFetcher.new.fetch(entry, element_name)
23
+ return v unless v.nil?
24
+
25
+ nil
26
+ end
27
+ end
28
+
29
+
30
+ # Returns the value of an element that was captured straight from the log.
31
+ class RawValueFetcher
32
+ # Returns the value of the Element with the given name in the given Entry.
33
+ #
34
+ # Only works for elements based on tokens that we parsed directly into the Entry. If no
35
+ # matching element is found, we return nil.
36
+ def fetch(entry, element_name)
37
+ element = entry.captured_elements[element_name]
38
+ return nil if element.nil?
39
+ element.value
40
+ end
41
+ end
42
+
43
+
44
+ # Returns the value of an element derived from one captured directly from the log.
45
+ class DerivedValueFetcher
46
+ def initialize
47
+ @_DerivationRuleFinder = DerivationRuleFinder
48
+ end
49
+
50
+ # Handles dependency injection
51
+ def dep_inject!(derivation_rule_finder_cls)
52
+ @_DerivationRuleFinder = derivation_rule_finder_cls
53
+ end
54
+
55
+ # Returns the value for the given name by deriving from an Element in the Entry.
56
+ #
57
+ # Returns nil if no such value can be derived.
58
+ def fetch(entry, element_name)
59
+ # Find the derivation rule that will get us the element we want
60
+ rule = @_DerivationRuleFinder.find(element_name)
61
+ return nil if rule.nil?
62
+
63
+ # Get the value of the element from which we're deriving
64
+ source_element_name = rule.source_name
65
+ source_element = entry.captured_elements[source_element_name]
66
+ return nil if source_element.nil?
67
+
68
+ # Do the derivation
69
+ rule.derive(element_name, source_element.value)
70
+ end
71
+ end
72
+ end
data/lib/entry.rb CHANGED
@@ -1,90 +1,100 @@
1
+ require 'element'
2
+ require 'progress'
3
+
1
4
  class ApacheCrunch
2
5
  # A parsed entry from the log.
3
6
  #
4
7
  # Acts like a hash, in that you get at the log elements (e.g. "url_path", "remote_host") by
5
8
  # as entry[name].
6
9
  class Entry
7
- def initialize(derivation_map)
8
- @_derivation_map = derivation_map
9
- @_attributes = {}
10
- end
10
+ attr_accessor :captured_elements
11
11
 
12
- def []=(name, value)
13
- @_attributes[name] = value
14
- end
12
+ def initialize
13
+ @captured_elements = {}
14
+ @_value_fetcher = nil
15
15
 
16
- def [](name)
17
- return @_attributes[name] if @_attributes.key?(name)
18
-
19
- derived_from_cls = @_derivation_map[name]
20
- return nil if derived_from_cls.nil?
16
+ @_ElementValueFetcher = ElementValueFetcher
17
+ end
21
18
 
22
- derived_from_cls.derive(name, @_attributes[derived_from_cls.name])
19
+ def dep_inject!(element_value_fetcher_cls)
20
+ @_ElementValueFetcher = element_value-fetcher_cls
23
21
  end
24
22
 
25
- def merge!(hsh)
26
- @_attributes.merge!(hsh)
23
+ def fetch(name)
24
+ @_value_fetcher = @_ElementValueFetcher.new if @_value_fetcher.nil?
25
+ @_value_fetcher.fetch(self, name)
27
26
  end
28
27
  end
29
28
 
30
29
 
31
30
  # Makes Entry instances based on log file text
32
31
  class EntryParser
33
- # Initializes the instance given a LogFormat instance
34
- def initialize(log_format, progress_meter)
35
- @log_format = log_format
36
- @progress_meter = progress_meter
32
+ # Initializes the instance given a ProgressMeter instance
33
+ def initialize
34
+ @_Entry = Entry
35
+ @_Element = Element
36
+
37
+ @_progress_meter = NullProgressMeter.new
38
+ @_regex = nil
39
+ end
37
40
 
38
- @_elements = log_format.elements
39
- @_derivation_map = log_format.derivation_map
41
+ # Handles dependency injection
42
+ def dep_inject!(entry_cls, element_cls)
43
+ @_Entry = entry_cls
44
+ @_Element = element_cls
40
45
  end
41
46
 
42
- # Returns a log line hash built from a line of text, or nil if the line was malformatted
47
+ # Applies the given ProgressMeter to the parser so that it will output progress.
43
48
  #
44
- # The keys of the hash are names of LogFormatElements (e.g. "remote_host", "reqheader_referer")
45
- def from_text(log_text)
46
- match = (log_text =~ @log_format.regex)
49
+ # The meter's output_progress method will get called every time we finish parsing
50
+ # a log entry.
51
+ def add_progress_meter!(meter)
52
+ @_progress_meter = meter
53
+ end
54
+
55
+ # Returns an Entry instance built from a line of text, or nil if the line was malformatted
56
+ def parse(format, log_text)
57
+ @_regex = _build_regex(format) if @_regex.nil?
58
+
59
+ match = (log_text =~ @_regex)
47
60
  if match.nil?
48
- warn "Log line did not match expected format: #{log_text}"
61
+ warn "Log line did not match expected format: #{log_text.rstrip}"
49
62
  return nil
50
63
  end
51
-
52
- # Make a hash mapping all parsed elements to their values in the entry
64
+
53
65
  match_groups = Regexp.last_match.to_a
54
66
  match_groups.shift # First value is the whole matched string, which we do not want
55
- element_values = Hash[*@_elements.zip(match_groups).flatten]
56
-
57
- # Start building the return value
58
- entry = Entry.new(@_derivation_map)
59
- entry[:text] = log_text
60
- # Insert all the elements specified in the LogFormat
61
- entry.merge!(_elements_to_hash(element_values))
62
67
 
63
- @progress_meter.output_progress(entry)
64
- entry
65
- end
66
-
67
- # Returns a hash of "element name" => value pairs based on a hash of element => value pairs.
68
- def _elements_to_hash(element_values)
69
- hsh = {}
70
- element_values.each_pair do |element, value|
71
- hsh[element.name] = value
68
+ entry = @_Entry.new
69
+ format.captured_tokens.each_with_index do |tok,i|
70
+ element = Element.new
71
+ element.populate!(tok, match_groups[i])
72
+ entry.captured_elements[tok.name] = element
72
73
  end
73
74
 
74
- hsh
75
+ # Add the full text of the log entry to the Entry instance as well.
76
+ text_element = Element.new
77
+ text_element.populate!(StringToken.new, log_text)
78
+ entry.captured_elements[:text] = text_element
79
+
80
+ @_progress_meter.output_progress(entry)
81
+ entry
75
82
  end
76
83
 
77
- # Returns hash of derived "element name" => value pairs from a hash of element => value pairs.
78
- #
79
- # That is, we go through the elements passed and if any offers derived elements, we include
80
- # those in the return value.
81
- def _derived_elements(element_values)
82
- hsh = {}
83
- element_values.each_pair do |element, value|
84
- hsh.merge!(element.derived_values(value))
84
+ def _build_regex(format)
85
+ r = "^"
86
+ format.tokens.each do |tok|
87
+ # We only care to remember the captured LogFormatElements. No need to put
88
+ # parentheses around StringElements that aren't interpolated.
89
+ if tok.captured?
90
+ r += "(" + tok.regex + ")"
91
+ else
92
+ r += tok.regex
93
+ end
85
94
  end
95
+ r += "$"
86
96
 
87
- hsh
97
+ Regexp.compile(r)
88
98
  end
89
99
  end
90
100
  end
data/lib/format.rb CHANGED
@@ -1,77 +1,38 @@
1
+ require 'format_token'
2
+
1
3
  class ApacheCrunch
2
4
  # Represents a particular Apache log format
3
5
  class Format
4
- attr_accessor :format_def, :tokens
6
+ attr_accessor :tokens
5
7
 
6
8
  def initialize
7
9
  @tokens = []
8
- @_regex = nil
9
- end
10
-
11
- # Appends a given token (a LogFormatElement or LogFormatString) to the tokens list
12
- def append(token)
13
- @tokens << token
14
- end
15
-
16
- # Returns a compiled regex to match a log line in this format
17
- #
18
- # Each group matched will correspond to an element in the log format.
19
- def regex
20
- return @_regex unless @_regex.nil?
21
-
22
- r = "^"
23
- @tokens.each do |tok|
24
- # We only care to remember the LogFormatElements. No need to put parentheses
25
- # around LogFormatString shit.
26
- if tok.respond_to?(:name)
27
- r += "(" + tok.regex + ")"
28
- else
29
- r += tok.regex
30
- end
31
- end
32
- r += "$"
33
-
34
- @_regex = Regexp.compile(r)
35
- @_regex
36
10
  end
37
11
 
38
- # Returns the list of LogFormatElements, in order, of the interpolated things in the format.
39
- #
40
- # For example, if the log format definition were "%h %u %{Referer}i", this would return the
41
- # LogFormatElement instances for "%h", "%u", and "%{Referer}i".
42
- def elements
12
+ def captured_tokens
43
13
  @tokens.find_all do |tok|
44
- tok.respond_to?(:name)
14
+ tok.captured?
45
15
  end
46
16
  end
47
-
48
- # Returns hash mapping names of elements to the element class from which they can be derived.
49
- def derivation_map
50
- hsh = {}
51
- elements.each do |tok|
52
- tok.derived_elements.each do |derived_element|
53
- hsh[derived_element.name] = tok.class
54
- end
55
- end
56
-
57
- hsh
58
- end
59
17
  end
60
18
 
61
19
  # Parses a log format definition
62
20
  class FormatParser
63
21
  # Initializes the FormatParser
64
22
  #
65
- # Takes a FormatElementFactory instance, and you can inject a replacement for the
66
- # LogFormatString class.
67
- def initialize(format_element_factory, format_string_cls=LogFormatString)
68
- @_element_factory = format_element_factory
69
- @_format_string_cls = format_string_cls
23
+ # Takes a FormatElementFactory instance.
24
+ def initialize
25
+ @_FormatTokenFactory = FormatTokenFactory
26
+ end
27
+
28
+ # Handles dependency injection
29
+ def dep_inject!(format_token_factory_cls)
30
+ @_FormatTokenFactory = format_token_factory_cls
70
31
  end
71
32
 
72
33
  # Parses the given format_def (e.g. "%h %u %s #{Referer}i") and returns a list of tokens.
73
34
  #
74
- # These tokens are all instances of LogFormatString or LogFormatElement.
35
+ # These tokens are all instances of a LogFormatElement subclass.
75
36
  def parse_def(format_def)
76
37
  s = format_def
77
38
  tokens = []
@@ -84,27 +45,27 @@ class ApacheCrunch
84
45
  tokens
85
46
  end
86
47
 
87
- # Finds the first token (a LogFormatElement or LogFormatString) in a format definition
48
+ # Finds the first token in a format definition
88
49
  #
89
50
  # Returns a list containing the token and the new format definition (with the characters
90
51
  # that correspond to the token removed)
91
52
  def _shift_token(format_def)
92
53
  if format_def =~ /^%%(.*)/
93
54
  # Literal "%"
94
- return [@_format_string_cls.new("%%"), $1]
55
+ return [@_FormatTokenFactory.from_abbrev("%%"), $1]
95
56
  elsif format_def =~ /^(%[A-Za-z])(.*)/
96
57
  # Simple element (e.g. "%h", "%u")
97
- return [@_element_factory.from_abbrev($1), $2]
58
+ return [@_FormatTokenFactory.from_abbrev($1), $2]
98
59
  elsif format_def =~ /^%[<>]([A-Za-z])(.*)/
99
60
  # No idea how to handle mod_log_config's "which request" system yet, so we
100
61
  # ignore it.
101
- return [@_element_factory.from_abbrev("%" + $1), $2]
62
+ return [@_FormatTokenFactory.from_abbrev("%" + $1), $2]
102
63
  elsif format_def =~ /^(%\{.+?\}[Ceinor])(.*)/
103
64
  # "Contents of" element (e.g. "%{Accept}i")
104
- return [@_element_factory.from_abbrev($1), $2]
65
+ return [@_FormatTokenFactory.from_abbrev($1), $2]
105
66
  elsif format_def =~ /^(.+?)(%.*|$)/
106
67
  # Bare string up until the next %, or up until the end of the format definition
107
- return [@_format_string_cls.new($1), $2]
68
+ return [@_FormatTokenFactory.from_abbrev($1), $2]
108
69
  end
109
70
  end
110
71
  end
@@ -115,11 +76,8 @@ class ApacheCrunch
115
76
  # Constructs and returns a Format instance based on the given Apache log format string
116
77
  def self.from_format_def(format_def)
117
78
  logformat = Format.new
118
- logformat.format_def = format_def
119
-
120
- element_factory = LogFormatElementFactory.new
121
79
 
122
- format_parser = FormatParser.new(element_factory)
80
+ format_parser = FormatParser.new
123
81
  logformat.tokens = format_parser.parse_def(format_def)
124
82
 
125
83
  logformat