apachecrunch 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
data/bin/apachecrunch CHANGED
@@ -66,5 +66,5 @@ log_parser = ApacheCrunch::LogParserFactory.log_parser(
66
66
  format_def,
67
67
  options[:logfile],
68
68
  progress_meter)
69
- proc_env = ProcedureEnvironment.new(log_parser)
69
+ proc_env = ApacheCrunch::ProcedureEnvironment.new(log_parser)
70
70
  proc_env.eval_procedure(open(options[:procedure]).read())
data/lib/apachecrunch.rb CHANGED
@@ -2,21 +2,11 @@ require "date"
2
2
  require "tempfile"
3
3
 
4
4
  require 'config'
5
+ require 'progress'
5
6
  require 'entry'
6
7
  require 'format'
8
+ require 'derivation'
9
+ require 'format_token'
7
10
  require 'log_parser'
8
- require 'log_element'
9
-
10
- class ApacheCrunch
11
- # A bare string in a log format
12
- #
13
- # Exposes 'regex' for consistency with LogFormatElement, but there shouldn't be anything other
14
- # than one-to-one character matching in there.
15
- class LogFormatString
16
- attr_accessor :regex
17
-
18
- def initialize(regex)
19
- @regex = regex
20
- end
21
- end
22
- end
11
+ require 'element'
12
+ require 'element_value_fetcher'
data/lib/cast.rb ADDED
@@ -0,0 +1,21 @@
class ApacheCrunch
  # Converts a string to an integer
  class IntegerCast
    # Returns string_value converted with to_i.
    def cast(string_value)
      string_value.to_i
    end
  end

  # Converts a CLF-formatted string to an integer
  #
  # "CLF-formatted" means that if the value is 0, the string will be a single hyphen instead of
  # a number. Like %b, for instance.
  class CLFIntegerCast
    # Returns 0 when string_value is exactly "-", otherwise string_value converted with to_i.
    def cast(string_value)
      return 0 if string_value == "-"

      string_value.to_i
    end
  end
end
data/lib/derivation.rb ADDED
@@ -0,0 +1,113 @@
class ApacheCrunch
  # Abstract for classes that define how to obtain a given element from the value of another.
  class DerivationRule
    # Returns the name of the element from which this rule derives values
    def source_name
      raise NotImplementedError
    end

    # Returns the list of element names this rule is able to derive.
    #
    # DerivationRuleFinder calls this on every rule, so it is part of the abstract interface.
    def target_names
      raise NotImplementedError
    end

    # Derives the given derivable element from the given element value
    def derive(name, source_value)
      raise NotImplementedError
    end
  end


  # Dummy rule that doesn't derive anything
  class NullDerivationRule < DerivationRule
    def source_name; nil; end
    def target_names; []; end
    def derive(name, source_value); nil; end
  end


  # Derivation rule for elements derived from TimeToken
  class TimeDerivationRule < DerivationRule
    def initialize
      @_derivation_regex = nil
      @_month_map = {"Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4,
                     "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8,
                     "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12}
    end

    # Values are derived from the :time element.
    def source_name
      :time
    end

    def target_names
      [:year, :month, :day, :hour, :minute, :second]
    end

    # Returns the integer value of the requested time component (one of target_names) parsed
    # out of source_value, e.g. "[10/Oct/2000:13:55:36 -0700]".
    #
    # Returns nil if source_value does not match the expected layout or name is not a
    # component this rule knows about.
    def derive(name, source_value)
      # Compiled lazily and reused for the lifetime of the rule instance.
      @_derivation_regex ||= Regexp.compile(%q!^\[(\d\d)/([A-Za-z]{3})/(\d\d\d\d):(\d\d):(\d\d):(\d\d)!)
      return nil unless source_value =~ @_derivation_regex

      m = Regexp.last_match
      derived = {
        :year => m[3].to_i,
        # The regex accepts any letter case, so normalize before the lookup.
        :month => @_month_map[m[2].capitalize],
        :day => m[1].to_i,
        :hour => m[4].to_i,
        :minute => m[5].to_i,
        :second => m[6].to_i
      }

      derived[name]
    end
  end

  # Derivation rule for elements derived from the first line of the request (:req_firstline)
  class ReqFirstlineDerivationRule < DerivationRule
    def initialize
      @_derivation_regex = nil
    end

    def source_name
      :req_firstline
    end

    def target_names
      [ReqMethodTokenDefinition.name, UrlPathTokenDefinition.name,
       QueryStringTokenDefinition.name, ProtocolTokenDefinition.name]
    end

    # Returns the requested piece (one of target_names) of the request line in source_value.
    #
    # Returns nil if source_value does not match or name is not one of target_names.
    def derive(name, source_value)
      # "\\s" is required here: inside a double-quoted string a bare "\s" is the escape for a
      # literal space, which would make the regex reject any other whitespace separator.
      @_derivation_regex ||= Regexp.compile(
        "^(#{ReqMethodTokenDefinition.regex})\\s+" \
        "(#{UrlPathTokenDefinition.regex})(#{QueryStringTokenDefinition.regex})\\s+" \
        "(#{ProtocolTokenDefinition.regex})$"
      )
      return nil unless source_value =~ @_derivation_regex

      m = Regexp.last_match
      derived = {
        ReqMethodTokenDefinition.name => m[1],
        UrlPathTokenDefinition.name => m[2],
        QueryStringTokenDefinition.name => m[3],
        ProtocolTokenDefinition.name => m[4]
      }

      derived[name]
    end
  end

  # Looks up the derivation rule able to produce a given element
  class DerivationRuleFinder
    @_rule_map = nil
    @_rules = [NullDerivationRule, TimeDerivationRule, ReqFirstlineDerivationRule]

    # Returns a derivation rule that derives element with the given name
    #
    # Returns nil if no registered rule lists element_name among its target_names.
    def self.find(element_name)
      @_rule_map ||= _build_rule_map
      @_rule_map[element_name]
    end

    # Builds a hash mapping each derivable element name to an instance of the rule that
    # derives it. One instance is created per rule class and shared by all of its targets.
    def self._build_rule_map
      rule_map = {}
      @_rules.each do |rule_cls|
        rule = rule_cls.new
        rule.target_names.each { |target_element| rule_map[target_element] = rule }
      end

      rule_map
    end
  end
end
data/lib/element.rb ADDED
@@ -0,0 +1,16 @@
class ApacheCrunch
  # Pairs a format token with the value held for it.
  class Element
    attr_accessor :token, :value

    # Sets both the token and the value of this element in one call.
    def populate!(token, value)
      @token = token
      @value = value
    end

    # Returns the name of this element's token.
    def name
      @token.name
    end

    # Returns the derivation rule of this element's token.
    def derivation_rule
      @token.derivation_rule
    end
  end
end
@@ -0,0 +1,72 @@
class ApacheCrunch
  # Finds a value from an Entry. Value may be directly from log or derived.
  class ElementValueFetcher
    def initialize
      @_RawValueFetcher = RawValueFetcher
      @_DerivedValueFetcher = DerivedValueFetcher
    end

    # Handles dependency injection
    def dep_inject!(raw_value_fetcher_cls, derived_value_fetcher_cls)
      @_RawValueFetcher = raw_value_fetcher_cls
      @_DerivedValueFetcher = derived_value_fetcher_cls
    end

    # Returns the value of the element with the given name from the Entry instance.
    #
    # So element_name might be :minute or :reqheader_firstline for instance.
    #
    # The raw (captured) value wins when it is non-nil; otherwise the derived value is
    # returned, which is itself nil when nothing can be derived.
    def fetch(entry, element_name)
      raw_value = @_RawValueFetcher.new.fetch(entry, element_name)
      return raw_value unless raw_value.nil?

      @_DerivedValueFetcher.new.fetch(entry, element_name)
    end
  end


  # Returns the value of an element that was captured straight from the log.
  class RawValueFetcher
    # Returns the value of the Element with the given name in the given Entry.
    #
    # Only works for elements based on tokens that we parsed directly into the Entry. If no
    # matching element is found, we return nil.
    def fetch(entry, element_name)
      element = entry.captured_elements[element_name]
      return nil if element.nil?

      element.value
    end
  end


  # Returns the value of an element derived from one captured directly from the log.
  class DerivedValueFetcher
    def initialize
      @_DerivationRuleFinder = DerivationRuleFinder
    end

    # Handles dependency injection
    def dep_inject!(derivation_rule_finder_cls)
      @_DerivationRuleFinder = derivation_rule_finder_cls
    end

    # Returns the value for the given name by deriving from an Element in the Entry.
    #
    # Returns nil if no such value can be derived: either no rule is found for element_name,
    # or the element the rule derives from is not among the entry's captured elements.
    def fetch(entry, element_name)
      # Find the derivation rule that will get us the element we want
      rule = @_DerivationRuleFinder.find(element_name)
      return nil if rule.nil?

      # Get the element from which we're deriving
      source_element = entry.captured_elements[rule.source_name]
      return nil if source_element.nil?

      # Do the derivation
      rule.derive(element_name, source_element.value)
    end
  end
end
data/lib/entry.rb CHANGED
@@ -1,90 +1,100 @@
1
+ require 'element'
2
+ require 'progress'
3
+
class ApacheCrunch
  # A parsed entry from the log.
  #
  # Holds the elements captured from one log line in captured_elements, keyed by element
  # name. Values (e.g. :url_path, :remote_host) are looked up with entry.fetch(name).
  class Entry
    attr_accessor :captured_elements

    def initialize
      @captured_elements = {}
      @_value_fetcher = nil

      @_ElementValueFetcher = ElementValueFetcher
    end

    # Handles dependency injection
    def dep_inject!(element_value_fetcher_cls)
      @_ElementValueFetcher = element_value_fetcher_cls
      # Drop any fetcher built from the previously configured class so the injected one is used.
      @_value_fetcher = nil
    end

    # Returns the value of the element with the given name.
    #
    # The lookup is delegated to an instance of the configured value-fetcher class, which is
    # created on first use and then reused.
    def fetch(name)
      @_value_fetcher = @_ElementValueFetcher.new if @_value_fetcher.nil?
      @_value_fetcher.fetch(self, name)
    end
  end


  # Makes Entry instances based on log file text
  class EntryParser
    # Initializes the instance with the default Entry and Element classes and a
    # NullProgressMeter; use add_progress_meter! to attach a real meter.
    def initialize
      @_Entry = Entry
      @_Element = Element

      @_progress_meter = NullProgressMeter.new
      @_regex = nil
    end

    # Handles dependency injection
    def dep_inject!(entry_cls, element_cls)
      @_Entry = entry_cls
      @_Element = element_cls
    end

    # Applies the given ProgressMeter to the parser so that it will output progress.
    #
    # The meter's output_progress method will get called every time we finish parsing
    # a log entry.
    def add_progress_meter!(meter)
      @_progress_meter = meter
    end

    # Returns an Entry instance built from a line of text, or nil if the line was malformatted
    #
    # The regex is built from the format passed on the first call and reused on every later
    # call, whatever format is passed then.
    def parse(format, log_text)
      @_regex = _build_regex(format) if @_regex.nil?

      if (log_text =~ @_regex).nil?
        warn "Log line did not match expected format: #{log_text.rstrip}"
        return nil
      end

      # One string per parenthesized group, i.e. without the whole-match first value
      match_groups = Regexp.last_match.captures

      entry = @_Entry.new
      format.captured_tokens.each_with_index do |tok, i|
        # Build elements from the injectable class so dep_inject! is honored
        element = @_Element.new
        element.populate!(tok, match_groups[i])
        entry.captured_elements[tok.name] = element
      end

      # Add the full text of the log entry to the Entry instance as well.
      text_element = @_Element.new
      text_element.populate!(StringToken.new, log_text)
      entry.captured_elements[:text] = text_element

      @_progress_meter.output_progress(entry)
      entry
    end

    # Returns a compiled regex matching a whole log line in the given format.
    #
    # Only captured tokens get parentheses, so each match group lines up with one entry of
    # format.captured_tokens.
    def _build_regex(format)
      pattern = "^"
      format.tokens.each do |tok|
        pattern += tok.captured? ? "(" + tok.regex + ")" : tok.regex
      end
      pattern += "$"

      Regexp.compile(pattern)
    end
  end
end
data/lib/format.rb CHANGED
@@ -1,77 +1,38 @@
1
+ require 'format_token'
2
+
1
3
  class ApacheCrunch
2
4
  # Represents a particular Apache log format
3
5
  class Format
4
- attr_accessor :format_def, :tokens
6
+ attr_accessor :tokens
5
7
 
6
8
  def initialize
7
9
  @tokens = []
8
- @_regex = nil
9
- end
10
-
11
- # Appends a given token (a LogFormatElement or LogFormatString) to the tokens list
12
- def append(token)
13
- @tokens << token
14
- end
15
-
16
- # Returns a compiled regex to match a log line in this format
17
- #
18
- # Each group matched will correspond to an element in the log format.
19
- def regex
20
- return @_regex unless @_regex.nil?
21
-
22
- r = "^"
23
- @tokens.each do |tok|
24
- # We only care to remember the LogFormatElements. No need to put parentheses
25
- # around LogFormatString shit.
26
- if tok.respond_to?(:name)
27
- r += "(" + tok.regex + ")"
28
- else
29
- r += tok.regex
30
- end
31
- end
32
- r += "$"
33
-
34
- @_regex = Regexp.compile(r)
35
- @_regex
36
10
  end
37
11
 
38
- # Returns the list of LogFormatElements, in order, of the interpolated things in the format.
39
- #
40
- # For example, if the log format definition were "%h %u %{Referer}i", this would return the
41
- # LogFormatElement instances for "%h", "%u", and "%{Referer}i".
42
- def elements
12
+ def captured_tokens
43
13
  @tokens.find_all do |tok|
44
- tok.respond_to?(:name)
14
+ tok.captured?
45
15
  end
46
16
  end
47
-
48
- # Returns hash mapping names of elements to the element class from which they can be derived.
49
- def derivation_map
50
- hsh = {}
51
- elements.each do |tok|
52
- tok.derived_elements.each do |derived_element|
53
- hsh[derived_element.name] = tok.class
54
- end
55
- end
56
-
57
- hsh
58
- end
59
17
  end
60
18
 
61
19
  # Parses a log format definition
62
20
  class FormatParser
63
21
  # Initializes the FormatParser
64
22
  #
65
- # Takes a FormatElementFactory instance, and you can inject a replacement for the
66
- # LogFormatString class.
67
- def initialize(format_element_factory, format_string_cls=LogFormatString)
68
- @_element_factory = format_element_factory
69
- @_format_string_cls = format_string_cls
23
+ # Takes no arguments; uses FormatTokenFactory unless dep_inject! supplies a replacement.
24
+ def initialize
25
+ @_FormatTokenFactory = FormatTokenFactory
26
+ end
27
+
28
+ # Handles dependency injection
29
+ def dep_inject!(format_token_factory_cls)
30
+ @_FormatTokenFactory = format_token_factory_cls
70
31
  end
71
32
 
72
33
  # Parses the given format_def (e.g. "%h %u %s %{Referer}i") and returns a list of tokens.
73
34
  #
74
- # These tokens are all instances of LogFormatString or LogFormatElement.
35
+ # These tokens are all instances of a LogFormatElement subclass.
75
36
  def parse_def(format_def)
76
37
  s = format_def
77
38
  tokens = []
@@ -84,27 +45,27 @@ class ApacheCrunch
84
45
  tokens
85
46
  end
86
47
 
87
- # Finds the first token (a LogFormatElement or LogFormatString) in a format definition
48
+ # Finds the first token in a format definition
88
49
  #
89
50
  # Returns a list containing the token and the new format definition (with the characters
90
51
  # that correspond to the token removed)
91
52
  def _shift_token(format_def)
92
53
  if format_def =~ /^%%(.*)/
93
54
  # Literal "%"
94
- return [@_format_string_cls.new("%%"), $1]
55
+ return [@_FormatTokenFactory.from_abbrev("%%"), $1]
95
56
  elsif format_def =~ /^(%[A-Za-z])(.*)/
96
57
  # Simple element (e.g. "%h", "%u")
97
- return [@_element_factory.from_abbrev($1), $2]
58
+ return [@_FormatTokenFactory.from_abbrev($1), $2]
98
59
  elsif format_def =~ /^%[<>]([A-Za-z])(.*)/
99
60
  # No idea how to handle mod_log_config's "which request" system yet, so we
100
61
  # ignore it.
101
- return [@_element_factory.from_abbrev("%" + $1), $2]
62
+ return [@_FormatTokenFactory.from_abbrev("%" + $1), $2]
102
63
  elsif format_def =~ /^(%\{.+?\}[Ceinor])(.*)/
103
64
  # "Contents of" element (e.g. "%{Accept}i")
104
- return [@_element_factory.from_abbrev($1), $2]
65
+ return [@_FormatTokenFactory.from_abbrev($1), $2]
105
66
  elsif format_def =~ /^(.+?)(%.*|$)/
106
67
  # Bare string up until the next %, or up until the end of the format definition
107
- return [@_format_string_cls.new($1), $2]
68
+ return [@_FormatTokenFactory.from_abbrev($1), $2]
108
69
  end
109
70
  end
110
71
  end
@@ -115,11 +76,8 @@ class ApacheCrunch
115
76
  # Constructs and returns a Format instance based on the given Apache log format string
116
77
  def self.from_format_def(format_def)
117
78
  logformat = Format.new
118
- logformat.format_def = format_def
119
-
120
- element_factory = LogFormatElementFactory.new
121
79
 
122
- format_parser = FormatParser.new(element_factory)
80
+ format_parser = FormatParser.new
123
81
  logformat.tokens = format_parser.parse_def(format_def)
124
82
 
125
83
  logformat