request-log-analyzer 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/DESIGN +14 -0
  2. data/HACKING +7 -0
  3. data/LICENSE +20 -0
  4. data/README.textile +36 -0
  5. data/Rakefile +5 -0
  6. data/bin/request-log-analyzer +123 -0
  7. data/lib/cli/bashcolorizer.rb +60 -0
  8. data/lib/cli/command_line_arguments.rb +301 -0
  9. data/lib/cli/progressbar.rb +236 -0
  10. data/lib/request_log_analyzer.rb +14 -0
  11. data/lib/request_log_analyzer/aggregator/base.rb +45 -0
  12. data/lib/request_log_analyzer/aggregator/database.rb +148 -0
  13. data/lib/request_log_analyzer/aggregator/echo.rb +25 -0
  14. data/lib/request_log_analyzer/aggregator/summarizer.rb +116 -0
  15. data/lib/request_log_analyzer/controller.rb +201 -0
  16. data/lib/request_log_analyzer/file_format.rb +81 -0
  17. data/lib/request_log_analyzer/file_format/merb.rb +33 -0
  18. data/lib/request_log_analyzer/file_format/rails.rb +90 -0
  19. data/lib/request_log_analyzer/filter/base.rb +29 -0
  20. data/lib/request_log_analyzer/filter/field.rb +36 -0
  21. data/lib/request_log_analyzer/filter/timespan.rb +32 -0
  22. data/lib/request_log_analyzer/line_definition.rb +159 -0
  23. data/lib/request_log_analyzer/log_parser.rb +173 -0
  24. data/lib/request_log_analyzer/log_processor.rb +121 -0
  25. data/lib/request_log_analyzer/request.rb +95 -0
  26. data/lib/request_log_analyzer/source/base.rb +42 -0
  27. data/lib/request_log_analyzer/source/log_file.rb +170 -0
  28. data/lib/request_log_analyzer/tracker/base.rb +54 -0
  29. data/lib/request_log_analyzer/tracker/category.rb +71 -0
  30. data/lib/request_log_analyzer/tracker/duration.rb +81 -0
  31. data/lib/request_log_analyzer/tracker/hourly_spread.rb +80 -0
  32. data/lib/request_log_analyzer/tracker/timespan.rb +54 -0
  33. data/spec/controller_spec.rb +40 -0
  34. data/spec/database_inserter_spec.rb +101 -0
  35. data/spec/file_format_spec.rb +78 -0
  36. data/spec/file_formats/spec_format.rb +26 -0
  37. data/spec/filter_spec.rb +137 -0
  38. data/spec/fixtures/merb.log +84 -0
  39. data/spec/fixtures/multiple_files_1.log +5 -0
  40. data/spec/fixtures/multiple_files_2.log +2 -0
  41. data/spec/fixtures/rails_1x.log +59 -0
  42. data/spec/fixtures/rails_22.log +12 -0
  43. data/spec/fixtures/rails_22_cached.log +10 -0
  44. data/spec/fixtures/rails_unordered.log +24 -0
  45. data/spec/fixtures/syslog_1x.log +5 -0
  46. data/spec/fixtures/test_file_format.log +13 -0
  47. data/spec/fixtures/test_language_combined.log +14 -0
  48. data/spec/fixtures/test_order.log +16 -0
  49. data/spec/line_definition_spec.rb +124 -0
  50. data/spec/log_parser_spec.rb +68 -0
  51. data/spec/log_processor_spec.rb +57 -0
  52. data/spec/merb_format_spec.rb +38 -0
  53. data/spec/rails_format_spec.rb +76 -0
  54. data/spec/request_spec.rb +72 -0
  55. data/spec/spec_helper.rb +67 -0
  56. data/spec/summarizer_spec.rb +9 -0
  57. data/tasks/github-gem.rake +177 -0
  58. data/tasks/request_log_analyzer.rake +10 -0
  59. data/tasks/rspec.rake +6 -0
  60. metadata +135 -0
@@ -0,0 +1,29 @@
1
module RequestLogAnalyzer
  module Filter
    # Common superclass for request filters.
    # All filters should inherit from this class and override #prepare and/or #filter.
    class Base

      # NOTE(review): register_file_format is presumably provided by this mixin -- confirm.
      include RequestLogAnalyzer::FileFormat::Awareness

      # log_parser is exposed as a reader, but nothing in this class assigns it.
      attr_reader :log_parser, :options

      # Creates the filter.
      # <tt>format</tt> The file format, handed to register_file_format.
      # <tt>options</tt> Hash of filter options; stored and exposed through #options.
      def initialize(format, options = {})
        @options = options
        register_file_format(format)
      end

      # Hook for subclasses to process their options. Does nothing here.
      def prepare
      end

      # Returns the request unchanged, or nil when no request (nil/false) was given.
      # <tt>request</tt> The request to filter.
      def filter(request)
        request ? request : nil
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module RequestLogAnalyzer::Filter

  # Filter that accepts or rejects requests based on the value of a specific field.
  #
  # Options
  # * <tt>:mode</tt> :accept (default) or :reject. :select is treated as a synonym of :accept.
  # * <tt>:field</tt> Name of the field to inspect (converted to a Symbol).
  # * <tt>:value</tt> Value that the field should match to be accepted or rejected. A String
  #   that starts and ends with a slash (e.g. "/foo/") is compiled into a Regexp.
  class Field < Base

    # Modes in which a request is only kept when the field matches.
    ACCEPTING_MODES = [:accept, :select]

    attr_reader :field, :value, :mode

    # Reads :mode, :field and :value from the options and prepares them for matching.
    # Raises NoMethodError when the :field option is missing (nil has no to_sym).
    def prepare
      @mode  = (@options[:mode] || :accept).to_sym
      @field = @options[:field].to_sym

      # A slash-delimited string is interpreted as a regular expression literal;
      # any other value is compared as-is using ===.
      raw_value = @options[:value]
      if raw_value.kind_of?(String) && raw_value[0, 1] == '/' && raw_value[-1, 1] == '/'
        @value = Regexp.new(raw_value[1..-2])
      else
        @value = raw_value # TODO: convert value?
      end
    end

    # Returns the request when it passes the filter, nil otherwise.
    #
    # In an accepting mode (:accept or :select) the request is kept only if at least one
    # value of the field matches; in :reject mode it is kept only if none matches. With any
    # other mode the request is always kept.
    #
    # <tt>request</tt> The request to check; it must respond to every(field_name).
    def filter(request)
      return nil unless request

      found_field = request.every(@field).any? { |value| @value === value }

      # The original only checked for :select here, so the documented (and default)
      # :accept mode never rejected anything.
      return nil if !found_field && ACCEPTING_MODES.include?(@mode)
      return nil if found_field && @mode == :reject

      request
    end
  end

end
@@ -0,0 +1,32 @@
1
module RequestLogAnalyzer::Filter

  # Reject all requests not in given timespan
  # Options
  # * <tt>:after</tt> Only keep requests at or after this DateTime.
  # * <tt>:before</tt> Only keep requests at or before this DateTime.
  class Timespan < Base

    attr_reader :before, :after

    # Converts the configured bounds to YYYYMMDDHHMMSS integers, so that they can be
    # compared numerically with request timestamps. A bound that is not configured stays nil.
    def prepare
      @after  = @options[:after].strftime('%Y%m%d%H%M%S').to_i  if @options[:after]
      @before = @options[:before].strftime('%Y%m%d%H%M%S').to_i if @options[:before]
    end

    # Returns the request if its timestamp lies within the configured bounds (inclusive),
    # nil otherwise.
    #
    # - A request without a timestamp is rejected: it cannot be placed in the timespan, and
    #   comparing nil with the bounds would raise a NoMethodError.
    # - When neither :after nor :before was configured (or #prepare was not called), every
    #   request is rejected. This matches the behavior of the original implementation.
    #
    # <tt>request</tt> The request to check; it must respond to timestamp.
    def filter(request)
      return nil unless request
      return nil unless @after || @before

      timestamp = request.timestamp
      return nil if timestamp.nil?

      return nil if @after  && timestamp < @after
      return nil if @before && timestamp > @before

      request
    end
  end

end
@@ -0,0 +1,159 @@
1
module RequestLogAnalyzer

  # Named anonymizers. A capture definition with <tt>:anonymize => :ip</tt> or
  # <tt>:anonymize => :url</tt> is dispatched to the matching anonymizer_for_* method by
  # LineDefinition#anonymize_value.
  module Anonymizers
    # Replaces any IP address with the loopback address.
    def anonymizer_for_ip(value, capture_definition)
      '127.0.0.1'
    end

    # Replaces the scheme and host of an http(s) URL with http://example.com/; the path is kept.
    # Values that do not start with an http(s) URL are returned unchanged.
    def anonymizer_for_url(value, capture_definition)
      # The host class is spelled out as A-Za-z0-9_.- because the range A-z would also
      # cover the punctuation characters between 'Z' and 'a' ([ \ ] ^ and backtick).
      value.sub(/^https?\:\/\/[A-Za-z0-9_\.-]+\//, "http://example.com/")
    end
  end

  # The line definition class is used to specify what lines should be parsed from the log file.
  # It contains functionality to match a line against the definition and parse the information
  # from this line. This is used by the LogParser class when parsing a log file.
  class LineDefinition

    include RequestLogAnalyzer::Anonymizers

    # Small DSL helper that collects line definitions by name. Every unknown method call
    # creates a definition named after that method:
    #
    #   definer.some_line(:regexp => /.../, :captures => [...])
    #   definer.other_line { |line| line.regexp = /.../ }
    class Definer

      attr_accessor :line_definitions

      def initialize
        @line_definitions = {}
      end

      # Registers a line definition under <tt>name</tt>. With a block, the new definition is
      # yielded for configuration; otherwise the first argument is used as definition hash.
      def method_missing(name, *args, &block)
        if block_given?
          @line_definitions[name] = RequestLogAnalyzer::LineDefinition.define(name, &block)
        else
          @line_definitions[name] = RequestLogAnalyzer::LineDefinition.new(name, args.first)
        end
      end
    end

    attr_reader :name
    attr_accessor :teaser, :regexp, :captures
    attr_accessor :header, :footer

    # Initializes the LineDefinition instance with a hash containing the different elements of
    # the definition. Every key is assigned through the writer with the same name, so an
    # unknown key raises a NoMethodError.
    def initialize(name, definition = {})
      @name     = name
      @captures = []
      definition.each { |key, value| self.send("#{key.to_s}=".to_sym, value) }
    end

    # Creates a definition with the given name and yields it to the block (if any) so that
    # it can be configured. Returns the definition.
    def self.define(name, &block)
      definition = self.new(name)
      yield(definition) if block_given?
      return definition
    end

    # Converts a parsed value (String) to the desired value using some heuristics.
    # Unknown types return the value unchanged.
    def convert_value(value, type)
      case type
      when :integer;   value.to_i
      when :float;     value.to_f
      when :decimal;   value.to_f
      when :symbol;    value.to_sym
      when :sec;       value.to_f
      when :msec;      value.to_f / 1000
      when :timestamp; value.gsub(/[^0-9]/,'')[0..13].to_i # Retrieve with: DateTime.parse(value, '%Y%m%d%H%M%S')
      else value
      end
    end

    # Checks whether a given line matches this definition.
    # It will return false if a line does not match. If the line matches, a hash is returned
    # with all the fields parsed from that line as content, plus :line_type and :lineno.
    # If the line definition has a teaser-check, a :teaser_check_failed warning will be emitted
    # on the parser (when given) if this teaser-check is passed, but the full regular
    # expression does not match.
    def matches(line, lineno = nil, parser = nil)
      return false unless @teaser.nil? || @teaser =~ line

      match_data = line.match(@regexp)
      if match_data.nil?
        if @teaser && parser
          parser.warn(:teaser_check_failed, "Teaser matched for #{name.inspect}, but full line did not:\n#{line.inspect}")
        end
        return false
      end

      request_info = { :line_type => name, :lineno => lineno }
      captures.each_with_index do |capture, index|
        next if capture == :ignore

        captured = match_data.captures[index]
        request_info[capture[:name]] = convert_value(captured, capture[:type]) if captured
      end
      request_info
    end

    alias :=~ :matches

    # Anonymizes a single captured value according to the :anonymize setting of its capture
    # definition:
    # - something that responds to call: called with the value and the capture definition
    # - nil or false: the value is returned untouched
    # - true: replaced by '***'
    # - :slightly: see anonymize_slightly
    # - anything else: dispatched to anonymizer_for_<setting> if this object responds to it,
    #   otherwise replaced by '***'
    def anonymize_value(value, capture_definition)
      if capture_definition[:anonymize].respond_to?(:call)
        capture_definition[:anonymize].call(value, capture_definition)
      else
        case capture_definition[:anonymize]
        when nil;   value
        when false; value
        when true;  '***'
        when :slightly; anonymize_slightly(value, capture_definition)
        else
          method_name = "anonymizer_for_#{capture_definition[:anonymize]}".to_sym
          self.respond_to?(method_name) ? self.send(method_name, value, capture_definition) : '***'
        end
      end
    end

    # Randomly perturbs a value, based on the :type of the capture definition.
    # Numeric types are scaled by a random factor between 0.8 and 1.2; timestamps are
    # parsed with DateTime.parse and shifted by a random offset of -50 to +49.
    # Types that cannot be perturbed are replaced by '***'.
    def anonymize_slightly(value, capture_definition)
      case capture_definition[:type]
      when :integer, :msec
        (value.to_i * (0.8 + rand * 0.4)).to_i
      when :float, :decimal, :double, :sec
        # :float and :decimal are the names convert_value knows; :double is kept because
        # the original implementation accepted it here.
        (value.to_f * (0.8 + rand * 0.4)).to_f
      when :timestamp
        (DateTime.parse(value) + (rand(100) - 50)).to_s
      else
        # Diagnostics go to stderr: stdout may be the stream the anonymized log is written to.
        $stderr.puts "Cannot anonymize #{capture_definition[:type].inspect} slightly, using ***"
        '***'
      end
    end

    # Anonymize a log line.
    #
    # Returns
    # - the line with every captured value replaced by its anonymized version if the line
    #   matches this definition. The replacement is done in place: the string that was
    #   passed in is modified and returned.
    # - the untouched line (or "" if options[:discard_teaser_lines] is set) if only the
    #   teaser matches.
    # - nil if the line does not match the teaser, or has no teaser and does not match the
    #   regular expression.
    def anonymize(line, options = {})
      return nil unless teaser.nil? || teaser =~ line

      match_data = regexp.match(line)
      if match_data.nil?
        return nil if teaser.nil?
        return options[:discard_teaser_lines] ? "" : line
      end

      # Replacing a capture can change the length of the line; pos_adjustment tracks how far
      # the offsets of later captures have shifted compared to the original match.
      # NOTE(review): this assumes capture groups appear left to right and are not nested.
      pos_adjustment = 0
      captures.each_with_index do |capture, index|
        # :ignore captures have no definition hash to anonymize with (see #matches).
        next if capture == :ignore

        original_value = match_data[index + 1]
        next if original_value.nil?

        anonymized_value = anonymize_value(original_value, capture).to_s
        from = match_data.begin(index + 1) + pos_adjustment
        to   = match_data.end(index + 1) + pos_adjustment
        line[from...to] = anonymized_value
        pos_adjustment += anonymized_value.length - original_value.length
      end
      line
    end
  end

end
@@ -0,0 +1,173 @@
1
module RequestLogAnalyzer

  # The LogParser class reads log data from a given source and uses a file format definition
  # to parse all relevant information about requests from the file. A FileFormat module should
  # be provided that contains the definitions of the lines that occur in the log data.
  #
  # The order in which lines occur is used to combine lines to a single request. If these lines
  # are mixed, requests cannot be combined properly. This can be the case if data is written to
  # the log file simultaneously by different mongrel processes. This problem is detected by the
  # parser, but the requests that are mixed up cannot be parsed. It will emit warnings when this
  # occurs.
  class LogParser

    # NOTE(review): register_file_format and file_format are presumably provided by this
    # mixin -- confirm.
    include RequestLogAnalyzer::FileFormat::Awareness

    # A hash of options
    attr_reader :options

    # The current Request object that is being parsed
    attr_reader :current_request

    # The total number of parsed lines
    attr_reader :parsed_lines

    # The total number of parsed requests.
    attr_reader :parsed_requests

    # The number of requests for which the request handler block returned a false value.
    attr_reader :skipped_requests

    # Initializes the parser instance and registers the given file format. The line
    # definitions of that format are used to parse any input.
    # <tt>format</tt> The file format, handed to register_file_format.
    # <tt>options</tt> A hash of options. :assume_correct_order is read while combining lines.
    def initialize(format, options = {})
      @line_definitions = {}
      @options          = options
      @parsed_lines     = 0
      @parsed_requests  = 0
      @skipped_requests = 0

      @current_io       = nil
      @current_request  = nil
      @progress_handler = nil
      @warning_handler  = nil

      # install the file format module (see RequestLogAnalyzer::FileFormat)
      # and register all the line definitions to the parser
      self.register_file_format(format)
    end

    # Parses a list of consecutive files of the same format
    def parse_files(files, options = {}, &block)
      files.each { |file| parse_file(file, options, &block) }
    end

    # Parses a file.
    # Creates an IO stream for the provided file, and sends it to parse_io for further handling.
    # The progress handler (if installed) is notified with :started and :finished.
    def parse_file(file, options = {}, &block)
      @progress_handler.call(:started, file) if @progress_handler
      File.open(file, 'r') { |f| parse_io(f, options, &block) }
      @progress_handler.call(:finished, file) if @progress_handler
    end

    # Parses an already opened stream; simply delegates to parse_io.
    def parse_stream(stream, options = {}, &block)
      parse_io(stream, options, &block)
    end

    # Reads the IO line by line, matches every line against the line definitions and combines
    # the matching lines into requests.
    # <tt>io</tt> The IO instance to read from; it must support each_line and lineno.
    # <tt>options</tt> :line_types restricts the line types to look for (defaults to all line
    # definitions of the file format). Raises a RuntimeError for unknown line types.
    # Yields every completed request to the block; a false return value of the block counts
    # the request as skipped.
    def parse_io(io, options = {}, &block)

      # parse every line type by default
      line_types = options[:line_types] || file_format.line_definitions.keys

      # check whether all provided line types are valid
      unknown = line_types.reject { |line_type| file_format.line_definitions.has_key?(line_type) }
      raise "Unknown line types: #{unknown.join(', ')}" unless unknown.empty?

      @current_io = io
      begin
        @current_io.each_line do |line|

          @progress_handler.call(:progress, @current_io.pos) if @progress_handler && @current_io.kind_of?(File)

          # The first line definition that matches wins.
          request_data = nil
          line_types.each do |line_type|
            line_type_definition = file_format.line_definitions[line_type]
            break if request_data = line_type_definition.matches(line, @current_io.lineno, self)
          end

          if request_data
            @parsed_lines += 1
            update_current_request(request_data, &block)
          end
        end

        warn(:unfinished_request_on_eof, "End of file reached, but last request was not completed!") unless @current_request.nil?
      ensure
        # Do not keep a reference to the stream around, even if parsing was aborted by an exception.
        @current_io = nil
      end

      nil
    end

    # Installs a progress handler. It is called with (:started, file), (:finished, file) and,
    # while reading File instances, (:progress, position).
    def progress=(proc)
      @progress_handler = proc
    end

    # Installs a warning handler. It is called with the warning type, the message and the
    # current line number.
    def warning=(proc)
      @warning_handler = proc
    end

    # This method is called by the parser if it encounters any problems.
    # It will call the warning handler. The default controller will pass all warnings to every
    # aggregator that is registered and running.
    # The line number is nil when the warning is emitted while no stream is being parsed.
    def warn(type, message)
      return unless @warning_handler

      lineno = @current_io ? @current_io.lineno : nil
      @warning_handler.call(type, message, lineno)
    end

    protected

    # Combines the different lines of a request into a single Request object. It will start a
    # new request when a header line is encountered and will emit the request when a footer line
    # is encountered.
    #
    # - Every line that is parsed before a header line is ignored as it cannot be included in
    #   any request. It will emit a :no_current_request warning.
    # - A header line that is parsed before a request is closed by a footer line, is a sign of
    #   an improperly ordered file. Unless the :assume_correct_order option is set, all data
    #   that is gathered for the request until then is discarded, the request started by this
    #   header line is dropped as well and an :unclosed_request warning is emitted. With
    #   :assume_correct_order, the open request is handled as if it was complete and a new
    #   request is started.
    def update_current_request(request_data, &block)
      if header_line?(request_data)
        if @current_request.nil?
          @current_request = RequestLogAnalyzer::Request.create(@file_format, request_data)
        elsif options[:assume_correct_order]
          handle_request(@current_request, &block)
          @current_request = RequestLogAnalyzer::Request.create(@file_format, request_data)
        else
          warn(:unclosed_request, "Encountered header line, but previous request was not closed!")
          @current_request = nil # remove all data that was parsed, skip next request as well.
        end
      elsif @current_request.nil?
        warn(:no_current_request, "Parsebale line found outside of a request!")
      else
        @current_request << request_data
        if footer_line?(request_data)
          handle_request(@current_request, &block)
          @current_request = nil
        end
      end
    end

    # Handles the parsed request by yielding it to the request handler block.
    # The default controller will send the request to every running aggregator.
    # Without a block every request is accepted; a false return value of the block counts
    # the request as skipped.
    def handle_request(request, &block)
      @parsed_requests += 1
      accepted = block_given? ? yield(request) : true
      @skipped_requests += 1 if !accepted
    end

    # Checks whether a given line hash is a header line.
    def header_line?(hash)
      file_format.line_definitions[hash[:line_type]].header
    end

    # Checks whether a given line hash is a footer line.
    def footer_line?(hash)
      file_format.line_definitions[hash[:line_type]].footer
    end
  end
end
@@ -0,0 +1,121 @@
1
module RequestLogAnalyzer

  # The LogProcessor class is used to perform simple processing actions over log files.
  # It will go over the log file/stream line by line, pass the line to a processor and
  # write the result back to the output file or stream. The processor can alter the
  # contents of the line, leave it intact or remove it altogether, based on the current
  # file format.
  #
  # Currently, two processors are supported, :strip and :anonymize.
  # * :strip will remove all irrelevant lines (according to the file format) from the
  #   sources. A compact, information packed log will remain.
  # * :anonymize will anonymize sensitive information from the lines according to the
  #   anonymization rules in the file format. The result can be passed to third parties
  #   without privacy concerns.
  #
  class LogProcessor

    # NOTE(review): register_file_format and file_format are presumably provided by this
    # mixin -- confirm.
    include RequestLogAnalyzer::FileFormat::Awareness

    attr_reader :mode, :options, :sources
    attr_accessor :output_file

    # Builds a log processor instance from the arguments given on the command line
    # <tt>command</tt> The command that was used to start the log processor. This can either be
    # :strip or :anonymize. This will set the processing mode.
    # <tt>arguments</tt> The parsed command line arguments (a CommandLine::Arguments instance)
    def self.build(command, arguments)

      options = {
        :discard_teaser_lines => arguments[:discard_teaser_lines],
        :keep_junk_lines      => arguments[:keep_junk_lines],
      }

      log_processor = RequestLogAnalyzer::LogProcessor.new(arguments[:format].to_sym, command, options)
      log_processor.output_file = arguments[:output] if arguments[:output]

      arguments.parameters.each do |input|
        log_processor.sources << input
      end

      return log_processor
    end

    # Initializes a new LogProcessor instance.
    # <tt>format</tt> The file format to use (e.g. :rails).
    # <tt>mode</tt> The processing mode (:anonymize or :strip)
    # <tt>options</tt> A hash with options to take into account
    def initialize(format, mode, options = {})
      @options     = options
      @mode        = mode
      @sources     = []
      # The original assigned the global $output_file here instead of the instance variable
      # that the output_file accessor and #run! use.
      @output_file = nil
      self.register_file_format(format)
    end

    # Processes an input file by opening it and sending the file stream to <code>process_io</code>,
    # in which the actual processing is performed.
    # <tt>file</tt> The file to process
    def process_file(file)
      File.open(file, 'r') { |io| process_io(io) }
    end

    # Processes an input stream by iterating over each line and processing it according to
    # the current operation mode (:strip, :anonymize). The result of every line is appended
    # to the output stream that is set up by #run!. Any other mode processes nothing.
    # <tt>io</tt> The IO instance to process.
    def process_io(io)
      case mode
      when :strip;     io.each_line { |line| @output << strip_line(line) }
      when :anonymize; io.each_line { |line| @output << anonymize_line(line) }
      end
    end

    # Returns the line itself if the string matches any of the line definitions. If no match is
    # found, an empty string is returned, which will strip the line from the output.
    # <tt>line</tt> The line to strip
    def strip_line(line)
      file_format.line_definitions.any? { |name, definition| definition =~ line } ? line : ""
    end

    # Returns an anonymized version of the provided line. This can be a copy of the line itself,
    # an empty string or a string in which some substrings are substituted for anonymized values.
    # The result of the first line definition that returns a non-nil value is used. If no
    # definition applies, the line is kept as-is when the :keep_junk_lines option is set and
    # dropped (empty string) otherwise.
    # <tt>line</tt> The line to anonymize
    def anonymize_line(line)
      anonymized_line = nil
      file_format.line_definitions.detect { |name, definition| anonymized_line = definition.anonymize(line, options) }

      if anonymized_line
        return anonymized_line
      elsif options[:keep_junk_lines]
        return line
      else
        return ""
      end
    end

    # Runs the log processing by setting up the output stream and iterating over all the
    # input sources. Input sources can either be filenames (String instances) or IO streams
    # (IO instances). The strings "-" and "STDIN" will be substituted for the $stdin variable.
    # Sources that are none of these (e.g. names of files that do not exist) are skipped.
    #
    # Output is written to $stdout, unless output_file is set; in that case the results are
    # appended to that file, which is closed afterwards.
    def run!
      if @output_file.nil?
        @output = $stdout
      else
        @output = File.new(@output_file, 'a')
      end

      @sources.each do |source|
        if source.kind_of?(String) && File.exist?(source)
          process_file(source)
        elsif source.kind_of?(IO)
          process_io(source)
        elsif ['-', 'STDIN'].include?(source)
          process_io($stdin)
        end
      end

    ensure
      # Only close the stream if we opened it ourselves; $stdout is left open.
      @output.close if @output.kind_of?(File)
    end
  end

end