RubyGems - apachecrunch - Versions diffs - 0.1 - Mend

apachecrunch 0.1

Files changed (7) hide show

data/LICENSE ADDED Viewed

	@@ -0,0 +1 @@
1	+ http://creativecommons.org/licenses/by-sa/3.0/

data/bin/apachecrunch ADDED Viewed

@@ -0,0 +1,70 @@
+#!/usr/bin/ruby
+# For development while inside the apachecrunch dir:
+$: << ".."
+$: << "./lib"
+require "rubygems"
+require "apachecrunch"
+require "progress"
+require "procedure_dsl"
+# Prints the usage message and exits with the given exit code
+def barf_usage(exit_code)
+    puts %q!USAGE:
+    apachecrunch.rb <PROCEDURE> <LOG>
+                    [--format=<FORMAT NAME>] [--progress <METER TYPE>]
+    --progress: Gives you a progress meter as the log file is parsed.  METER TYPE can be "entry",
+                which prints out how many entries have been parsed so far, or "time", which prints
+                out the time of the last entry parsed.!
+    exit exit_code
+end
+# Parses arguments
+#
+# Returns a hash with these keys (as symbols):
+#   procedure: The path to the procedure DSL file
+#   logfile: The path to the log file
+#   format: The name of the log format specified ("ncsa" by default)
+def parse_args
+    args = ARGV.clone
+    options = {}
+    # Defaults
+    options[:format] = "ncsa"
+    options[:progress] = nil
+    while a = args.shift
+        if a == "--format"
+            options[:format] = args.shift
+        elsif a == "--help"
+            barf_usage(0)
+        elsif a == "--progress"
+            options[:progress] = args.shift
+        elsif options.key?(:procedure)
+            options[:logfile] = a
+        else
+            options[:procedure] = a
+        end
+    end
+    unless options.key?(:procedure) and options.key?(:logfile)
+        barf_usage(1)
+    end
+    return options
+end
+# Main: build a log parser from the command-line options, then run the procedure against it
+options = parse_args
+format_string = FormatStringFinder.new.find(options[:format])
+progress_meter = ProgressMeterFactory.from_options(options)
+# log_parser takes plain positional arguments: format string, log path, progress meter
+log_parser = LogParserFactory.log_parser(format_string, options[:logfile], progress_meter)
+proc_env = ProcedureEnvironment.new(log_parser)
+# File.read closes the procedure file once it has been read
+proc_env.eval_procedure(File.read(options[:procedure]))

data/lib/apachecrunch.rb ADDED Viewed

@@ -0,0 +1,316 @@
+require "date"
+require "tempfile"
+require 'log_element'
+# A parsed entry from the log.
+#
+# Acts like a hash, in that you get at the log elements (e.g. "url_path", "remote_host") by
+# as entry[name].
+class LogEntry
+    def initialize(derivation_map)
+        @_derivation_map = derivation_map
+        @_attributes = {}
+    end
+    def []=(name, value)
+        @_attributes[name] = value
+    end
+    def [](name)
+        return @_attributes[name] if @_attributes.key?(name)
+        derived_from_cls = @_derivation_map[name]
+        return nil if derived_from_cls.nil?
+        derived_from_cls.derive(name, @_attributes[derived_from_cls.name])
+    end
+    def merge!(hsh)
+        @_attributes.merge!(hsh)
+    end
+end
+# A bare string in a log format
+#
+# Exposes 'regex' for consistency with LogFormatElement, but there shouldn't be anything other
+# than one-to-one character matching in there.
+class LogFormatString
+    attr_accessor :regex
+    def initialize(regex)
+        @regex = regex
+    end
+end
+# Represents a particular Apache log format
+class LogFormat
+    attr_accessor :format_string, :tokens
+    def initialize
+        @tokens = []
+        @_regex = nil
+    end
+    # Appends a given token (a LogFormatElement or LogFormatString) to the tokens list
+    def append(token)
+        @tokens << token
+    end
+    # Returns a compiled regex to match a log line in this format
+    def regex
+        return @_regex unless @_regex.nil?
+        r = "^"
+        @tokens.each do |tok|
+            # We only care to remember the LogFormatElements.  No need to put parentheses
+            # around LogFormatString shit.
+            if tok.respond_to?(:name)
+                r += "(" + tok.regex + ")"
+            else
+                r += tok.regex
+            end
+        end
+        r += "$"
+        @_regex = Regexp.compile(r)
+        @_regex
+    end
+    # Returns the list of LogFormatElements, in order, of the interpolated things in the format.
+    #
+    # For example, if the log format string were "%h %u %{Referer}i", this would return the
+    # LogFormatElement instances for "%h", "%u", and "%{Referer}i".
+    def elements
+        @tokens.find_all do |tok|
+            tok.respond_to?(:name)
+        end
+    end
+    # Returns hash mapping names of elements to the element class from which they can be derived.
+    def derivation_map
+        hsh = {}
+        elements.each do |tok|
+            tok.derived_elements.each do |derived_element|
+                hsh[derived_element.name] = tok.class
+            end
+        end
+        hsh
+    end
+end
+# Turns a string specifying an Apache log format into a LogFormat instance
+class LogFormatFactory
+    def initialize
+        @element_factory = LogFormatElementFactory.new
+    end
+    # Constructs and returns a LogFormat instance based on the given Apache log format string
+    def from_format_string(f_string)
+        logformat = LogFormat.new
+        logformat.format_string = f_string
+        until f_string.empty?
+            token, f_string = _shift_token(f_string)
+            logformat.append(token)
+        end
+        logformat
+    end
+    # Finds the first token (a LogFormatElement or LogFormatString) in a format string
+    #
+    # Returns a list containing the token and the new format string (with the characters that
+    # correspond to the token removed)
+    def _shift_token(f_string)
+        if f_string =~ /^%%(.*)/
+            # Literal "%"
+            return [LogFormatString.new("%%"), $1]
+        elsif f_string =~ /^(%[A-Za-z])(.*)/
+            # Simple element (e.g. "%h", "%u")
+            return [@element_factory.from_abbrev($1), $2]
+        elsif f_string =~ /^(%\{.+?\}[Ceinor])(.*)/
+            # "Contents of" element (e.g. "%{Accept}i")
+            return [@element_factory.from_abbrev($1), $2]
+        elsif f_string =~ /^(.+?)(%.*|$)/
+            # Bare string up until the next %, or up until the end of the format string
+            return [LogFormatString.new($1), $2]
+        end
+    end
+end
+# Makes log line hashes based on log file text
+class LogLineParser
+    # Initializes the instance given a LogFormat instance
+    def initialize(log_format, progress_meter)
+        @log_format = log_format
+        @progress_meter = progress_meter
+        @_elements = log_format.elements
+        @_derivation_map = log_format.derivation_map
+    end
+    # Returns a log line hash built from a line of text, or nil if the line was malformatted
+    #
+    # The keys of the hash are names of LogFormatElements (e.g. "remote_host", "reqheader_referer")
+    def from_text(log_text)
+        match = (log_text =~ @log_format.regex)
+        if match.nil?
+            warn "Log line did not match expected format: #{log_text}"
+            return nil
+        end
+        # Make a hash mapping all parsed elements to their values in the entry
+        match_groups = Regexp.last_match.to_a
+        match_groups.shift # First value is the whole matched string, which we do not want
+        element_values = Hash[*@_elements.zip(match_groups).flatten]
+        # Start building the return value
+        entry = LogEntry.new(@_derivation_map)
+        entry[:text] = log_text
+        # Insert all the elements specified in the LogFormat
+        entry.merge!(_elements_to_hash(element_values))
+        @progress_meter.output_progress(entry)
+        entry
+    end
+    # Returns a hash of "element name" => value pairs based on a hash of element => value pairs.
+    def _elements_to_hash(element_values)
+        hsh = {}
+        element_values.each_pair do |element, value|
+            hsh[element.name] = value
+        end
+        hsh
+    end
+    # Returns hash of derived "element name" => value pairs from a hash of element => value pairs.
+    #
+    # That is, we go through the elements passed and if any offers derived elements, we include
+    # those in the return value.
+    def _derived_elements(element_values)
+        hsh = {}
+        element_values.each_pair do |element, value|
+            hsh.merge!(element.derived_values(value))
+        end
+        hsh
+    end
+end
+# Parses a log file given a path and a LogFormat instance
+class LogParser
+    # Initializes the parser with the path to a log file and a LogLineParser.
+    def initialize(path, ll_parser)
+        @path = path
+        @ll_parser = ll_parser
+        @_file = nil
+    end
+    # Returns the next entry in the log file as a hash, or nil if we've reached EOF.
+    #
+    # The keys of the hash are names of LogFormatElements (e.g. "remote_host", "reqheader_referer")
+    def next_entry
+        @_file = open(@path) if @_file.nil?
+        while line_text = @_file.gets
+            return nil if line_text.nil?
+            logline = @ll_parser.from_text(line_text)
+            # The LogLineFactory returns nil and writes a warning if the line text doesn't
+            # match our expected format.
+            next if logline.nil?
+            return logline
+        end
+    end
+    # Resets the LogParser's filehandle so we can start over.
+    def reset
+        @_file = nil
+    end
+    # Makes the LogParser close its current log file and start parsing a new one instead
+    #
+    # `new_target` is a writable file object that the parser should start parsing, and if
+    # in_place is true, we actually replace the contents of the current target with those
+    # of the new target.
+    def replace_target(new_target, in_place)
+        new_target.close
+        if in_place
+            old_path = @_file.path
+            File.rename(new_target.path, old_path)
+        else
+            @path = new_target.path
+        end
+        @_file = nil
+    end
+end
+# Makes a LogParser given the parameters we want to work with.
+#
+# This is the class that most external code should instatiate to begin using this library.
+class LogParserFactory
+    # Returns a new LogParser instance for the given log file, which should have the given Apache
+    # log format.
+    def self.log_parser(format_string, path, progress_meter)
+        # First we generate a LogFormat instance based on the format string we were given
+        format_factory = LogFormatFactory.new
+        log_format = format_factory.from_format_string(format_string)
+        # Now we generate a line parser
+        log_line_parser = LogLineParser.new(log_format, progress_meter)
+        # And now we can instantiate and return a LogParser
+        return LogParser.new(path, log_line_parser)
+    end
+end
+# Finds a named log format string in the configuration file(s)
+class FormatStringFinder
+    @@FILE_NAME = "log_formats.rb"
+    @@DEFAULT_FORMATS = {
+        :ncsa => %q!%h %l %u %t \"%r\" %s %b \"%{Referer}i\" \"%{User-agent}i\"!,
+        :ubuntu => %q!%h %l %u %t \"%r\" %s %O \"%{Referer}i\" \"%{User-Agent}i\"!
+    }
+    # Finds the given format string in the configuration file(s)
+    #
+    # If none exists, returns nil.
+    def find(format_name)
+        name_as_symbol = format_name.to_sym
+        formats = @@DEFAULT_FORMATS.clone
+        _search_path.each do |dir|
+            config_path = File.join(dir, @@FILE_NAME)
+            if File.readable?(config_path)
+                config_file = open(File.join(dir, @@FILE_NAME))
+                eval config_file.read
+            end
+            if formats.key?(format_name.to_sym)
+                return formats[format_name.to_sym].gsub(/\\"/, '"')
+            end
+        end
+        raise "Failed to find the format '#{format_name}' in the search path: #{_search_path.inspect}"
+    end
+    def _search_path
+        [".", "./etc",
+         File.join(ENV["HOME"], ".apachecrunch"),
+         "/etc/apachecrunch"]
+    end
+end

data/lib/log_element.rb ADDED Viewed

@@ -0,0 +1,297 @@
+# Converts a string to an integer
+class IntegerCast
+    def self.cast(string_value)
+        string_value.to_i
+    end
+end
+# Converts a CLF-formatted string to an integer
+#
+# "CLF-formatted" means that if the value is 0, the string will be a single hyphen instead of
+# a number.  Like %b, for instance.
+class CLFIntegerCast
+    def self.cast(string_value)
+        if string_value == "-"
+            return 0
+        end
+        string_value.to_i
+    end
+end
+# An element in a log format.  Abstract from which all elements inherit.
+#
+# Exposes:
+#    abbrev: The Apache abbreviation for the element (such as "%h" or "%u" or "%{Referer}i")
+#    name: A short name for the element (such as "remote_host", "remote_user", or "reqhead_referer")
+#    regex: A regex that should match such an element ("[A-Za-z0-9.-]+", "[^:]+", ".+")
+#
+# If '_caster' is not nil, it should be a class with a method called "cast" that
+# transforms a string to the appropriate data type or format for consumption.
+# For example, the IntegerCast class transforms "562" to 562.  The correct cast
+# of a string can then be performed by passing that string to this LogFormaElement
+# instance's "cast" method.
+#
+# 'derive_elements' manages elements that can be derived from the instance's value.  See
+# ReqFirstlineElement for an example.
+class LogFormatElement
+    @_caster = nil
+    attr_accessor :abbrev, :name, :regex
+    # Class variables that determine the _default_ for abbrev, name, and regex in an instance.
+    # That is, an instance will initialize with these values for the instance variables @abbrev,
+    # @name, and @regex.
+    class << self; attr_accessor :abbrev, :name, :regex end
+    # Additionally we need to access this from within the instance:
+    class << self; attr_accessor :_caster end
+    def initialize
+        @abbrev = self.class.abbrev
+        @name = self.class.name
+        @regex = self.class.regex
+    end
+    # Casts a string found in the log to the correct type, using the class's @@_caster attribute.
+    def cast(string_value)
+        if self.class._caster.nil?
+            return string_value
+        else
+            return self.class._caster.cast(string_value)
+        end
+    end
+    # Derives the named element (e.g. "url_path") from a given value for this one.
+    #
+    # See ReqFirstlineElement for an example.
+    def self.derive(name, our_own_value)
+        raise NotImplementedError
+    end
+    # Returns a list of the element classes that can be derived from this one.
+    #
+    # See ReqFirstlineElement for an example.
+    def derived_elements
+        []
+    end
+end
+class RemoteHostElement < LogFormatElement
+    @abbrev = "%h"
+    @name = :remote_host
+    @regex = %q![A-Za-z0-9.-]+!
+end
+class LogNameElement < LogFormatElement
+    @abbrev = "%l"
+    @name = :log_name
+    @regex = %q!\S+!
+end
+class RemoteUserElement < LogFormatElement
+    @abbrev = "%u"
+    @name = :remote_user
+    @regex = %q![^:]+!
+end
+class TimeElement < LogFormatElement
+    @abbrev = "%t"
+    @name = :time
+    @regex = %q!\[\d\d/[A-Za-z]{3}/\d\d\d\d:\d\d:\d\d:\d\d [-+]\d\d\d\d\]!
+end
+class ReqFirstlineElement < LogFormatElement
+    @abbrev = "%r"
+    @name = :req_firstline
+    @regex = %q![^"]+!
+    @_derivation_regex = nil
+    def self.derive(name, our_own_value)
+        if @_derivation_regex.nil?
+            @_derivation_regex = Regexp.compile("^(#{ReqMethodElement.regex})\s+(#{UrlPathElement.regex})(#{QueryStringElement.regex})\s+(#{ProtocolElement.regex})$")
+        end
+        hsh = {}
+        if our_own_value =~ @_derivation_regex
+            hsh[ReqMethodElement.name] = $1
+            hsh[UrlPathElement.name] = $2
+            hsh[QueryStringElement.name] = $3
+            hsh[ProtocolElement.name] = $4
+        end
+        hsh[name]
+    end
+    def derived_elements
+        return [ReqMethodElement, UrlPathElement, QueryStringElement, ProtocolElement]
+    end
+end
+class StatusElement < LogFormatElement
+    @abbrev = "%s"
+    @name = :status
+    @regex = %q!\d+|-!
+end
+class BytesSentElement < LogFormatElement
+    @abbrev = "%b"
+    @name = :bytes_sent
+    @regex = %q!\d+!
+    @@_caster = IntegerCast
+end
+class BytesSentElement < LogFormatElement
+    @abbrev = "%b"
+    @name = :bytes_sent
+    @regex = %q![\d-]+!
+    @@_caster = CLFIntegerCast
+end
+class BytesSentWithHeadersElement < LogFormatElement
+    @abbrev = "%O"
+    @name = :bytes_sent_with_headers
+    @regex = %q!\d+!
+    @@_caster = IntegerCast
+end
+class ServeTimeMicroElement < LogFormatElement
+    @abbrev = "%D"
+    @name = :serve_time_micro
+    @regex = %q!\d+!
+    @@_caster = IntegerCast
+end
+class UrlPathElement < LogFormatElement
+    @abbrev = "%U"
+    @name = :url_path
+    @regex = %q!/[^?]*!
+end
+class QueryStringElement < LogFormatElement
+    @abbrev = "%q"
+    @name = :query_string
+    @regex = %q!\??\S*!
+end
+class ReqMethodElement < LogFormatElement
+    @abbrev = "%m"
+    @name = :req_method
+    @regex = %q![A-Z]+!
+end
+class ProtocolElement < LogFormatElement
+    @abbrev = "%H"
+    @name = :protocol
+    @regex = %q!\S+!
+end
+# Element for a request-header token such as "%{Referer}i".  It declares no class-level
+# defaults; LogFormatElementFactory sets abbrev, name and regex on each instance.
+class ReqheaderElement < LogFormatElement
+end
+# Element for an arbitrary-regex token of the form "%{name:regex}r".  It declares no
+# class-level defaults; LogFormatElementFactory sets abbrev, name and regex on each instance.
+class RegexElement < LogFormatElement
+end
+# Finds log format elements given information about them.
+class ElementDictionary
+    @@_ELEMENTS = [
+                    RemoteHostElement,
+                    LogNameElement,
+                    RemoteUserElement,
+                    TimeElement,
+                    ReqFirstlineElement,
+                    StatusElement,
+                    BytesSentElement,
+                    BytesSentElement,
+                    BytesSentWithHeadersElement,
+                    ServeTimeMicroElement,
+                    UrlPathElement,
+                    QueryStringElement,
+                    ReqMethodElement,
+                    ProtocolElement
+    ]
+    # Returns the LogFormatElement subclass with the given format-string abbreviation.
+    #
+    # If none exists, returns nil.
+    def self.find_by_abbrev(abbrev)
+        @@_ELEMENTS.each do |element|
+            if element.abbrev == abbrev
+                return element
+            end
+        end
+        nil
+    end
+end
+# Generates LogFormatElement instances.
+#
+# This class does the work of figuring out which LogFormatElement subclass to make and makes it.
+class LogFormatElementFactory
+    # Takes an Apache log format abbreviation and returns a corresponding LogFormatElement
+    def from_abbrev(abbrev)
+        element_cls = ElementDictionary.find_by_abbrev(abbrev)
+        if element_cls
+            # We found it in the dictionary, so just return an instance
+            return element_cls.new
+        elsif abbrev =~ /^%\{([A-Za-z0-9-]+)\}i/
+            # HTTP request header
+            return _reqheader_element(abbrev, $1)
+        elsif abbrev =~ /^%\{(.*?):([^}]+)\}r/
+            # Arbitrary regex
+            return _regex_element(abbrev, $1, $2)
+        end
+        raise "Unknown element format '#{abbrev}'"
+    end
+    # Returns a format element based on an HTTP header
+    def _reqheader_element(abbrev, header_name)
+        element = ReqheaderElement.new
+        element.abbrev = abbrev
+        element.regex = %q![^"]*!
+        element.name = _header_name_to_element_name(header_name)
+        element
+    end
+    # Returns a format element based on an arbitrary regex
+    def _regex_element(abbrev, regex_name, regex)
+        element = RegexElement.new
+        element.abbrev = abbrev
+        element.regex = regex
+        element.name = "regex_#{regex_name}".to_sym
+        element
+    end
+    # Lowercases header name and turns hyphens into underscores
+    def _header_name_to_element_name(header_name)
+        ("reqheader_" + header_name.downcase().gsub("-", "_")).to_sym
+    end
+end

data/lib/procedure_dsl.rb ADDED Viewed

@@ -0,0 +1,308 @@
+# Abstract for a procedure routine.
+class ProcedureRoutine
+    def initialize(log_parser)
+        @_log_parser = log_parser
+        @_current_entry = nil
+    end
+    # Allows blocks passed to a DSL routine to access parameters from the current log entry
+    def method_missing(sym, *args)
+        @_current_entry[sym]
+    end
+    # Executes the DSL routine using the given block
+    #
+    # Abstract method
+    def execute(&blk)
+        raise "Not implemented"
+    end
+    # Anything that needs to happen after the routine completes but before it returns its
+    # result can go in here.
+    def finish
+        @_log_parser.reset
+    end
+end
+# DSL routine that returns the number of log entries where the block evaluates to true
+class CountWhere < ProcedureRoutine
+    def execute(&blk)
+        count = 0
+        while @_current_entry = @_log_parser.next_entry
+            if instance_eval(&blk)
+                count += 1
+            end
+        end
+        count
+    end
+end
+# DSL routine that executes the block for every log entry
+class Each < ProcedureRoutine
+    def execute(&blk)
+        while @_current_entry = @_log_parser.next_entry
+            instance_eval(&blk)
+        end
+    end
+end
+# DSL routine(s) that filter(s) for entries for which the given block evaluates to true
+#
+# This can be called as 'filter()', which means the filtering happens in a temporary file, or
+# as 'filter(path)', which means the filtering happens in the given file.  It can also be called
+# as 'filter!()', which means the filtering happens in place, clobbering what's in apachecrunch's
+# target file.
+class Filter < ProcedureRoutine
+    def execute(path=nil, in_place=false, &blk)
+        @_in_place = in_place
+        @_results_file = _make_results_file(path, in_place)
+        while @_current_entry = @_log_parser.next_entry
+            if instance_eval(&blk)
+                @_results_file.write(@_current_entry[:text])
+            end
+        end
+    end
+    def finish
+        @_log_parser.replace_target(@_results_file, @_in_place)
+    end
+    # Returns a writable file object to which the results of the filter should be written.
+    def _make_results_file(path, in_place)
+        if path.nil?
+            # If no path passed (this includes the case where the filter is being performed
+            # in place), we want a temp file.
+            return Tempfile.new("apachecrunch")
+        else
+            return open(path, "w")
+        end
+    end
+end
+# DSL routine that returns the count of entries with each found value of the given block
+#
+# You might for instance run this with the block { status }, and you'd get back something like
+# {"200" => 941, "301" => 41, "404" => 2, "500" => 0}
+class CountBy < ProcedureRoutine
+    def execute(&blk)
+        counts = {}
+        while @_current_entry = @_log_parser.next_entry
+            val = instance_eval(&blk)
+            if counts.key?(val)
+                counts[val] += 1
+            else
+                counts[val] = 1
+            end
+        end
+        return counts
+    end
+end
+# DSL routine that finds the distribution of (numeric) values to which the given block evaluates
+#
+# For example,
+#
+#     distribution 100 do
+#         bytes_sent
+#     end
+#
+# would return a hash with keys from 0 up by multiples of 100, the value of each being the number
+# of entries for which bytes_sent is between that key and the next key.
+class Distribution < ProcedureRoutine
+    def execute(bucket_width, &blk)
+        dist = {}
+        while @_current_entry = @_log_parser.next_entry
+            val = instance_eval(&blk)
+            k = _key_for(val, bucket_width)
+            if dist.key?(k)
+                dist[k] += 1
+            else
+                dist[k] = 1
+            end
+        end
+        # Backfill keys for which we didn't find a value
+        0.step(dist.keys.max, bucket_width).each do |k|
+            dist[k] = 0 unless dist.key?(k)
+        end
+        dist
+    end
+    # Determines the key for the distribution hash given the value and step
+    def _key_for(val, bucket_width)
+        (val.to_i / bucket_width) * bucket_width
+    end
+end
+# Same as Distribution, but the buckets get expenentially wider
+class LogDistribution < ProcedureRoutine
+    def execute(width_base, &blk)
+        dist = {}
+        while @_current_entry = @_log_parser.next_entry
+            val = instance_eval(&blk)
+            k = _key_for(val, width_base)
+            if dist.key?(k)
+                dist[k] += 1
+            else
+                dist[k] = 1
+            end
+        end
+        # Backfill keys for which we didn't find a value
+        k = dist.keys.min
+        max_key = dist.keys.max
+        while k *= width_base and k < max_key
+            dist[k] = 0 unless dist.key?(k)
+        end
+        dist
+    end
+    # Determines the key for the distribution hash given the value and logarithmic base for
+    # the bucket width
+    def _key_for(val, width_base)
+        exp = (Math.log(val) / Math.log(width_base)).to_i
+        width_base ** exp
+    end
+end
+# DSL routine that determines a confidence interval for the values to which the block evaluates
+#
+# For example,
+#
+#     confidence_interval 95 do
+#         time_to_serve
+#     end
+#
+# would return two numbers, the lower and upper bound of a 95% confidence interval for the values
+# of time_to_serve.
+class ConfidenceInterval < ProcedureRoutine
+    def execute(confidence, &blk)
+        # Build a list of all the values found
+        values = []
+        while @_current_entry = @_log_parser.next_entry
+            values << instance_eval(&blk)
+        end
+        values.sort!
+        # Determine how many values are outside the bounds of the CI
+        count_outside = (values.length * (1.0 - confidence/100.0)).to_i
+        # Find the bounds of the confidence interval
+        return values[count_outside / 2], values[-count_outside / 2]
+    end
+end
+# DSL routine that finds the most common n values for the given block.
+#
+# Returns a list of lists, each of which is [value, count].  This list is sorted by count.
+class MostCommon < ProcedureRoutine
+    def execute(n, &blk)
+        counts = CountBy.new(@_log_parser).execute(&blk)
+        # Sort the block values descending
+        sorted_vals = counts.keys.sort do |val_a,val_b|
+            - (counts[val_a] <=> counts[val_b])
+        end
+        sorted_vals[0..n].map do |val|
+            [val, counts[val]]
+        end
+    end
+end
+# The environment in which a procedure file is evaluated.
+#
+# A procedure file is some ruby code that uses our DSL.
+class ProcedureEnvironment
+    def initialize(log_parser)
+        @_log_parser = log_parser
+    end
+    # Evaluates the given string as a procedure in our DSL
+    def eval_procedure(proc_string)
+        eval proc_string
+    end
+    # DSL routine 'count_where'
+    def count_where(&blk)
+        routine = CountWhere.new(@_log_parser)
+        rv = routine.execute(&blk)
+        routine.finish
+        rv
+    end
+    # DSL routine 'filter!'
+    def filter!(&blk)
+        routine = Filter.new(@_log_parser)
+        routine.execute(nil, true, &blk)
+        routine.finish
+        nil
+    end
+    # DSL routine 'filter'
+    def filter(target_path=nil, &blk)
+        routine = Filter.new(@_log_parser)
+        routine.execute(target_path, &blk)
+        routine.finish
+        nil
+    end
+    # DSL routine 'each'
+    def each(&blk)
+        routine = Each.new(@_log_parser)
+        routine.execute(&blk)
+        routine.finish
+        nil
+    end
+    # DSL routine 'count_by'
+    def count_by(&blk)
+        routine = CountBy.new(@_log_parser)
+        rv = routine.execute(&blk)
+        routine.finish
+        rv
+    end
+    # DSL routine 'distribution'
+    def distribution(bucket_width, &blk)
+        routine = Distribution.new(@_log_parser)
+        rv = routine.execute(bucket_width, &blk)
+        routine.finish
+        rv
+    end
+    # DSL routine 'log_distribution'
+    def log_distribution(width_base, &blk)
+        routine = LogDistribution.new(@_log_parser)
+        rv = routine.execute(width_base, &blk)
+        routine.finish
+        rv
+    end
+    # DSL routine 'confidence_interval'
+    def confidence_interval(confidence, &blk)
+        routine = ConfidenceInterval.new(@_log_parser)
+        rv = routine.execute(confidence, &blk)
+        routine.finish
+        rv
+    end
+    # DSL routine 'most_common'
+    def most_common(n, &blk)
+        routine = MostCommon.new(@_log_parser)
+        rv = routine.execute(n, &blk)
+        routine.finish
+        rv
+    end
+end

data/lib/progress.rb ADDED Viewed

@@ -0,0 +1,65 @@
+# Base class for the progress meters; starts the count of parsed entries at zero.
+class ProgressMeter
+    def initialize
+        # Number of entries seen so far; the counting subclasses increment it
+        @_entry_count = 0
+    end
+end
+# Progress meter that prints the number of entries parsed every (n) lines.
+class EntryCountProgressMeter < ProgressMeter
+    def initialize
+        # 'period' is how many entries we wait between printing output.  So if 'period' is 10 000,
+        # we'll print output every 10 000 lines.
+        @_period = 10000
+        super
+    end
+    # Outputs the number of entries that have been parsed so far (every once in a while).
+    #
+    # 'entry' should be the latest log entry to be parsed, in hash form.
+    def output_progress(entry)
+        @_entry_count += 1
+        if @_entry_count % @_period == 0
+            puts "Processed %d entries" % [@_entry_count]
+        end
+    end
+end
+class TimeProgressMeter < ProgressMeter
+    def initialize
+        # 'period' is how many entries we wait between printing output.  So if 'period' is 10 000,
+        # we'll print output every 10 000 lines.
+        @_period = 10000
+        super
+    end
+    # Outputs the number of entries that have been parsed so far (every once in a while).
+    #
+    # 'entry' should be the latest log entry to be parsed, in hash form.
+    def output_progress(entry)
+        @_entry_count += 1
+        if @_entry_count % @_period == 0
+            puts "Processed through %s" % [entry["time"]]
+        end
+    end
+end
+class NullProgressMeter < ProgressMeter
+    def output_progress(entry)
+    end
+end
+# Constructs progress meters that output progress info to the user.
+class ProgressMeterFactory
+    # Constructs a progress meter from a hash containing the options passed on the command line.
+    def self.from_options(options)
+        pm_class = {
+            "entry" => EntryCountProgressMeter,
+            "time" => TimeProgressMeter
+        }
+        pm_class.default = NullProgressMeter
+        pm_class[options[:progress]].new
+    end
+end

metadata ADDED Viewed

@@ -0,0 +1,74 @@
+--- !ruby/object:Gem::Specification
+name: apachecrunch
+version: !ruby/object:Gem::Version
+  hash: 9
+  prerelease:
+  segments:
+  - 0
+  - 1
+  version: "0.1"
+platform: ruby
+authors:
+- Dan Slimmon
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-07-09 00:00:00 -04:00
+default_executable:
+dependencies: []
+description: |-
+  Apache Crunch is an analysis tool for Apache logs.  You write little scripts
+  to do the analysis, using our DSL to make the procedure as simple and readable
+  as possible.  See our homepage for more details.
+email: dan@danslimmon.com
+executables:
+- apachecrunch
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/apachecrunch.rb
+- lib/log_element.rb
+- lib/procedure_dsl.rb
+- lib/progress.rb
+- bin/apachecrunch
+- LICENSE
+has_rdoc: true
+homepage: https://github.com/danslimmon/apachecrunch/
+licenses:
+- Creative Commons Share-Alike
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.6.2
+signing_key:
+specification_version: 3
+summary: Apache log analysis tool designed for ease of use
+test_files: []