RubyGems - apachecrunch - Versions diffs - 0.1 - Mend

apachecrunch 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/LICENSE ADDED Viewed

	@@ -0,0 +1 @@
1	+ http://creativecommons.org/licenses/by-sa/3.0/

data/bin/apachecrunch ADDED Viewed

@@ -0,0 +1,70 @@
+#!/usr/bin/ruby
+# For development while inside the apachecrunch dir:
+$: << ".."
+$: << "./lib"
+require "rubygems"
+require "apachecrunch"
+require "progress"
+require "procedure_dsl"
+# Prints the usage message and exits with the given exit code.
+#
+# The message goes to standard output when exit_code is 0 (the user asked for help) and to
+# standard error otherwise, so that a failed invocation does not pollute piped output.
+def barf_usage(exit_code)
+    stream = exit_code == 0 ? $stdout : $stderr
+    stream.puts %q!USAGE:
+    apachecrunch <PROCEDURE> <LOG>
+                 [--format <FORMAT NAME>] [--progress <METER TYPE>]
+    --format:   The name of the log format to expect ("ncsa" if not given).
+    --progress: Gives you a progress meter as the log file is parsed.  METER TYPE can be "entry",
+                which prints out how many entries have been parsed so far, or "time", which prints
+                out the time of the last entry parsed.!
+    exit exit_code
+end
+# Parses the command-line arguments found in ARGV (ARGV itself is left untouched).
+#
+# Options may be written either as "--format NAME" or as "--format=NAME" (likewise for
+# --progress).  The first bare argument is taken as the procedure file and any later bare
+# argument as the log file (if several are given, the last one wins).
+#
+# Returns a hash with these keys (as symbols):
+#   procedure: The path to the procedure DSL file
+#   logfile: The path to the log file
+#   format: The name of the log format specified ("ncsa" by default)
+#   progress: The progress meter type requested (nil by default)
+#
+# Prints the usage message and exits (through barf_usage) on --help, when an option is missing
+# its value, or when the procedure or the log file was not given.
+def parse_args
+    args = ARGV.clone
+    options = {:format => "ncsa", :progress => nil}
+    while a = args.shift
+        # "--name=value" splits in two; for a plain "--name" inline_value stays nil
+        flag, inline_value = a.split("=", 2)
+        if a == "--help"
+            barf_usage(0)
+        elsif flag == "--format" || flag == "--progress"
+            value = inline_value || args.shift
+            # An option without a value would otherwise surface later as an obscure nil error
+            barf_usage(1) if value.nil? || value.empty?
+            options[flag.sub("--", "").to_sym] = value
+        elsif options.key?(:procedure)
+            options[:logfile] = a
+        else
+            options[:procedure] = a
+        end
+    end
+    unless options.key?(:procedure) && options.key?(:logfile)
+        barf_usage(1)
+    end
+    options
+end
+# Main script: parse the command line, build a parser for the requested log file and format,
+# then evaluate the procedure file against it.
+options = parse_args
+format_string = FormatStringFinder.new.find(options[:format])
+progress_meter = ProgressMeterFactory.from_options(options)
+# log_parser takes plain positional arguments: format string, log path, progress meter
+log_parser = LogParserFactory.log_parser(format_string, options[:logfile], progress_meter)
+proc_env = ProcedureEnvironment.new(log_parser)
+# File.read closes the procedure file as soon as its contents have been read
+proc_env.eval_procedure(File.read(options[:procedure]))

data/lib/apachecrunch.rb ADDED Viewed

@@ -0,0 +1,316 @@
+require "date"
+require "fileutils"
+require "tempfile"
+require 'log_element'
+# One parsed line of the log.
+#
+# Behaves like a hash keyed by element name (e.g. :url_path, :remote_host): values are read
+# with entry[name].  A name that was not stored directly is looked up in the derivation map
+# and, if found there, computed on demand from the element it derives from.
+class LogEntry
+    # derivation_map: hash of derived element name => element class able to derive it
+    def initialize(derivation_map)
+        @_attributes = {}
+        @_derivation_map = derivation_map
+    end
+    # Stores 'value' under 'name'
+    def []=(name, value)
+        @_attributes[name] = value
+    end
+    # Returns the stored value for 'name', else the value derived for it, else nil
+    def [](name)
+        if @_attributes.key?(name)
+            @_attributes[name]
+        else
+            source_cls = @_derivation_map[name]
+            # The deriving class works from the stored value of its own element
+            source_cls.derive(name, @_attributes[source_cls.name]) unless source_cls.nil?
+        end
+    end
+    # Copies every pair of 'hsh' into the entry, overwriting names already stored
+    def merge!(hsh)
+        @_attributes.merge!(hsh)
+    end
+end
+# A literal (non-interpolated) piece of a log format.
+#
+# It exposes 'regex' so that it can be handled like a LogFormatElement, but the pattern is only
+# expected to match its own characters one-to-one.
+class LogFormatString
+    attr_reader :regex
+    attr_writer :regex
+    def initialize(regex)
+        self.regex = regex
+    end
+end
+# A particular Apache log format, held as an ordered list of tokens
+class LogFormat
+    attr_accessor :format_string, :tokens
+    def initialize
+        @_regex = nil
+        @tokens = []
+    end
+    # Adds a token (a LogFormatElement or LogFormatString) to the end of the tokens list
+    def append(token)
+        @tokens.push(token)
+    end
+    # Returns a compiled regex to match a log line in this format.
+    #
+    # The regex is built on first use and cached, so tokens appended afterwards are not
+    # reflected in it.
+    def regex
+        @_regex ||= begin
+            # Only the LogFormatElements (the tokens that have a name) get a capture group;
+            # bare strings are matched but not remembered.
+            pieces = @tokens.map do |tok|
+                tok.respond_to?(:name) ? "(" + tok.regex + ")" : tok.regex
+            end
+            Regexp.compile(pieces.inject("^") { |pattern, piece| pattern + piece } + "$")
+        end
+    end
+    # Returns the LogFormatElements of the format, in order, leaving out the bare strings.
+    #
+    # For example, if the log format string were "%h %u %{Referer}i", this would return the
+    # LogFormatElement instances for "%h", "%u", and "%{Referer}i".
+    def elements
+        @tokens.select { |tok| tok.respond_to?(:name) }
+    end
+    # Returns a hash mapping the name of each derivable element to the class of the element
+    # in this format that it can be derived from.
+    def derivation_map
+        elements.each_with_object({}) do |tok, map|
+            tok.derived_elements.each { |derived| map[derived.name] = tok.class }
+        end
+    end
+end
+# Turns a string specifying an Apache log format into a LogFormat instance
+class LogFormatFactory
+    def initialize
+        @element_factory = LogFormatElementFactory.new
+    end
+    # Constructs and returns a LogFormat instance based on the given Apache log format string.
+    #
+    # The format string is consumed one token at a time; the caller's string is not modified.
+    def from_format_string(f_string)
+        logformat = LogFormat.new
+        logformat.format_string = f_string
+        remaining = f_string
+        until remaining.empty?
+            token, remaining = _shift_token(remaining)
+            logformat.append(token)
+        end
+        logformat
+    end
+    # Finds the first token (a LogFormatElement or LogFormatString) in a format string
+    #
+    # Returns a list containing the token and the new format string (with the characters that
+    # correspond to the token removed).  Raises a RuntimeError if no token can be recognized.
+    def _shift_token(f_string)
+        if f_string =~ /^%%(.*)/
+            # "%%" in the format stands for a single literal "%" in the log line
+            return [LogFormatString.new("%"), $1]
+        elsif f_string =~ /^(%[A-Za-z])(.*)/
+            # Simple element (e.g. "%h", "%u")
+            return [@element_factory.from_abbrev($1), $2]
+        elsif f_string =~ /^(%\{.+?\}[Ceinor])(.*)/
+            # "Contents of" element (e.g. "%{Accept}i")
+            return [@element_factory.from_abbrev($1), $2]
+        elsif f_string =~ /^(.+?)(%.*|$)/
+            # Bare string up until the next %, or up until the end of the format string.  It has
+            # to match character for character, so regex metacharacters in it are escaped.
+            return [LogFormatString.new(Regexp.escape($1)), $2]
+        end
+        raise "Could not find a token at the start of '#{f_string}'"
+    end
+end
+# Builds LogEntry objects out of lines of log text
+class LogLineParser
+    # Sets the parser up from a LogFormat instance and a progress meter (an object answering
+    # to output_progress(entry))
+    def initialize(log_format, progress_meter)
+        @progress_meter = progress_meter
+        @log_format = log_format
+        @_derivation_map = log_format.derivation_map
+        @_elements = log_format.elements
+    end
+    # Returns a LogEntry built from a line of text, or nil (after printing a warning) if the
+    # line does not match the log format.
+    #
+    # The entry holds the whole line under :text plus one value per element of the format,
+    # keyed by element name.  The progress meter is notified of every entry built.
+    def from_text(log_text)
+        matched = @log_format.regex.match(log_text)
+        if matched.nil?
+            warn "Log line did not match expected format: #{log_text}"
+            return nil
+        end
+        # Pair each element with the text captured for it, in format order
+        element_values = Hash[@_elements.zip(matched.captures)]
+        entry = LogEntry.new(@_derivation_map)
+        entry[:text] = log_text
+        entry.merge!(_elements_to_hash(element_values))
+        @progress_meter.output_progress(entry)
+        entry
+    end
+    # Turns a hash of element => value pairs into a hash of "element name" => value pairs
+    def _elements_to_hash(element_values)
+        element_values.each_with_object({}) do |(element, value), by_name|
+            by_name[element.name] = value
+        end
+    end
+    # Merges, for every element => value pair given, whatever element.derived_values(value)
+    # returns into a single hash, and returns that hash.
+    #
+    # NOTE(review): nothing visible here calls this method, and the element classes shown define
+    # 'derived_elements' rather than 'derived_values' -- confirm before relying on it.
+    def _derived_elements(element_values)
+        element_values.inject({}) do |derived, (element, value)|
+            derived.merge!(element.derived_values(value))
+        end
+    end
+end
+# Parses a log file given a path and a LogLineParser instance
+class LogParser
+    # Initializes the parser with the path to a log file and a LogLineParser.
+    def initialize(path, ll_parser)
+        @path = path
+        @ll_parser = ll_parser
+        @_file = nil
+        @_target = nil
+    end
+    # Returns the next entry in the log file, or nil if we've reached EOF.
+    #
+    # The file is opened on the first call.  Lines for which the line parser returns nil (it
+    # does so, after writing a warning, when a line doesn't match the format) are skipped.
+    def next_entry
+        @_file = File.open(@path) if @_file.nil?
+        while line_text = @_file.gets
+            logline = @ll_parser.from_text(line_text)
+            return logline unless logline.nil?
+        end
+        nil
+    end
+    # Closes the current filehandle (if any) so that the next read starts over from the top.
+    def reset
+        _close_file
+    end
+    # Makes the LogParser close its current log file and start parsing a new one instead
+    #
+    # `new_target` is a writable file object that the parser should start parsing, and if
+    # in_place is true, we actually replace the contents of the current target with those
+    # of the new target.
+    def replace_target(new_target, in_place)
+        new_target.close
+        _close_file
+        if in_place
+            # Work from @path rather than from the open handle, which may not exist yet.
+            # FileUtils.mv also copes with the new target living on another filesystem.
+            FileUtils.mv(new_target.path, @path)
+        else
+            @path = new_target.path
+            # Keep the object alive: a Tempfile's file is deleted once the Tempfile object is
+            # garbage-collected, which would pull the file out from under us.
+            @_target = new_target
+        end
+    end
+    # Closes the filehandle if one is open and forgets it
+    def _close_file
+        @_file.close unless @_file.nil? || @_file.closed?
+        @_file = nil
+    end
+end
+# Builds a ready-to-use LogParser out of the parameters we want to work with.
+#
+# This is the class that most external code should use to begin working with this library.
+class LogParserFactory
+    # Returns a new LogParser for the log file at 'path', whose lines are expected to follow
+    # the Apache log format 'format_string'.  'progress_meter' is handed to the line parser.
+    def self.log_parser(format_string, path, progress_meter)
+        log_format = LogFormatFactory.new.from_format_string(format_string)
+        LogParser.new(path, LogLineParser.new(log_format, progress_meter))
+    end
+end
+# Finds a named log format string among the built-in formats and the configuration file(s)
+class FormatStringFinder
+    @@FILE_NAME = "log_formats.rb"
+    @@DEFAULT_FORMATS = {
+        :ncsa => %q!%h %l %u %t \"%r\" %s %b \"%{Referer}i\" \"%{User-agent}i\"!,
+        :ubuntu => %q!%h %l %u %t \"%r\" %s %O \"%{Referer}i\" \"%{User-Agent}i\"!
+    }
+    # Finds the format string registered under the given name.
+    #
+    # We start from a copy of the built-in formats.  Each directory of the search path is then
+    # visited in order: if it holds a readable "log_formats.rb", that file is evaluated as Ruby
+    # code with the local variable 'formats' (a hash of symbol => format string) in scope, so
+    # that it can add or replace entries.  As soon as the name is known after a directory has
+    # been visited, its format string is returned with every \" turned into a plain ".
+    #
+    # Raises a RuntimeError if the name is still unknown once the whole search path was visited.
+    def find(format_name)
+        name_as_symbol = format_name.to_sym
+        formats = @@DEFAULT_FORMATS.clone
+        _search_path.each do |dir|
+            config_path = File.join(dir, @@FILE_NAME)
+            if File.readable?(config_path)
+                # NOTE(security): the configuration file is executed as arbitrary Ruby code, so
+                # only trusted files should be reachable through the search path.
+                # File.read closes the file once its contents have been read.
+                eval File.read(config_path)
+            end
+            if formats.key?(name_as_symbol)
+                return formats[name_as_symbol].gsub(/\\"/, '"')
+            end
+        end
+        raise "Failed to find the format '#{format_name}' in the search path: #{_search_path.inspect}"
+    end
+    # Returns the directories in which configuration files are looked for, in search order
+    def _search_path
+        dirs = [".", "./etc"]
+        # HOME is not always set; File.join would raise on a nil component
+        dirs << File.join(ENV["HOME"], ".apachecrunch") if ENV["HOME"]
+        dirs << "/etc/apachecrunch"
+        dirs
+    end
+end

data/lib/log_element.rb ADDED Viewed

@@ -0,0 +1,297 @@
+# Cast that turns the text of a number into an integer
+class IntegerCast
+    class << self
+        # Returns string_value converted with to_i
+        def cast(string_value)
+            string_value.to_i
+        end
+    end
+end
+# Cast that turns a CLF-formatted string into an integer
+#
+# "CLF-formatted" means that a value of 0 is written as a single hyphen instead of a number,
+# as is the case for %b.
+class CLFIntegerCast
+    # Returns 0 for "-", and string_value converted with to_i otherwise
+    def self.cast(string_value)
+        string_value == "-" ? 0 : string_value.to_i
+    end
+end
+# An element in a log format.  Abstract class from which all elements inherit.
+#
+# Exposes:
+#    abbrev: The Apache abbreviation for the element (such as "%h" or "%u" or "%{Referer}i")
+#    name: A short name for the element (such as :remote_host or :remote_user)
+#    regex: A regex source string that should match such an element (such as "[^:]+")
+#
+# A subclass declares its defaults by setting the class-level instance variables @abbrev,
+# @name and @regex; every new instance starts out with a copy of those values.  Note that the
+# class-level 'name' accessor takes the place of the usual Class#name for these classes.
+#
+# If the class-level '_caster' is not nil, it should be a class with a method called "cast"
+# that transforms a string to the appropriate data type or format for consumption.  For
+# example, the IntegerCast class transforms "562" to 562.  The cast is performed by passing
+# the string to this LogFormatElement instance's "cast" method.
+#
+# 'derived_elements' and 'derive' manage elements that can be derived from the instance's
+# value.  See ReqFirstlineElement for an example.
+class LogFormatElement
+    @_caster = nil
+    attr_accessor :abbrev, :name, :regex
+    class << self
+        # Class-level defaults for the instance attributes of the same names
+        attr_accessor :abbrev, :name, :regex
+        # The caster also has to be reachable from within instances
+        attr_accessor :_caster
+    end
+    def initialize
+        defaults = self.class
+        @abbrev, @name, @regex = defaults.abbrev, defaults.name, defaults.regex
+    end
+    # Casts a string found in the log to the correct type using the class's '_caster', or
+    # returns the string untouched when the class has none.
+    def cast(string_value)
+        caster = self.class._caster
+        caster.nil? ? string_value : caster.cast(string_value)
+    end
+    # Derives the named element (e.g. :url_path) from a given value for this one.
+    #
+    # Abstract: raises NotImplementedError here.  See ReqFirstlineElement for an example.
+    def self.derive(name, our_own_value)
+        raise NotImplementedError
+    end
+    # Returns a list of the element classes that can be derived from this one (none here).
+    #
+    # See ReqFirstlineElement for an example.
+    def derived_elements
+        []
+    end
+end
+# Element for "%h", exposed as :remote_host
+class RemoteHostElement < LogFormatElement
+    @abbrev = "%h"
+    @name = :remote_host
+    # Host names and IPv4 addresses, plus ":" for IPv6 addresses (e.g. "::1") and "_", which
+    # shows up in some host names
+    @regex = %q![A-Za-z0-9.:_-]+!
+end
+# Element for "%l", exposed as :log_name; matches a run of non-whitespace characters
+class LogNameElement < LogFormatElement
+    self.abbrev = "%l"
+    self.name = :log_name
+    self.regex = '\S+'
+end
+# Element for "%u", exposed as :remote_user; matches a run of characters other than ":"
+class RemoteUserElement < LogFormatElement
+    self.abbrev = "%u"
+    self.name = :remote_user
+    self.regex = '[^:]+'
+end
+# Element for "%t", exposed as :time; matches a bracketed timestamp such as
+# "[09/Jul/2011:12:34:56 -0400]"
+class TimeElement < LogFormatElement
+    self.abbrev = "%t"
+    self.name = :time
+    self.regex = '\[\d\d/[A-Za-z]{3}/\d\d\d\d:\d\d:\d\d:\d\d [-+]\d\d\d\d\]'
+end
+# Element for "%r", exposed as :req_firstline.
+#
+# The request method, URL path, query string and protocol can all be derived from its value.
+class ReqFirstlineElement < LogFormatElement
+    @abbrev = "%r"
+    @name = :req_firstline
+    @regex = %q![^"]+!
+    @_derivation_regex = nil
+    # Derives the named element (e.g. :url_path) from the value of the first line of a request.
+    #
+    # Returns nil if the name is not one of the derivable elements or if the value does not
+    # look like "<method> <path><query string> <protocol>".
+    def self.derive(name, our_own_value)
+        # "\\s" is needed to get the regex whitespace class: inside a double-quoted string a
+        # bare "\s" is just a space character.  The regex is compiled once and kept.
+        @_derivation_regex ||= Regexp.compile("^(#{ReqMethodElement.regex})\\s+(#{UrlPathElement.regex})(#{QueryStringElement.regex})\\s+(#{ProtocolElement.regex})$")
+        return nil unless our_own_value =~ @_derivation_regex
+        derived = {
+            ReqMethodElement.name => $1,
+            UrlPathElement.name => $2,
+            QueryStringElement.name => $3,
+            ProtocolElement.name => $4
+        }
+        derived[name]
+    end
+    # Returns the element classes whose values can be derived from a request's first line
+    def derived_elements
+        [ReqMethodElement, UrlPathElement, QueryStringElement, ProtocolElement]
+    end
+end
+# Element for "%s", exposed as :status; matches a run of digits or a single hyphen
+class StatusElement < LogFormatElement
+    self.abbrev = "%s"
+    self.name = :status
+    self.regex = '\d+|-'
+end
+# Element for "%b", exposed as :bytes_sent.
+#
+# The value is CLF-formatted (a hyphen stands for zero), hence the regex that lets hyphens
+# through and the CLFIntegerCast caster.
+class BytesSentElement < LogFormatElement
+    @abbrev = "%b"
+    @name = :bytes_sent
+    @regex = %q![\d-]+!
+    # Has to be a class-level instance variable: that is what LogFormatElement#cast reads
+    @_caster = CLFIntegerCast
+end
+# Element for "%O", exposed as :bytes_sent_with_headers; matches a run of digits
+class BytesSentWithHeadersElement < LogFormatElement
+    @abbrev = "%O"
+    @name = :bytes_sent_with_headers
+    @regex = %q!\d+!
+    # Has to be a class-level instance variable: that is what LogFormatElement#cast reads
+    @_caster = IntegerCast
+end
+# Element for "%D", exposed as :serve_time_micro; matches a run of digits
+class ServeTimeMicroElement < LogFormatElement
+    @abbrev = "%D"
+    @name = :serve_time_micro
+    @regex = %q!\d+!
+    # Has to be a class-level instance variable: that is what LogFormatElement#cast reads
+    @_caster = IntegerCast
+end
+# Element for "%U", exposed as :url_path; matches a "/" followed by anything but "?"
+class UrlPathElement < LogFormatElement
+    self.abbrev = "%U"
+    self.name = :url_path
+    self.regex = '/[^?]*'
+end
+# Element for "%q", exposed as :query_string; matches an optional "?" followed by any run
+# of non-whitespace characters (possibly none at all)
+class QueryStringElement < LogFormatElement
+    self.abbrev = "%q"
+    self.name = :query_string
+    self.regex = '\??\S*'
+end
+# Element for "%m", exposed as :req_method; matches a run of capital letters
+class ReqMethodElement < LogFormatElement
+    self.abbrev = "%m"
+    self.name = :req_method
+    self.regex = '[A-Z]+'
+end
+# Element for "%H", exposed as :protocol; matches a run of non-whitespace characters
+class ProtocolElement < LogFormatElement
+    self.abbrev = "%H"
+    self.name = :protocol
+    self.regex = '\S+'
+end
+# Element for an HTTP request header.  It has no class-level defaults: abbrev, name and regex
+# are set on each instance by whoever creates it.
+class ReqheaderElement < LogFormatElement; end
+# Element matched by an arbitrary regex.  It has no class-level defaults: abbrev, name and
+# regex are set on each instance by whoever creates it.
+class RegexElement < LogFormatElement; end
+# Finds log format elements given information about them.
+class ElementDictionary
+    # Every element class that has a fixed abbreviation, each listed once
+    @@_ELEMENTS = [
+                    RemoteHostElement,
+                    LogNameElement,
+                    RemoteUserElement,
+                    TimeElement,
+                    ReqFirstlineElement,
+                    StatusElement,
+                    BytesSentElement,
+                    BytesSentWithHeadersElement,
+                    ServeTimeMicroElement,
+                    UrlPathElement,
+                    QueryStringElement,
+                    ReqMethodElement,
+                    ProtocolElement
+    ]
+    # Returns the LogFormatElement subclass with the given format-string abbreviation.
+    #
+    # If none exists, returns nil.
+    def self.find_by_abbrev(abbrev)
+        @@_ELEMENTS.find { |element| element.abbrev == abbrev }
+    end
+end
+# Generates LogFormatElement instances.
+#
+# Given an abbreviation, this class works out which LogFormatElement subclass is called for
+# and returns an instance of it.
+class LogFormatElementFactory
+    # Takes an Apache log format abbreviation and returns a corresponding LogFormatElement.
+    #
+    # Abbreviations known to the ElementDictionary come first; "%{Header-Name}i" then yields
+    # a request header element and "%{name:regex}r" a regex element.  Anything else raises
+    # a RuntimeError.
+    def from_abbrev(abbrev)
+        known_cls = ElementDictionary.find_by_abbrev(abbrev)
+        return known_cls.new if known_cls
+        case abbrev
+        when /^%\{([A-Za-z0-9-]+)\}i/
+            _reqheader_element(abbrev, $1)
+        when /^%\{(.*?):([^}]+)\}r/
+            _regex_element(abbrev, $1, $2)
+        else
+            raise "Unknown element format '#{abbrev}'"
+        end
+    end
+    # Returns a format element for the HTTP request header with the given name; its value is
+    # matched as any run of characters other than a double quote
+    def _reqheader_element(abbrev, header_name)
+        element = ReqheaderElement.new
+        element.name = _header_name_to_element_name(header_name)
+        element.regex = '[^"]*'
+        element.abbrev = abbrev
+        element
+    end
+    # Returns a format element matched by the given regex and named :regex_<regex_name>
+    def _regex_element(abbrev, regex_name, regex)
+        element = RegexElement.new
+        element.name = "regex_#{regex_name}".to_sym
+        element.regex = regex
+        element.abbrev = abbrev
+        element
+    end
+    # Lowercases the header name, turns hyphens into underscores and prefixes "reqheader_"
+    def _header_name_to_element_name(header_name)
+        "reqheader_#{header_name.downcase.tr("-", "_")}".to_sym
+    end
+end

data/lib/procedure_dsl.rb ADDED Viewed

@@ -0,0 +1,308 @@
+# Abstract base class for a procedure routine.
+#
+# A routine walks the entries handed out by a log parser; while it does, the blocks passed
+# to it can read the parameters of the current entry as if they were methods.
+class ProcedureRoutine
+    def initialize(log_parser)
+        @_log_parser = log_parser
+        @_current_entry = nil
+    end
+    # Allows blocks passed to a DSL routine to access parameters from the current log entry.
+    #
+    # Any unknown method name is looked up in the current entry (arguments are ignored).  When
+    # there is no current entry the call is a plain NoMethodError naming the missing method.
+    def method_missing(sym, *args)
+        return super if @_current_entry.nil?
+        @_current_entry[sym]
+    end
+    # Reports the names for which the current entry holds a value, so that respond_to?
+    # agrees with method_missing.
+    def respond_to_missing?(sym, include_private = false)
+        (!@_current_entry.nil? && !@_current_entry[sym].nil?) || super
+    end
+    # Executes the DSL routine using the given block
+    #
+    # Abstract method
+    def execute(&blk)
+        raise "Not implemented"
+    end
+    # Anything that needs to happen after the routine completes but before it returns its
+    # result can go in here.  By default the log parser is reset.
+    def finish
+        @_log_parser.reset
+    end
+end
+# DSL routine that counts the log entries for which the block evaluates to true
+class CountWhere < ProcedureRoutine
+    # Evaluates the block against every entry and returns how many times it was truthy
+    def execute(&blk)
+        matches = 0
+        while @_current_entry = @_log_parser.next_entry
+            matches += 1 if instance_eval(&blk)
+        end
+        matches
+    end
+end
+# DSL routine that runs the block once for every log entry
+class Each < ProcedureRoutine
+    # Evaluates the block against each entry in turn; the block's results are discarded
+    def execute(&blk)
+        instance_eval(&blk) while @_current_entry = @_log_parser.next_entry
+    end
+end
+# DSL routine(s) that keep(s) only the entries for which the given block evaluates to true
+#
+# Called as 'filter()', the filtered entries go to a temporary file; called as 'filter(path)',
+# they go to the given file.  Called as 'filter!()', the filtering happens in place, clobbering
+# what's in apachecrunch's target file.
+class Filter < ProcedureRoutine
+    # Writes the text of every entry for which the block is truthy to the results file
+    def execute(path=nil, in_place=false, &blk)
+        @_in_place = in_place
+        @_results_file = _make_results_file(path, in_place)
+        while @_current_entry = @_log_parser.next_entry
+            @_results_file.write(@_current_entry[:text]) if instance_eval(&blk)
+        end
+    end
+    # Hands the results file over to the log parser as its new target
+    def finish
+        @_log_parser.replace_target(@_results_file, @_in_place)
+    end
+    # Returns a writable file object to which the results of the filter should be written:
+    # a temp file when no path is passed (which includes filtering in place), and the file
+    # at 'path', opened for writing, otherwise.
+    def _make_results_file(path, in_place)
+        path.nil? ? Tempfile.new("apachecrunch") : open(path, "w")
+    end
+end
+# DSL routine that tallies the entries by the value the given block takes for them
+#
+# You might for instance run this with the block { status }, and you'd get back something like
+# {"200" => 941, "301" => 41, "404" => 2}
+class CountBy < ProcedureRoutine
+    # Returns a hash mapping each value the block evaluated to onto its number of entries
+    def execute(&blk)
+        counts = {}
+        while @_current_entry = @_log_parser.next_entry
+            val = instance_eval(&blk)
+            counts[val] = counts.fetch(val, 0) + 1
+        end
+        counts
+    end
+end
+# DSL routine that finds the distribution of (numeric) values to which the given block evaluates
+#
+# For example,
+#
+#     distribution 100 do
+#         bytes_sent
+#     end
+#
+# would return a hash with keys from 0 up by multiples of 100, the value of each being the number
+# of entries for which bytes_sent is between that key and the next key.
+class Distribution < ProcedureRoutine
+    # Returns the distribution hash; it is empty when the log holds no entries.
+    #
+    # Raises ArgumentError unless bucket_width is positive.
+    def execute(bucket_width, &blk)
+        raise ArgumentError, "bucket_width must be positive" unless bucket_width > 0
+        dist = {}
+        while @_current_entry = @_log_parser.next_entry
+            k = _key_for(instance_eval(&blk), bucket_width)
+            dist[k] = dist.fetch(k, 0) + 1
+        end
+        # Without entries there is no largest key to step up to, and nothing to backfill
+        return dist if dist.empty?
+        # Backfill keys for which we didn't find a value
+        0.step(dist.keys.max, bucket_width) do |k|
+            dist[k] = 0 unless dist.key?(k)
+        end
+        dist
+    end
+    # Determines the key for the distribution hash given the value (converted with to_i) and
+    # the bucket width
+    def _key_for(val, bucket_width)
+        (val.to_i / bucket_width) * bucket_width
+    end
+end
+# Same as Distribution, but the buckets get exponentially wider: the keys are the powers of
+# width_base (1, width_base, width_base squared, ...), plus the key 0 for values below 1.
+class LogDistribution < ProcedureRoutine
+    # Returns the distribution hash; it is empty when the log holds no entries.
+    #
+    # Raises ArgumentError unless width_base is greater than 1.
+    def execute(width_base, &blk)
+        raise ArgumentError, "width_base must be greater than 1" unless width_base > 1
+        dist = {}
+        while @_current_entry = @_log_parser.next_entry
+            k = _key_for(instance_eval(&blk), width_base)
+            dist[k] = dist.fetch(k, 0) + 1
+        end
+        # Backfill keys for which we didn't find a value.  Only the positive keys take part:
+        # multiplying would never get us away from the key 0.
+        positive_keys = dist.keys.select { |key| key > 0 }
+        unless positive_keys.empty?
+            k = positive_keys.min
+            max_key = positive_keys.max
+            while (k *= width_base) < max_key
+                dist[k] = 0 unless dist.key?(k)
+            end
+        end
+        dist
+    end
+    # Determines the key for the distribution hash given the value (converted with to_f) and
+    # the logarithmic base for the bucket width: the largest power of width_base that does not
+    # exceed the value, or 0 when the value is below 1.
+    #
+    # The power is found by repeated multiplication because dividing floating-point logarithms
+    # can land just short of a whole number and put exact powers in the bucket below.
+    def _key_for(val, width_base)
+        number = val.to_f
+        raise ArgumentError, "cannot find a bucket for #{val.inspect}" unless number.finite?
+        return 0 if number < 1
+        key = 1
+        key *= width_base while key * width_base <= number
+        key
+    end
+end
+# DSL routine that determines a confidence interval for the values to which the block evaluates
+#
+# For example,
+#
+#     confidence_interval 95 do
+#         time_to_serve
+#     end
+#
+# would return two numbers, the lower and upper bound of a 95% confidence interval for the values
+# of time_to_serve.
+class ConfidenceInterval < ProcedureRoutine
+    # Returns the lower and the upper bound, found by dropping the same number of values from
+    # both ends of the sorted list of values.  Both bounds are nil when there are no entries.
+    #
+    # Raises ArgumentError unless confidence lies between 0 and 100.
+    def execute(confidence, &blk)
+        raise ArgumentError, "confidence must be between 0 and 100" unless confidence >= 0 && confidence <= 100
+        # Build a list of all the values found
+        values = []
+        while @_current_entry = @_log_parser.next_entry
+            values << instance_eval(&blk)
+        end
+        values.sort!
+        # Determine how many values are outside the bounds of the CI.  Working from
+        # (100 - confidence), and rounding before truncating, keeps floating-point error from
+        # turning e.g. 10% of 100 values into 9.
+        count_outside = (values.length * (100 - confidence) / 100.0).round(6).to_i
+        # Drop half of them from each end
+        trim = count_outside / 2
+        return values[trim], values[-(trim + 1)]
+    end
+end
+# DSL routine that finds the most common n values for the given block.
+#
+# Returns a list of lists, each of which is [value, count].  This list is sorted by count,
+# largest first, and holds at most n items.
+class MostCommon < ProcedureRoutine
+    def execute(n, &blk)
+        counts = CountBy.new(@_log_parser).execute(&blk)
+        # Sorting on the negated count puts the most common values first
+        counts.sort_by { |_val, count| -count }.first(n)
+    end
+end
+# The environment in which a procedure file is evaluated.
+#
+# A procedure file is some ruby code that uses our DSL; each DSL routine is a method of this
+# class that runs the matching ProcedureRoutine against the log parser.
+class ProcedureEnvironment
+    def initialize(log_parser)
+        @_log_parser = log_parser
+    end
+    # Evaluates the given string as a procedure in our DSL
+    def eval_procedure(proc_string)
+        eval(proc_string)
+    end
+    # Runs one routine from start to finish: instantiates routine_cls on our log parser,
+    # executes it with the given arguments and block, lets it finish, and returns what
+    # 'execute' returned.
+    def _run(routine_cls, *args, &blk)
+        routine = routine_cls.new(@_log_parser)
+        rv = routine.execute(*args, &blk)
+        routine.finish
+        rv
+    end
+    # DSL routine 'count_where'
+    def count_where(&blk)
+        _run(CountWhere, &blk)
+    end
+    # DSL routine 'filter!' (filters in place; returns nil)
+    def filter!(&blk)
+        _run(Filter, nil, true, &blk)
+        nil
+    end
+    # DSL routine 'filter' (returns nil)
+    def filter(target_path=nil, &blk)
+        _run(Filter, target_path, &blk)
+        nil
+    end
+    # DSL routine 'each' (returns nil)
+    def each(&blk)
+        _run(Each, &blk)
+        nil
+    end
+    # DSL routine 'count_by'
+    def count_by(&blk)
+        _run(CountBy, &blk)
+    end
+    # DSL routine 'distribution'
+    def distribution(bucket_width, &blk)
+        _run(Distribution, bucket_width, &blk)
+    end
+    # DSL routine 'log_distribution'
+    def log_distribution(width_base, &blk)
+        _run(LogDistribution, width_base, &blk)
+    end
+    # DSL routine 'confidence_interval'
+    def confidence_interval(confidence, &blk)
+        _run(ConfidenceInterval, confidence, &blk)
+    end
+    # DSL routine 'most_common'
+    def most_common(n, &blk)
+        _run(MostCommon, n, &blk)
+    end
+end

data/lib/progress.rb ADDED Viewed

@@ -0,0 +1,65 @@
+# Base class for the progress meters; keeps the count of entries seen so far
+class ProgressMeter
+    # Starts the entry count at zero
+    def initialize; @_entry_count = 0; end
+end
+# Progress meter that reports how many entries have been parsed, once every (n) entries.
+class EntryCountProgressMeter < ProgressMeter
+    def initialize
+        super
+        # 'period' is how many entries go by between two outputs.  So if 'period' is 10 000,
+        # we'll print output every 10 000 lines.
+        @_period = 10000
+    end
+    # Counts one more entry and, each time a full period has gone by, prints the number of
+    # entries parsed so far.
+    #
+    # 'entry' should be the latest log entry to be parsed; it is not looked at here.
+    def output_progress(entry)
+        @_entry_count += 1
+        return unless (@_entry_count % @_period).zero?
+        puts format("Processed %d entries", @_entry_count)
+    end
+end
+# Progress meter that prints the time of the latest entry parsed every (n) entries.
+class TimeProgressMeter < ProgressMeter
+    def initialize
+        # 'period' is how many entries we wait between printing output.  So if 'period' is 10 000,
+        # we'll print output every 10 000 lines.
+        @_period = 10000
+        super
+    end
+    # Outputs the time of the latest entry parsed (every once in a while).
+    #
+    # 'entry' should be the latest log entry to be parsed; its time is read from the :time key
+    # (element names are symbols).
+    def output_progress(entry)
+        @_entry_count += 1
+        if @_entry_count % @_period == 0
+            puts "Processed through %s" % [entry[:time]]
+        end
+    end
+end
+# Progress meter that never outputs anything
+class NullProgressMeter < ProgressMeter
+    # Accepts the latest entry and does nothing with it
+    def output_progress(entry); end
+end
+# Builds the progress meters that output progress info to the user.
+class ProgressMeterFactory
+    # Builds a progress meter from a hash containing the options passed on the command line.
+    #
+    # options[:progress] picks the meter: "entry" or "time"; any other value (nil included)
+    # gets a meter that outputs nothing.
+    def self.from_options(options)
+        meters = {"entry" => EntryCountProgressMeter, "time" => TimeProgressMeter}
+        meters.fetch(options[:progress], NullProgressMeter).new
+    end
+end

metadata ADDED Viewed

@@ -0,0 +1,74 @@
+--- !ruby/object:Gem::Specification
+name: apachecrunch
+version: !ruby/object:Gem::Version
+  hash: 9
+  prerelease:
+  segments:
+  - 0
+  - 1
+  version: "0.1"
+platform: ruby
+authors:
+- Dan Slimmon
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-07-09 00:00:00 -04:00
+default_executable:
+dependencies: []
+description: |-
+  Apache Crunch is an analysis tool for Apache logs.  You write little scripts
+  to do the analysis, using our DSL to make the procedure as simple and readable
+  as possible.  See our homepage for more details.
+email: dan@danslimmon.com
+executables:
+- apachecrunch
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/apachecrunch.rb
+- lib/log_element.rb
+- lib/procedure_dsl.rb
+- lib/progress.rb
+- bin/apachecrunch
+- LICENSE
+has_rdoc: true
+homepage: https://github.com/danslimmon/apachecrunch/
+licenses:
+- Creative Commons Share-Alike
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.6.2
+signing_key:
+specification_version: 3
+summary: Apache log analysis tool designed for ease of use
+test_files: []