RubyGems - aggregate - Versions diffs - 0.1.2 - Mend

aggregate 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/LICENSE ADDED

@@ -0,0 +1,22 @@
+Copyright (c) 2009 Joseph Ruscio
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.

data/README ADDED

	@@ -0,0 +1,2 @@
1	+ Aggregate is a Ruby implementation of a statistics aggregator, including histogram support
2	+

data/Rakefile ADDED

@@ -0,0 +1,15 @@
+require 'rake'
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gemspec|
+    gemspec.name = "aggregate"
+    gemspec.summary = "Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support"
+    gemspec.description = "Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support"
+    gemspec.email = "jruscio@gmail.com"
+    gemspec.homepage = "http://github.com/josephruscio/aggregate"
+    gemspec.authors = ["Joseph Ruscio"]
+  end
+rescue LoadError
+  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
+end

data/VERSION ADDED

	@@ -0,0 +1 @@
1	+ 0.1.2

data/aggregate.gemspec ADDED

@@ -0,0 +1,46 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE
+# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
+# -*- encoding: utf-8 -*-
+# Gem specification for aggregate 0.1.2: package metadata, the list of
+# packaged files, rdoc options and the load path ("lib").
+Gem::Specification.new do |s|
+  s.name = %q{aggregate}
+  s.version = "0.1.2"
+  # Only set when the running RubyGems exposes this setter.
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Joseph Ruscio"]
+  s.date = %q{2009-08-16}
+  s.description = %q{Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support}
+  s.email = %q{jruscio@gmail.com}
+  s.extra_rdoc_files = [
+    "LICENSE",
+     "README"
+  ]
+  # Every file shipped in the gem package.
+  s.files = [
+    "LICENSE",
+     "README",
+     "Rakefile",
+     "VERSION",
+     "aggregate.gemspec",
+     "lib/aggregate.rb",
+     "test/ts_aggregate.rb"
+  ]
+  s.homepage = %q{http://github.com/josephruscio/aggregate}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.3}
+  s.summary = %q{Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support}
+  s.test_files = [
+    "test/ts_aggregate.rb"
+  ]
+  # Declare specification format version 3 when the running RubyGems
+  # supports it. The nested branches below are empty; presumably this is
+  # where the generator would emit dependency declarations, and this gem
+  # declares none (TODO confirm against jeweler's template).
+  if s.respond_to? :specification_version then
+    # NOTE(review): current_version is assigned but never read in this file.
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+    else
+    end
+  else
+  end
+end

data/lib/aggregate.rb ADDED

@@ -0,0 +1,277 @@
+# Implements aggregate statistics and maintains
+# configurable histogram for a set of given samples. Convenient for tracking
+# high throughput data.
+class Aggregate
+  #The current average of all samples
+  attr_reader :mean
+  #The current number of samples
+  attr_reader :count
+  #The maximum sample value
+  attr_reader :max
+  #The minimum samples value
+  attr_reader :min
+  #The sum of all samples
+  attr_reader :sum
+  #The number of samples falling below the lowest valued histogram bucket
+  attr_reader :outliers_low
+  #The number of samples falling above the highest valued histogram bucket
+  attr_reader :outliers_high
+  # The number of buckets in the binary logarithmic histogram (low => 2**0, high => 2**@@LOG_BUCKETS)
+  @@LOG_BUCKETS = 128
+  # Create a new Aggregate that maintains a binary logarithmic histogram
+  # by default. Specifying values for low, high, and width configures
+  # the aggregate to maintain a linear histogram with (high - low)/width buckets
+  def initialize (low=nil, high=nil, width=nil)
+    @count = 0
+    @sum = 0.0
+    @sum2 = 0.0
+    @outliers_low = 0
+    @outliers_high = 0
+    # If the user asks we maintain a linear histogram
+    if (nil != low && nil != high && nil != width)
+      #Validate linear specification
+      if high <= low
+	raise ArgumentError, "High bucket must be > Low bucket"
+      end
+      if high - low < width
+        raise ArgumentError, "Histogram width must be <= histogram range"
+      end
+      @low = low
+      @high = high
+      @width = width
+    else
+      @low = 1
+      @high = to_bucket(@@LOG_BUCKETS - 1)
+    end
+    #Initialize all buckets to 0
+    @buckets = Array.new(bucket_count, 0)
+  end
+  # Include a sample in the aggregate
+  def << data
+    # Update min/max
+    if 0 == @count
+      @min = data
+      @max = data
+    else
+      @max = [data, @max].max
+      @min = [data, @min].min
+    end
+    # Update the running info
+    @count += 1
+    @sum += data
+    @sum2 += (data * data)
+    # Update the bucket
+    @buckets[to_index(data)] += 1 unless outlier?(data)
+  end
+  def mean
+    @sum / @count
+  end
+  #Calculate the standard deviation
+  def std_dev
+    Math.sqrt((@sum2.to_f - ((@sum.to_f * @sum.to_f)/@count.to_f)) / (@count.to_f - 1))
+  end
+  # Combine two aggregates
+  #def +(b)
+  #  a = self
+  #  c = Aggregate.new
+  #  c.count = a.count + b.count
+  #end
+  #Generate a pretty-printed ASCII representation of the histogram
+  def to_s(columns=nil)
+    #default to an 80 column terminal, don't support < 80 for now
+    if nil == columns
+      columns = 80
+    else
+      raise ArgumentError if columns < 80
+    end
+    #Find the largest bucket and create an array of the rows we intend to print
+    disp_buckets = Array.new
+    max_count = 0
+    total = 0
+    @buckets.each_with_index do |count, idx|
+      next if 0 == count
+      max_count = [max_count, count].max
+      disp_buckets << [idx, to_bucket(idx), count]
+      total += count
+    end
+    #Figure out how wide the value and count columns need to be based on their
+    #largest respective numbers
+    value_str = "value"
+    count_str = "count"
+    total_str = "Total"
+    value_width = [disp_buckets.last[1].to_s.length, value_str.length].max
+    value_width = [value_width, total_str.length].max
+    count_width = [total.to_s.length, count_str.length].max
+    max_bar_width  = columns - (value_width + " |".length + "| ".length + count_width)
+    #Determine the value of a '@'
+    weight = [max_count.to_f/max_bar_width.to_f, 1.0].max
+    #format the header
+    histogram = sprintf("%#{value_width}s |", value_str)
+    max_bar_width.times { histogram << "-"}
+    histogram << sprintf("| %#{count_width}s\n", count_str)
+    # We denote empty buckets with a '~'
+    def skip_row(value_width)
+      sprintf("%#{value_width}s ~\n", " ")
+    end
+    #Loop through each bucket to be displayed and output the correct number
+    prev_index = disp_buckets[0][0] - 1
+    disp_buckets.each do |x|
+      #Denote skipped empty buckets with a ~
+      histogram << skip_row(value_width) unless prev_index == x[0] - 1
+      prev_index = x[0]
+      #Add the value
+      row = sprintf("%#{value_width}d |", x[1])
+      #Add the bar
+      bar_size = (x[2]/weight).to_i
+      bar_size.times { row += "@"}
+      (max_bar_width - bar_size).times { row += " " }
+      #Add the count
+      row << sprintf("| %#{count_width}d\n", x[2])
+      #Append the finished row onto the histogram
+      histogram << row
+    end
+    #End the table
+    histogram << skip_row(value_width) if disp_buckets.last[0] != bucket_count-1
+    histogram << sprintf("%#{value_width}s", "Total")
+    histogram << " |"
+    max_bar_width.times {histogram << "-"}
+    histogram << "| "
+    histogram << sprintf("%#{count_width}d\n", total)
+  end
+  #Iterate through each bucket in the histogram regardless of
+  #its contents
+  def each
+    @buckets.each_with_index do |count, index|
+      yield(to_bucket(index), count)
+    end
+  end
+  #Iterate through only the buckets in the histogram that contain
+  #samples
+  def each_nonzero
+    @buckets.each_with_index do |count, index|
+      yield(to_bucket(index), count) if count != 0
+    end
+  end
+  private
+  def linear?
+    nil != @width
+  end
+  def outlier? (data)
+    if data < @low
+      @outliers_low += 1
+    elsif data > @high
+      @outliers_high += 1
+    else
+      return false
+    end
+  end
+  def bucket_count
+    if linear?
+      return (@high-@low)/@width
+    else
+      return @@LOG_BUCKETS
+    end
+  end
+  def to_bucket(index)
+    if linear?
+      return @low + (index * @width)
+    else
+      return 2**(index)
+    end
+  end
+  def right_bucket? index, data
+    # check invariant
+    raise unless linear?
+    bucket = to_bucket(index)
+    #It's the right bucket if data falls between bucket and next bucket
+    bucket <= data && data < bucket + @width
+  end
+=begin
+  def find_bucket(lower, upper, target)
+    #Classic binary search
+    return upper if right_bucket?(upper, target)
+    # Cut the search range in half
+    middle = (upper/2).to_i
+    # Determine which half contains our value and recurse
+    if (to_bucket(middle) >= target)
+      return find_bucket(lower, middle, target)
+    else
+      return find_bucket(middle, upper, target)
+    end
+  end
+=end
+  # A data point is added to the bucket[n] where the data point
+  # is less than the value represented by bucket[n], but greater
+  # than the value represented by bucket[n+1]
+  def to_index (data)
+    # basic case is simple
+    return log2(data).to_i if !linear?
+    # Search for the right bucket in the linear case
+    @buckets.each_with_index do |count, idx|
+      return idx if right_bucket?(idx, data)
+    end
+    #find_bucket(0, bucket_count-1, data)
+    #Should not get here
+    raise "#{data}"
+  end
+  # log2(x) returns j, | i = j-1 and 2**i <= data < 2**j
+  def log2( x )
+   Math.log(x) / Math.log(2)
+  end
+end

data/test/ts_aggregate.rb ADDED

@@ -0,0 +1,145 @@
+require 'test/unit'
+require 'lib/aggregate'
+class SimpleStatsTest < Test::Unit::TestCase
+  def setup
+    @stats = Aggregate.new
+    @@DATA.each do |x|
+      @stats << x
+    end
+  end
+  def test_stats_count
+    assert_equal @@DATA.length, @stats.count
+  end
+  def test_stats_min_max
+    sorted_data = @@DATA.sort
+    assert_equal sorted_data[0], @stats.min
+    assert_equal sorted_data.last, @stats.max
+  end
+  def test_stats_mean
+    sum = 0
+    @@DATA.each do |x|
+      sum += x
+    end
+    assert_equal sum.to_f/@@DATA.length.to_f, @stats.mean
+  end
+  def test_bucket_counts
+    #Test each iterator
+    total_bucket_sum = 0
+    i = 0
+    @stats.each do |bucket, count|
+      assert_equal 2**i, bucket
+      total_bucket_sum += count
+      i += 1
+    end
+    assert_equal total_bucket_sum, @@DATA.length
+    #Test each_nonzero iterator
+    prev_bucket = 0
+    total_bucket_sum = 0
+    @stats.each_nonzero do |bucket, count|
+      assert bucket > prev_bucket
+      assert_not_equal count, 0
+      total_bucket_sum += count
+    end
+    assert_equal total_bucket_sum, @@DATA.length
+  end
+=begin
+  def test_addition
+    stats1 = Aggregate.new
+    stats2 = Aggregate.new
+    stats1 << 1
+    stats2 << 3
+    stats_sum = stats1 + stats2
+    assert_equal stats_sum.count, stats1.count + stats2.count
+  end
+=end
+  #XXX: Update test_bucket_contents() if you muck with @@DATA
+  @@DATA = [ 1, 5, 4, 6, 1028, 1972, 16384, 16385, 16383 ]
+  def test_bucket_contents
+    #XXX: This is the only test so far that cares about the actual contents
+    # of @@DATA, so if you update that array ... update this method too
+    expected_buckets  = [1, 4, 1024, 8192, 16384]
+    expected_counts =   [1, 3,    2,    1,     2]
+    i = 0
+    @stats.each_nonzero do |bucket, count|
+      assert_equal expected_buckets[i], bucket
+      assert_equal expected_counts[i],  count
+      # Increment for the next test
+      i += 1
+    end
+  end
+  def test_histogram
+    puts @stats.to_s
+  end
+  def test_outlier
+    assert_equal 0, @stats.outliers_low
+    assert_equal 0, @stats.outliers_high
+    @stats << -1
+    @stats << -2
+    @stats << 2**129
+    assert_equal 2, @stats.outliers_low
+    assert_equal 1, @stats.outliers_high
+  end
+  def test_std_dev
+    @stats.std_dev
+  end
+end
+class LinearHistogramTest < Test::Unit::TestCase
+  def setup
+    @stats = Aggregate.new(0, 32768, 1024)
+    @@DATA.each do |x|
+      @stats << x
+    end
+  end
+  def test_validation
+    assert_raise(ArgumentError) {bad_stats = Aggregate.new(32,32,4)}
+    assert_raise(ArgumentError) {bad_stats = Aggregate.new(32,16,4)}
+    assert_raise(ArgumentError) {bad_stats = Aggregate.new(16,32,17)}
+  end
+  #XXX: Update test_bucket_contents() if you muck with @@DATA
+  @@DATA = [ 1, 5, 4, 6, 1028, 1972, 16384, 16385, 16383 ]
+  def test_bucket_contents
+    #XXX: This is the only test so far that cares about the actual contents
+    # of @@DATA, so if you update that array ... update this method too
+    expected_buckets  = [0, 1024,  15360, 16384]
+    expected_counts =   [4, 2,     1,     2]
+    i = 0
+    @stats.each_nonzero do |bucket, count|
+      assert_equal expected_buckets[i], bucket
+      assert_equal expected_counts[i],  count
+      # Increment for the next test
+      i += 1
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,62 @@
+--- !ruby/object:Gem::Specification
+name: aggregate
+version: !ruby/object:Gem::Version
+  version: 0.1.2
+platform: ruby
+authors:
+- Joseph Ruscio
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2009-08-16 00:00:00 -07:00
+default_executable:
+dependencies: []
+description: Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support
+email: jruscio@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README
+files:
+- LICENSE
+- README
+- Rakefile
+- VERSION
+- aggregate.gemspec
+- lib/aggregate.rb
+- test/ts_aggregate.rb
+has_rdoc: true
+homepage: http://github.com/josephruscio/aggregate
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.3
+signing_key:
+specification_version: 3
+summary: Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support
+test_files:
+- test/ts_aggregate.rb