aggregate 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2009 Joseph Ruscio
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,2 @@
1
+ Aggregate is a Ruby implementation of a statistics aggregator that includes histogram support.
2
+
@@ -0,0 +1,15 @@
1
# Rake entry point for the aggregate gem.
require 'rake'

# Jeweler is optional: when it is installed we register its gem tasks
# (Jeweler::Tasks) with the metadata below; when it is missing we only
# print an install hint so the rest of rake keeps working.
begin
  require 'jeweler'

  # The summary and description share the same one-line blurb.
  blurb = "Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support"

  Jeweler::Tasks.new do |spec|
    spec.name        = "aggregate"
    spec.summary     = blurb
    spec.description = blurb
    spec.email       = "jruscio@gmail.com"
    spec.homepage    = "http://github.com/josephruscio/aggregate"
    spec.authors     = ["Joseph Ruscio"]
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.2
@@ -0,0 +1,46 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
# -*- encoding: utf-8 -*-
#
# NOTE(review): the generated tail of this file compared
# Gem::RubyGemsVersion against '1.2.0' with two empty branches and stored
# an unused `current_version` local. That conditional did nothing except
# reference a constant that current RubyGems releases no longer define,
# so it has been dropped. Regenerating with jeweler will overwrite this.

Gem::Specification.new do |s|
  s.name = %q{aggregate}
  s.version = "0.1.2"

  # Guarded because very old RubyGems lack this setter.
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Joseph Ruscio"]
  s.date = %q{2009-08-16}
  s.description = %q{Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support}
  s.email = %q{jruscio@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README"
  ]
  s.files = [
    "LICENSE",
    "README",
    "Rakefile",
    "VERSION",
    "aggregate.gemspec",
    "lib/aggregate.rb",
    "test/ts_aggregate.rb"
  ]
  s.homepage = %q{http://github.com/josephruscio/aggregate}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.3}
  s.summary = %q{Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support}
  s.test_files = [
    "test/ts_aggregate.rb"
  ]

  # Only set the spec format version where the running RubyGems knows about it.
  s.specification_version = 3 if s.respond_to? :specification_version
end
@@ -0,0 +1,277 @@
1
# Implements aggregate statistics and maintains a configurable histogram
# for a set of given samples. Convenient for tracking high throughput data.
#
# Two histogram layouts are supported:
#
# * binary logarithmic (the default): LOG_BUCKETS buckets, where bucket i
#   covers samples in [2**i, 2**(i + 1)). Samples below 1 are counted as
#   low outliers and samples >= 2**LOG_BUCKETS as high outliers.
# * linear: buckets of +width+ covering [low, high). Samples below +low+
#   are low outliers and samples >= +high+ are high outliers.
#
# Outliers still contribute to count, sum, min, max, mean and std_dev;
# they are only excluded from the histogram buckets.
#
# TODO: combining two aggregates (Aggregate#+) is not implemented.
class Aggregate
  # The number of buckets in the binary logarithmic histogram
  # (lowest bucket => 2**0, upper bound => 2**LOG_BUCKETS)
  LOG_BUCKETS = 128

  # The current number of samples
  attr_reader :count

  # The maximum sample value (nil until the first sample is added)
  attr_reader :max

  # The minimum sample value (nil until the first sample is added)
  attr_reader :min

  # The sum of all samples
  attr_reader :sum

  # The number of samples falling below the lowest valued histogram bucket
  attr_reader :outliers_low

  # The number of samples falling at or above the upper bound of the histogram
  attr_reader :outliers_high

  # Create a new Aggregate that maintains a binary logarithmic histogram
  # by default. Specifying values for low, high, and width configures
  # the aggregate to maintain a linear histogram whose buckets are +width+
  # wide and cover [low, high). If (high - low) is not a multiple of width
  # the final bucket is narrower than the others.
  #
  # All three arguments must be given to get a linear histogram; if any of
  # them is nil the logarithmic histogram is used.
  #
  # Raises ArgumentError if high <= low, if width exceeds (high - low), or
  # if width is not positive.
  def initialize(low = nil, high = nil, width = nil)
    @count = 0
    @sum = 0.0
    @sum2 = 0.0
    @outliers_low = 0
    @outliers_high = 0
    @min = nil
    @max = nil

    # If the user asks we maintain a linear histogram
    if !low.nil? && !high.nil? && !width.nil?
      # Validate linear specification
      raise ArgumentError, "High bucket must be > Low bucket" if high <= low
      raise ArgumentError, "Histogram width must be <= histogram range" if high - low < width
      # A zero or negative width would yield a division by zero or a
      # negative bucket count further down.
      raise ArgumentError, "Histogram width must be > 0" unless width > 0

      @low = low
      @high = high
      @width = width
    else
      @low = 1
      # Exclusive upper bound: the last bucket covers [2**127, 2**128)
      @high = 2**LOG_BUCKETS
      @width = nil
    end

    # Initialize all buckets to 0
    @buckets = Array.new(bucket_count, 0)
  end

  # Include a sample in the aggregate.
  #
  # Returns the updated count of the bucket the sample landed in, or nil
  # when the sample was an outlier.
  def <<(data)
    # Update min/max
    if @count.zero?
      @min = data
      @max = data
    else
      @max = [data, @max].max
      @min = [data, @min].min
    end

    # Update the running info
    @count += 1
    @sum += data
    @sum2 += (data * data)

    # Update the bucket; outlier? bumps the outlier counters as a side effect
    @buckets[to_index(data)] += 1 unless outlier?(data)
  end

  # The current average of all samples (NaN when there are no samples)
  def mean
    @sum / @count
  end

  # Calculate the sample standard deviation (n - 1 denominator) from the
  # running sum and sum of squares. The result is NaN with fewer than two
  # samples.
  def std_dev
    samples = @count.to_f
    variance = (@sum2.to_f - ((@sum.to_f * @sum.to_f) / samples)) / (samples - 1)

    # Rounding can push a mathematically zero variance slightly below zero,
    # which would make Math.sqrt raise. NaN fails the comparison and is
    # passed through untouched.
    variance = 0.0 if variance < 0
    Math.sqrt(variance)
  end

  # Generate a pretty-printed ASCII representation of the histogram.
  #
  # columns is the terminal width to format for; it defaults to 80 and
  # widths below 80 raise ArgumentError. Only non-empty buckets get a row;
  # runs of empty buckets are collapsed into a single '~' row. An aggregate
  # with no bucketed samples renders the header, one '~' row and the total.
  def to_s(columns = nil)
    # default to an 80 column terminal, don't support < 80 for now
    if columns.nil?
      columns = 80
    elsif columns < 80
      raise ArgumentError, "columns must be >= 80"
    end

    # Find the largest bucket and collect the rows we intend to print
    rows = []
    max_count = 0
    total = 0
    @buckets.each_with_index do |count, idx|
      next if count.zero?

      max_count = count if count > max_count
      rows << [idx, to_bucket(idx), count]
      total += count
    end

    # Figure out how wide the value and count columns need to be based on
    # their largest respective numbers
    value_str = "value"
    count_str = "count"
    total_str = "Total"
    widest_value = rows.empty? ? 0 : rows.last[1].to_s.length
    value_width = [widest_value, value_str.length, total_str.length].max
    count_width = [total.to_s.length, count_str.length].max
    max_bar_width = columns - (value_width + " |".length + "| ".length + count_width)
    rule = "-" * [max_bar_width, 0].max

    # Determine the value of a '@'
    weight = [max_count.to_f / max_bar_width.to_f, 1.0].max

    # Format the header
    histogram = sprintf("%#{value_width}s |", value_str)
    histogram << rule
    histogram << sprintf("| %#{count_width}s\n", count_str)

    # Loop through each bucket to be displayed and output its row
    prev_index = rows.empty? ? nil : rows.first[0] - 1
    rows.each do |idx, value, count|
      # Denote skipped empty buckets with a ~
      histogram << skip_row(value_width) unless prev_index == idx - 1
      prev_index = idx

      bar_size = (count / weight).to_i
      histogram << sprintf("%#{value_width}d |", value)
      histogram << "@" * bar_size
      histogram << " " * [max_bar_width - bar_size, 0].max
      histogram << sprintf("| %#{count_width}d\n", count)
    end

    # End the table
    histogram << skip_row(value_width) if rows.empty? || rows.last[0] != bucket_count - 1
    histogram << sprintf("%#{value_width}s |", total_str)
    histogram << rule
    histogram << sprintf("| %#{count_width}d\n", total)
    histogram
  end

  # Iterate through each bucket in the histogram regardless of
  # its contents, yielding the bucket's lower bound and its sample count.
  def each
    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count)
    end
  end

  # Iterate through only the buckets in the histogram that contain
  # samples, yielding the bucket's lower bound and its sample count.
  def each_nonzero
    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count) if count != 0
    end
  end

  private

  # True when a linear histogram was configured
  def linear?
    !@width.nil?
  end

  # Returns true and bumps the matching outlier counter when data falls
  # outside [@low, @high); returns false otherwise.
  def outlier?(data)
    if data < @low
      @outliers_low += 1
      true
    elsif data >= @high
      @outliers_high += 1
      true
    else
      false
    end
  end

  # Number of histogram buckets. In the linear case a partial trailing
  # bucket is counted so that every value in [@low, @high) has a home.
  def bucket_count
    return LOG_BUCKETS unless linear?

    whole, remainder = (@high - @low).divmod(@width)
    remainder.zero? ? whole.to_i : whole.to_i + 1
  end

  # The lower bound of the bucket at index
  def to_bucket(index)
    if linear?
      @low + (index * @width)
    else
      2**index
    end
  end

  # We denote empty buckets with a '~'
  def skip_row(value_width)
    sprintf("%#{value_width}s ~\n", " ")
  end

  # A data point belongs to bucket[n] when it is greater than or equal to
  # the value represented by bucket[n] and less than the value represented
  # by bucket[n+1]. Callers must have ruled out outliers first.
  def to_index(data)
    index = linear? ? ((data - @low) / @width).floor : log_index(data)

    # Floating point rounding right at the edges must never escape the array
    [[index, 0].max, bucket_count - 1].min
  end

  # Index i of the logarithmic bucket such that 2**i <= data < 2**(i + 1).
  # The floating point logarithm is only an estimate; it is corrected with
  # exact comparisons against the bucket bounds.
  def log_index(data)
    index = log2(data).floor
    index -= 1 while index > 0 && 2**index > data
    index += 1 while 2**(index + 1) <= data
    index
  end

  # Base 2 logarithm of x (floating point, so possibly off by a rounding error)
  def log2(x)
    Math.log(x) / Math.log(2)
  end
end
@@ -0,0 +1,145 @@
1
require 'test/unit'
# Resolve the library relative to this file: "." is no longer on the load
# path in Ruby >= 1.9.2, so a bare require 'lib/aggregate' fails there.
require File.expand_path('../lib/aggregate', File.dirname(__FILE__))
3
+
4
# Exercises an Aggregate using the default binary logarithmic histogram.
class SimpleStatsTest < Test::Unit::TestCase
  # XXX: Update test_bucket_contents() if you muck with SAMPLES
  SAMPLES = [1, 5, 4, 6, 1028, 1972, 16384, 16385, 16383].freeze

  # Feed every sample into a fresh logarithmic aggregate before each test.
  def setup
    @stats = Aggregate.new
    SAMPLES.each { |x| @stats << x }
  end

  def test_stats_count
    assert_equal SAMPLES.length, @stats.count
  end

  def test_stats_min_max
    assert_equal SAMPLES.min, @stats.min
    assert_equal SAMPLES.max, @stats.max
  end

  def test_stats_mean
    total = SAMPLES.inject(0) { |acc, x| acc + x }

    assert_equal total.to_f / SAMPLES.length.to_f, @stats.mean
  end

  def test_bucket_counts
    # Test each iterator: every bucket is visited, in power-of-two order
    total_bucket_sum = 0
    i = 0
    @stats.each do |bucket, count|
      assert_equal 2**i, bucket

      total_bucket_sum += count
      i += 1
    end

    assert_equal SAMPLES.length, total_bucket_sum

    # Test each_nonzero iterator: strictly ascending, never an empty bucket
    prev_bucket = 0
    total_bucket_sum = 0
    @stats.each_nonzero do |bucket, count|
      assert bucket > prev_bucket
      assert_not_equal 0, count

      # Remember this bucket so the ordering check above is meaningful
      prev_bucket = bucket
      total_bucket_sum += count
    end

    assert_equal SAMPLES.length, total_bucket_sum
  end

  # TODO: re-enable once Aggregate#+ is implemented.
  # def test_addition
  #   stats1 = Aggregate.new
  #   stats2 = Aggregate.new
  #
  #   stats1 << 1
  #   stats2 << 3
  #
  #   stats_sum = stats1 + stats2
  #
  #   assert_equal stats_sum.count, stats1.count + stats2.count
  # end

  def test_bucket_contents
    # XXX: This is the only test so far that cares about the actual contents
    # of SAMPLES, so if you update that array ... update this method too
    expected_buckets = [1, 4, 1024, 8192, 16384]
    expected_counts = [1, 3, 2, 1, 2]

    i = 0
    @stats.each_nonzero do |bucket, count|
      assert_equal expected_buckets[i], bucket
      assert_equal expected_counts[i], count
      # Increment for the next test
      i += 1
    end

    # Make sure no expected bucket was silently skipped
    assert_equal expected_buckets.length, i
  end

  def test_histogram
    histogram = @stats.to_s

    assert_match(/^value \|-+\| count$/, histogram)
    assert_match(/^Total \|-+\|\s+#{SAMPLES.length}$/, histogram)
    assert histogram.include?("@"), "expected at least one bar in the histogram"
  end

  def test_outlier
    assert_equal 0, @stats.outliers_low
    assert_equal 0, @stats.outliers_high

    @stats << -1
    @stats << -2
    @stats << 2**129

    assert_equal 2, @stats.outliers_low
    assert_equal 1, @stats.outliers_high
  end

  def test_std_dev
    # Reference value from the two-pass sample standard deviation formula
    n = SAMPLES.length.to_f
    mean = SAMPLES.inject(0) { |acc, x| acc + x } / n
    squared_error = SAMPLES.inject(0.0) { |acc, x| acc + (x - mean)**2 }
    expected = Math.sqrt(squared_error / (n - 1))

    assert_in_delta expected, @stats.std_dev, 1e-6
  end
end
112
+
113
# Exercises an Aggregate configured with a linear histogram.
class LinearHistogramTest < Test::Unit::TestCase
  # XXX: Update test_bucket_contents() if you muck with SAMPLES
  SAMPLES = [1, 5, 4, 6, 1028, 1972, 16384, 16385, 16383].freeze

  # Feed every sample into a fresh linear aggregate (32 buckets of 1024).
  def setup
    @stats = Aggregate.new(0, 32768, 1024)
    SAMPLES.each { |x| @stats << x }
  end

  def test_validation
    # high == low, high < low, and width larger than the range
    assert_raise(ArgumentError) { Aggregate.new(32, 32, 4) }
    assert_raise(ArgumentError) { Aggregate.new(32, 16, 4) }
    assert_raise(ArgumentError) { Aggregate.new(16, 32, 17) }
  end

  def test_bucket_contents
    # XXX: This is the only test so far that cares about the actual contents
    # of SAMPLES, so if you update that array ... update this method too
    expected_buckets = [0, 1024, 15360, 16384]
    expected_counts = [4, 2, 1, 2]

    i = 0
    @stats.each_nonzero do |bucket, count|
      assert_equal expected_buckets[i], bucket
      assert_equal expected_counts[i], count
      # Increment for the next test
      i += 1
    end

    # Make sure no expected bucket was silently skipped
    assert_equal expected_buckets.length, i
  end
end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: aggregate
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.2
5
+ platform: ruby
6
+ authors:
7
+ - Joseph Ruscio
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-16 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support
17
+ email: jruscio@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README
25
+ files:
26
+ - LICENSE
27
+ - README
28
+ - Rakefile
29
+ - VERSION
30
+ - aggregate.gemspec
31
+ - lib/aggregate.rb
32
+ - test/ts_aggregate.rb
33
+ has_rdoc: true
34
+ homepage: http://github.com/josephruscio/aggregate
35
+ licenses: []
36
+
37
+ post_install_message:
38
+ rdoc_options:
39
+ - --charset=UTF-8
40
+ require_paths:
41
+ - lib
42
+ required_ruby_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: "0"
47
+ version:
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: "0"
53
+ version:
54
+ requirements: []
55
+
56
+ rubyforge_project:
57
+ rubygems_version: 1.3.3
58
+ signing_key:
59
+ specification_version: 3
60
+ summary: Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support
61
+ test_files:
62
+ - test/ts_aggregate.rb