aggregate 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2009 Joseph Ruscio
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
data/README ADDED
@@ -0,0 +1,2 @@
1
+ Aggregate is a Ruby implementation of a statistics aggregator, including histogram support.
2
+
@@ -0,0 +1,15 @@
1
+ require 'rake'
2
+
# Register the gem packaging tasks through Jeweler when it is installed.
# If Jeweler cannot be loaded, print an installation hint instead of
# aborting, so the rest of the Rakefile stays usable.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gem|
    # The summary and the long description intentionally share one text.
    blurb = "Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support"

    gem.name        = "aggregate"
    gem.summary     = blurb
    gem.description = blurb
    gem.email       = "jruscio@gmail.com"
    gem.homepage    = "http://github.com/josephruscio/aggregate"
    gem.authors     = ["Joseph Ruscio"]
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.2
@@ -0,0 +1,46 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
# -*- encoding: utf-8 -*-

# Gem specification for aggregate 0.1.2. The gem has no runtime or
# development dependencies, so nothing is declared beyond the metadata
# and the file lists below.
Gem::Specification.new do |s|
  s.name = %q{aggregate}
  s.version = "0.1.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Joseph Ruscio"]
  s.date = %q{2009-08-16}
  s.description = %q{Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support}
  s.email = %q{jruscio@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README"
  ]
  s.files = [
    "LICENSE",
    "README",
    "Rakefile",
    "VERSION",
    "aggregate.gemspec",
    "lib/aggregate.rb",
    "test/ts_aggregate.rb"
  ]
  s.homepage = %q{http://github.com/josephruscio/aggregate}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.3}
  s.summary = %q{Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support}
  s.test_files = [
    "test/ts_aggregate.rb"
  ]

  # Only the specification version needs to be recorded. The generated
  # RubyGems-version conditional that used to follow had empty branches
  # (there are no dependencies to declare), so it has been dropped together
  # with its lookup of the legacy Gem::RubyGemsVersion constant.
  s.specification_version = 3 if s.respond_to? :specification_version
end
@@ -0,0 +1,277 @@
# Implements aggregate statistics and maintains a configurable histogram
# for a set of given samples. Convenient for tracking high throughput data.
#
# Two histogram layouts are supported:
#
# * binary logarithmic (the default): bucket i holds samples with
#   2**i <= sample < 2**(i + 1)
# * linear: bucket i holds samples with
#   low + i * width <= sample < low + (i + 1) * width
#
# Samples that fall outside the histogram are still included in count, sum,
# min, max, mean and std_dev; they are only left out of the buckets and are
# tallied in outliers_low / outliers_high instead.
class Aggregate
  # The number of buckets in the binary logarithmic histogram
  # (lowest bucket => 2**0, highest bucket => 2**(LOG_BUCKETS - 1))
  LOG_BUCKETS = 128

  #The current number of samples
  attr_reader :count

  #The maximum sample value (nil until the first sample is added)
  attr_reader :max

  #The minimum sample value (nil until the first sample is added)
  attr_reader :min

  #The sum of all samples
  attr_reader :sum

  #The number of samples falling below the lowest valued histogram bucket
  attr_reader :outliers_low

  #The number of samples falling above the highest valued histogram bucket
  attr_reader :outliers_high

  # Create a new Aggregate that maintains a binary logarithmic histogram
  # by default. Specifying values for low, high, and width configures
  # the aggregate to maintain a linear histogram with (high - low)/width
  # buckets (rounded down). A partial specification (any of the three left
  # nil) falls back to the logarithmic histogram.
  #
  # Raises ArgumentError when high <= low, when width is not positive, or
  # when width is larger than the histogram range.
  def initialize(low = nil, high = nil, width = nil)
    @count = 0
    @sum = 0.0
    @sum2 = 0.0
    @outliers_low = 0
    @outliers_high = 0

    # If the user asks we maintain a linear histogram
    if !low.nil? && !high.nil? && !width.nil?
      raise ArgumentError, "High bucket must be > Low bucket" if high <= low
      raise ArgumentError, "Histogram width must be > 0" unless width > 0
      raise ArgumentError, "Histogram width must be <= histogram range" if high - low < width

      @low = low
      @high = high
      @width = width
    else
      @low = 1
      @high = to_bucket(LOG_BUCKETS - 1)
    end

    #Initialize all buckets to 0
    @buckets = Array.new(bucket_count, 0)

    # Exclusive upper edge of the last bucket. For a linear histogram whose
    # range is not a multiple of width this lies below the requested high.
    @upper_bound = to_bucket(@buckets.length)
  end

  # Include a sample in the aggregate
  def <<(data)
    # Update min/max
    if @count.zero?
      @min = data
      @max = data
    else
      @max = [data, @max].max
      @min = [data, @min].min
    end

    # Update the running info
    @count += 1
    @sum += data
    @sum2 += (data * data)

    # Update the bucket (outlier? records the outlier itself)
    @buckets[to_index(data)] += 1 unless outlier?(data)
  end

  #The current average of all samples (NaN while there are no samples)
  def mean
    @sum / @count
  end

  #Calculate the sample standard deviation (NaN with fewer than two samples)
  def std_dev
    n = @count.to_f
    variance = (@sum2 - ((@sum * @sum) / n)) / (n - 1)

    # Floating point rounding can push the variance of (nearly) constant
    # data slightly below zero, which Math.sqrt rejects.
    variance = 0.0 if variance < 0
    Math.sqrt(variance)
  end

  # TODO: support combining two aggregates (Aggregate#+)

  #Generate a pretty-printed ASCII representation of the histogram.
  #columns is the terminal width (defaults to 80); values below 80 raise
  #ArgumentError. Only non-empty buckets are printed; runs of empty buckets
  #are collapsed into a single '~' row. The Total row counts bucketed
  #samples only, not outliers.
  def to_s(columns = nil)
    #default to an 80 column terminal, don't support < 80 for now
    columns ||= 80
    raise ArgumentError, "columns must be >= 80" if columns < 80

    #Collect the rows we intend to print: [index, bucket value, count]
    rows = []
    @buckets.each_with_index do |count, idx|
      rows << [idx, to_bucket(idx), count] unless count.zero?
    end
    total = rows.inject(0) { |acc, row| acc + row[2] }
    max_count = rows.map { |row| row[2] }.max || 0

    #Figure out how wide the value and count columns need to be based on
    #their largest respective entries
    value_str = "value"
    count_str = "count"
    total_str = "Total"
    value_width = (rows.map { |row| row[1].to_s.length } +
                   [value_str.length, total_str.length]).max
    count_width = [total.to_s.length, count_str.length].max
    max_bar_width = columns - (value_width + " |".length + "| ".length + count_width)

    #Determine the value of a '@'
    weight = [max_count.to_f / max_bar_width, 1.0].max

    #format the header
    histogram = sprintf("%#{value_width}s |", value_str)
    histogram << "-" * max_bar_width
    histogram << sprintf("| %#{count_width}s\n", count_str)

    #Loop through each bucket to be displayed and output the correct number
    prev_index = nil
    rows.each do |idx, value, count|
      #Denote skipped empty buckets with a ~
      histogram << skip_row(value_width) unless prev_index.nil? || prev_index == idx - 1
      prev_index = idx

      bar_size = (count / weight).to_i
      histogram << sprintf("%#{value_width}s |", value)
      histogram << "@" * bar_size << " " * (max_bar_width - bar_size)
      histogram << sprintf("| %#{count_width}d\n", count)
    end

    #End the table
    histogram << skip_row(value_width) if rows.empty? || rows.last[0] != @buckets.length - 1
    histogram << sprintf("%#{value_width}s |", total_str)
    histogram << "-" * max_bar_width
    histogram << sprintf("| %#{count_width}d\n", total)
    histogram
  end

  #Iterate through each bucket in the histogram regardless of
  #its contents, yielding (bucket value, count). Returns an enumerator
  #when no block is given.
  def each
    return enum_for(:each) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count)
    end
  end

  #Iterate through only the buckets in the histogram that contain
  #samples, yielding (bucket value, count). Returns an enumerator when no
  #block is given.
  def each_nonzero
    return enum_for(:each_nonzero) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count) if count != 0
    end
  end

  private

  # True when a linear (rather than logarithmic) histogram was configured.
  def linear?
    !@width.nil?
  end

  # Returns true when data lies outside the histogram and, as a side
  # effect, bumps the matching outlier counter. The histogram covers
  # @low <= data < @upper_bound.
  def outlier?(data)
    if data < @low
      @outliers_low += 1
      true
    elsif data >= @upper_bound
      @outliers_high += 1
      true
    else
      false
    end
  end

  # Number of buckets in the histogram.
  def bucket_count
    if linear?
      # floor keeps the count an Integer even for fractional bounds
      ((@high - @low) / @width).floor
    else
      LOG_BUCKETS
    end
  end

  # The lowest value that belongs to the bucket at index.
  def to_bucket(index)
    if linear?
      @low + (index * @width)
    else
      2**index
    end
  end

  # We denote empty buckets with a '~'
  def skip_row(value_width)
    sprintf("%#{value_width}s ~\n", " ")
  end

  # Index of the bucket that data belongs to: the bucket n for which
  # to_bucket(n) <= data < to_bucket(n + 1). Only valid for samples that
  # are not outliers.
  def to_index(data)
    last = @buckets.length - 1

    # Start from an arithmetic estimate ...
    idx = if linear?
            ((data - @low) / @width).floor
          else
            log2(data).to_i
          end
    idx = 0 if idx < 0
    idx = last if idx > last

    # ... and correct it against the real bucket edges, since floating
    # point rounding can leave the estimate one bucket off at a boundary.
    idx -= 1 while idx > 0 && to_bucket(idx) > data
    idx += 1 while idx < last && to_bucket(idx + 1) <= data
    idx
  end

  # Base 2 logarithm of x (as a Float, subject to rounding)
  def log2(x)
    Math.log(x) / Math.log(2)
  end
end
@@ -0,0 +1,145 @@
1
+ require 'test/unit'
2
+ require 'lib/aggregate'
3
+
# Exercises Aggregate with its default binary logarithmic histogram.
class SimpleStatsTest < Test::Unit::TestCase

  #XXX: Update test_bucket_contents() if you muck with @@DATA
  @@DATA = [ 1, 5, 4, 6, 1028, 1972, 16384, 16385, 16383 ]

  def setup
    @stats = Aggregate.new

    @@DATA.each do |x|
      @stats << x
    end
  end

  def test_stats_count
    assert_equal @@DATA.length, @stats.count
  end

  def test_stats_min_max
    sorted_data = @@DATA.sort

    assert_equal sorted_data[0], @stats.min
    assert_equal sorted_data.last, @stats.max
  end

  def test_stats_mean
    sum = @@DATA.inject(0) { |acc, x| acc + x }

    assert_equal sum.to_f/@@DATA.length.to_f, @stats.mean
  end

  def test_bucket_counts

    #Test each iterator
    total_bucket_sum = 0
    i = 0
    @stats.each do |bucket, count|
      assert_equal 2**i, bucket

      total_bucket_sum += count
      i += 1
    end

    assert_equal @@DATA.length, total_bucket_sum

    #Test each_nonzero iterator
    prev_bucket = 0
    total_bucket_sum = 0
    @stats.each_nonzero do |bucket, count|
      assert bucket > prev_bucket
      assert_not_equal 0, count

      # Remember this bucket so the next one is checked for strict ordering
      prev_bucket = bucket
      total_bucket_sum += count
    end

    assert_equal @@DATA.length, total_bucket_sum
  end

  # TODO: add test_addition (stats1 + stats2 sums the counts) once
  # Aggregate#+ is implemented.

  def test_bucket_contents
    #XXX: This is the only test so far that cares about the actual contents
    # of @@DATA, so if you update that array ... update this method too
    expected_buckets = [1, 4, 1024, 8192, 16384]
    expected_counts = [1, 3, 2, 1, 2]

    i = 0
    @stats.each_nonzero do |bucket, count|
      assert_equal expected_buckets[i], bucket
      assert_equal expected_counts[i], count
      # Increment for the next test
      i += 1
    end

    # Guard against the iterator yielding fewer buckets than expected
    assert_equal expected_buckets.length, i
  end

  def test_histogram
    histogram = @stats.to_s

    # Header and footer frame the table; the footer carries the sample total
    assert_match(/\A\s*value \|-+\| +count\n/, histogram)
    assert_match(/^\s*Total \|-+\| +#{@@DATA.length}\n\z/, histogram)

    # Every non-empty bucket gets a row labelled with its value
    @stats.each_nonzero do |bucket, count|
      assert_match(/^\s*#{bucket} \|@* *\| +#{count}$/, histogram)
    end
  end

  def test_outlier
    assert_equal 0, @stats.outliers_low
    assert_equal 0, @stats.outliers_high

    @stats << -1
    @stats << -2
    @stats << 2**129

    assert_equal 2, @stats.outliers_low
    assert_equal 1, @stats.outliers_high
  end

  def test_std_dev
    # Reference value from the two-pass definition of the sample
    # standard deviation
    n = @@DATA.length
    mean = @@DATA.inject(0) { |acc, x| acc + x }.to_f / n
    squares = @@DATA.inject(0.0) { |acc, x| acc + (x - mean)**2 }
    expected = Math.sqrt(squares / (n - 1))

    assert_in_delta expected, @stats.std_dev, expected * 1e-6
  end
end
112
+
# Exercises Aggregate configured with a linear histogram of 32 buckets,
# each 1024 wide, covering 0 up to 32768.
class LinearHistogramTest < Test::Unit::TestCase

  #XXX: Update test_bucket_contents() if you muck with @@DATA
  @@DATA = [ 1, 5, 4, 6, 1028, 1972, 16384, 16385, 16383 ]

  def setup
    @stats = Aggregate.new(0, 32768, 1024)

    @@DATA.each do |x|
      @stats << x
    end
  end

  def test_validation
    # high == low
    assert_raise(ArgumentError) { Aggregate.new(32,32,4) }
    # high < low
    assert_raise(ArgumentError) { Aggregate.new(32,16,4) }
    # width larger than the histogram range
    assert_raise(ArgumentError) { Aggregate.new(16,32,17) }
  end

  def test_bucket_contents
    #XXX: This is the only test so far that cares about the actual contents
    # of @@DATA, so if you update that array ... update this method too
    expected_buckets = [0, 1024, 15360, 16384]
    expected_counts = [4, 2, 1, 2]

    i = 0
    @stats.each_nonzero do |bucket, count|
      assert_equal expected_buckets[i], bucket
      assert_equal expected_counts[i], count
      # Increment for the next test
      i += 1
    end

    # Guard against the iterator yielding fewer buckets than expected
    assert_equal expected_buckets.length, i
  end

end
metadata ADDED
@@ -0,0 +1,62 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: aggregate
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.2
5
+ platform: ruby
6
+ authors:
7
+ - Joseph Ruscio
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-16 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support
17
+ email: jruscio@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ - README
25
+ files:
26
+ - LICENSE
27
+ - README
28
+ - Rakefile
29
+ - VERSION
30
+ - aggregate.gemspec
31
+ - lib/aggregate.rb
32
+ - test/ts_aggregate.rb
33
+ has_rdoc: true
34
+ homepage: http://github.com/josephruscio/aggregate
35
+ licenses: []
36
+
37
+ post_install_message:
38
+ rdoc_options:
39
+ - --charset=UTF-8
40
+ require_paths:
41
+ - lib
42
+ required_ruby_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: "0"
47
+ version:
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: "0"
53
+ version:
54
+ requirements: []
55
+
56
+ rubyforge_project:
57
+ rubygems_version: 1.3.3
58
+ signing_key:
59
+ specification_version: 3
60
+ summary: Aggregate is a Ruby class for accumulating aggregate statistics and includes histogram support
61
+ test_files:
62
+ - test/ts_aggregate.rb