josephruscio-aggregate 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +22 -0
- data/aggregate.rb +240 -0
- metadata +56 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2009 Joseph Ruscio
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person
|
4
|
+
obtaining a copy of this software and associated documentation
|
5
|
+
files (the "Software"), to deal in the Software without
|
6
|
+
restriction, including without limitation the rights to use,
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
copies of the Software, and to permit persons to whom the
|
9
|
+
Software is furnished to do so, subject to the following
|
10
|
+
conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
19
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
20
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
21
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/aggregate.rb
ADDED
@@ -0,0 +1,240 @@
|
|
1
|
+
# Implements aggregate statistics and maintains
|
2
|
+
# a configurable histogram for a set of given samples. Convenient for tracking
|
3
|
+
# high throughput data.
|
4
|
+
class Aggregate
  # Bucketing model: every histogram bucket is identified by its lower bound
  # and covers the half-open range [lower bound, next bucket's lower bound).
  # Samples below the first bucket, or at/above the histogram's upper limit,
  # are tallied as outliers only; they still contribute to count, sum, min,
  # max, mean and std_dev.

  # The number of buckets in the binary logarithmic histogram. Bucket i covers
  # [2**i, 2**(i + 1)), so the histogram spans [2**0, 2**LOG_BUCKETS).
  LOG_BUCKETS = 128

  # Total width, in characters, of one line of the ASCII histogram.
  LINE_WIDTH = 80

  # The current number of samples
  attr_reader :count

  # The maximum sample value (nil until the first sample is added)
  attr_reader :max

  # The minimum sample value (nil until the first sample is added)
  attr_reader :min

  # The sum of all samples
  attr_reader :sum

  # The number of samples falling below the lowest valued histogram bucket
  attr_reader :outliers_low

  # The number of samples falling at or above the histogram's upper limit
  attr_reader :outliers_high

  # Create a new Aggregate. With no arguments it maintains a binary
  # logarithmic histogram. When low, high and width are ALL given it
  # maintains a linear histogram of (high - low) / width buckets covering
  # [low, high). If only some of the three are given they are ignored and
  # the logarithmic histogram is used.
  #
  # Raises ArgumentError if high <= low, if width <= 0, or if (high - low)
  # is not a multiple of width.
  def initialize(low = nil, high = nil, width = nil)
    @count = 0
    @sum = 0.0
    @sum2 = 0.0 # running sum of squares, used by std_dev
    @min = nil
    @max = nil
    @outliers_low = 0
    @outliers_high = 0

    if low.nil? || high.nil? || width.nil?
      @low = 1
      @high = 2**LOG_BUCKETS
      @width = nil
    else
      raise ArgumentError, "High bucket must be > Low bucket" unless high > low
      raise ArgumentError, "Histogram width must be > 0" unless width > 0
      unless ((high - low) % width).zero?
        raise ArgumentError, "Histogram range (high - low) must be a multiple of width"
      end

      @low = low
      @high = high
      @width = width
    end

    # Initialize all buckets to 0
    @buckets = Array.new(bucket_count, 0)
  end

  # Include a sample in the aggregate.
  #
  # Returns the updated count of the bucket the sample landed in, or nil when
  # the sample is an outlier. Raises FloatDomainError for a NaN sample, before
  # any state is modified.
  def <<(data)
    raise FloatDomainError, "NaN" if data.respond_to?(:nan?) && data.nan?

    # Update min/max
    if @count.zero?
      @min = data
      @max = data
    elsif data > @max
      @max = data
    elsif data < @min
      @min = data
    end

    # Update the running info
    @count += 1
    @sum += data
    @sum2 += data * data

    # Update the bucket (outlier? tallies the sample itself when it is one)
    @buckets[to_index(data)] += 1 unless outlier?(data)
  end

  # The current average of all samples (NaN when there are no samples).
  def mean
    @sum / @count
  end

  # The sample standard deviation (n - 1 denominator) of all samples.
  # Returns NaN when fewer than two samples have been added.
  def std_dev
    return Float::NAN if @count < 2

    variance = (@sum2 - (@sum * @sum) / @count) / (@count - 1)

    # Floating point cancellation can leave a tiny negative variance, which
    # Math.sqrt would reject; clamp it to zero.
    Math.sqrt([variance, 0.0].max)
  end

  # Generate a pretty-printed ASCII representation of the histogram. Only
  # non-empty buckets are printed; a " ~" line marks each run of skipped empty
  # buckets. Returns "Empty histogram" when no bucket holds a sample.
  def to_s
    # Rows to print: [bucket index, bucket label, sample count]
    rows = []
    @buckets.each_with_index do |count, index|
      rows << [index, to_bucket(index).to_s, count] unless count.zero?
    end
    return "Empty histogram" if rows.empty?

    # Figure out how wide the value and count columns need to be based on
    # their largest respective entries
    max_count = rows.map { |_, _, count| count }.max
    value_width = [rows.map { |_, label, _| label.length }.max, "value".length].max
    count_width = [max_count.to_s.length, "count".length].max
    max_bar_width = [LINE_WIDTH - (value_width + " |".length + " ".length + count_width), 0].max

    header = "value".rjust(value_width) + " |" + ("-" * max_bar_width) +
             " " + "count".rjust(count_width)

    # Determine how many samples a single '@' represents (never less than 1)
    weight = [max_count.to_f / max_bar_width, 1.0].max

    lines = ["", header]
    prev_index = rows.first[0] - 1
    rows.each do |index, label, count|
      # Denote skipped empty buckets with a ~
      lines << " ~" unless prev_index == index - 1
      prev_index = index

      bar = "@" * (count / weight).to_i
      lines << format("%s |%s %#{count_width}d",
                      label.rjust(value_width), bar.ljust(max_bar_width), count)
    end

    lines.join("\n") + "\n"
  end

  # Iterate through each bucket in the histogram regardless of its contents,
  # yielding the bucket's lower bound and its sample count. Returns an
  # Enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count)
    end
  end

  # Iterate through only the buckets in the histogram that contain samples,
  # yielding the bucket's lower bound and its sample count. Returns an
  # Enumerator when no block is given.
  def each_nonzero
    return enum_for(:each_nonzero) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count) unless count.zero?
    end
  end

  private

  # True when a linear (fixed width) histogram was configured.
  def linear?
    !@width.nil?
  end

  # Reports whether data falls outside [@low, @high). NOTE: this also
  # increments the matching outlier counter, so call it once per sample.
  def outlier?(data)
    if data < @low
      @outliers_low += 1
      true
    elsif data >= @high
      @outliers_high += 1
      true
    else
      false
    end
  end

  # The number of buckets the histogram maintains.
  def bucket_count
    linear? ? ((@high - @low) / @width).to_i : LOG_BUCKETS
  end

  # The lower bound of the bucket stored at the given index.
  def to_bucket(index)
    linear? ? @low + (index * @width) : 2**index
  end

  # The index of the bucket that a non-outlier sample belongs to.
  def to_index(data)
    index =
      if linear?
        ((data - @low) / @width).floor
      elsif data.is_a?(Integer)
        # Exact floor(log2(data)) for integers of any size
        data.bit_length - 1
      else
        # frexp gives data = fraction * 2**exponent with fraction in [0.5, 1),
        # so floor(log2(data)) is exponent - 1 without rounding error
        Math.frexp(data).last - 1
      end

    # Guard against rounding pushing a sample just under the upper limit
    # past the last bucket
    [index, @buckets.length - 1].min
  end
end
|
metadata
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: josephruscio-aggregate
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Joseph Ruscio
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-08-11 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
|
17
|
+
email: jruscio@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- LICENSE
|
24
|
+
files:
|
25
|
+
- aggregate.rb
|
26
|
+
- LICENSE
|
27
|
+
has_rdoc: true
|
28
|
+
homepage: http://github.com/josephruscio/aggregate
|
29
|
+
licenses:
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options:
|
32
|
+
- --inline-source
|
33
|
+
- --charset=UTF-8
|
34
|
+
require_paths:
|
35
|
+
- lib
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: "0"
|
41
|
+
version:
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: "0"
|
47
|
+
version:
|
48
|
+
requirements: []
|
49
|
+
|
50
|
+
rubyforge_project:
|
51
|
+
rubygems_version: 1.3.5
|
52
|
+
signing_key:
|
53
|
+
specification_version: 2
|
54
|
+
summary: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
|
55
|
+
test_files: []
|
56
|
+
|