josephruscio-aggregate 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +22 -0
- data/aggregate.rb +240 -0
- metadata +56 -0
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2009 Joseph Ruscio
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person
|
4
|
+
obtaining a copy of this software and associated documentation
|
5
|
+
files (the "Software"), to deal in the Software without
|
6
|
+
restriction, including without limitation the rights to use,
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
copies of the Software, and to permit persons to whom the
|
9
|
+
Software is furnished to do so, subject to the following
|
10
|
+
conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
17
|
+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
19
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
20
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
21
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
22
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/aggregate.rb
ADDED
@@ -0,0 +1,240 @@
|
|
1
|
+
# Implements aggregate statistics and maintains
|
2
|
+
# a configurable histogram for a set of given samples. Convenient for tracking
|
3
|
+
# high throughput data.
|
4
|
+
class Aggregate
  # The current number of samples
  attr_reader :count

  # The maximum sample value (nil until the first sample is added)
  attr_reader :max

  # The minimum sample value (nil until the first sample is added)
  attr_reader :min

  # The sum of all samples
  attr_reader :sum

  # The number of samples falling below the lowest valued histogram bucket
  attr_reader :outliers_low

  # The number of samples falling above the highest valued histogram bucket
  attr_reader :outliers_high

  # The number of buckets in the binary logarithmic histogram
  # (lowest bucket => 2**0, highest bucket => 2**(@@LOG_BUCKETS - 1))
  @@LOG_BUCKETS = 128

  # Create a new Aggregate that maintains a binary logarithmic histogram.
  #
  # The low, high and width parameters are reserved for a linear histogram
  # with (high - low)/width buckets. That mode is STILL UNDER TEST/DEV and
  # is deliberately disabled below, so the arguments are currently ignored
  # and a logarithmic histogram is always used.
  def initialize(low = nil, high = nil, width = nil)
    @count = 0
    @sum = 0.0
    @sum2 = 0.0
    @outliers_low = 0
    @outliers_high = 0
    @min = nil
    @max = nil
    @width = nil

    # If the user asks we maintain a linear histogram
    # STILL UNDER TEST/DEV
    if false # (nil != low && nil != high && nil != width)
      # This is a linear histogram
      raise ArgumentError, "High bucket must be > Low bucket" if high <= low

      @low = low
      @high = high
      @width = width
    else
      @low = 1
      @high = to_bucket(@@LOG_BUCKETS - 1)
    end

    # Initialize all buckets to 0
    @buckets = Array.new(bucket_count, 0)
  end

  # Include a sample in the aggregate. Every sample updates count, sum,
  # min and max; samples outside [low, high] are tallied as outliers
  # instead of being placed in a histogram bucket.
  def <<(data)
    # Update min/max
    if @count.zero?
      @min = data
      @max = data
    elsif data > @max
      @max = data
    elsif data < @min
      @min = data
    end

    # Update the running info
    @count += 1
    @sum += data
    @sum2 += data * data

    # Update the bucket
    @buckets[to_index(data)] += 1 unless outlier?(data)
  end

  # The current average of all samples (NaN when there are no samples,
  # because the Float sum 0.0 is divided by a zero count)
  def mean
    @sum / @count
  end

  # The sample standard deviation (n - 1 denominator), computed from the
  # running sum and sum of squares. Returns nil with fewer than two
  # samples, where it is undefined.
  def std_dev
    return nil if @count < 2

    variance = (@sum2 - (@sum * @sum) / @count) / (@count - 1)
    # Rounding error can push a zero variance slightly negative
    variance = 0.0 if variance < 0
    Math.sqrt(variance)
  end

  # Generate a pretty-printed ASCII representation of the histogram.
  # Returns "Empty histogram" when no bucket holds a sample.
  def to_s
    # Find the largest bucket and collect the rows we intend to print
    max_count = 0
    disp_buckets = []
    @buckets.each_with_index do |count, idx|
      next if count.zero?

      max_count = count if max_count < count
      disp_buckets << [idx, to_bucket(idx), count]
    end

    # Nothing to draw (no samples, or only outliers)
    return "Empty histogram" if disp_buckets.empty?

    # Figure out how wide the value and count columns need to be based on
    # their largest respective numbers
    value_width = [disp_buckets.last[1].to_s.length, "value".length].max
    count_width = [max_count.to_s.length, "count".length].max
    max_bar_width = 80 - (value_width + " |".length + " ".length + count_width)

    header = sprintf("%#{value_width}s", "value") + " |" + ("-" * max_bar_width) + " count"

    # Determine the value of a '@'
    weight = [max_count.to_f / max_bar_width.to_f, 1.0].max

    # Build one row per displayed bucket
    rows = []
    prev_index = disp_buckets.first[0] - 1
    disp_buckets.each do |idx, value, count|
      # Denote skipped empty buckets with a ~
      rows << " ~\n" unless prev_index == idx - 1
      prev_index = idx

      bar_size = (count / weight).to_i
      padding = [max_bar_width - bar_size, 0].max
      rows << sprintf("%#{value_width}d |%s%s %#{count_width}d\n",
                      value, "@" * bar_size, " " * padding, count)
    end

    # Put the pieces together
    "\n" + header + "\n" + rows.join
  end

  # Iterate through each bucket in the histogram regardless of its
  # contents, yielding (bucket_value, count). Returns an Enumerator when
  # no block is given.
  def each
    return enum_for(:each) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count)
    end
  end

  # Iterate through only the buckets in the histogram that contain
  # samples, yielding (bucket_value, count). Returns an Enumerator when
  # no block is given.
  def each_nonzero
    return enum_for(:each_nonzero) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count) if count != 0
    end
  end

  private

  # True when a linear histogram was configured (never, while the linear
  # mode is disabled in initialize)
  def linear?
    !@width.nil?
  end

  # Returns true and bumps the matching outlier counter when data falls
  # outside [@low, @high]; returns false otherwise. Note the side effect:
  # call it once per sample.
  def outlier?(data)
    if data < @low
      @outliers_low += 1
    elsif data > @high
      @outliers_high += 1
    else
      return false
    end
    true
  end

  # Number of buckets in the histogram
  def bucket_count
    linear? ? (@high - @low) / @width : @@LOG_BUCKETS
  end

  # The value a bucket index represents: 2**index for the logarithmic
  # histogram, the bucket's upper edge for the linear one
  def to_bucket(index)
    linear? ? @low + ((index + 1) * @width) : 2**index
  end

  # True when data lies between the previous bucket's value (or @low for
  # the first bucket) and this bucket's value, both ends inclusive
  def right_bucket?(index, data)
    prev_bucket = index.zero? ? @low : to_bucket(index - 1)
    prev_bucket <= data && data <= to_bucket(index)
  end

  # Binary search over the index range [lower, upper] for the first
  # bucket whose value is >= target
  def find_bucket(lower, upper, target)
    while lower < upper
      middle = (lower + upper) / 2
      if to_bucket(middle) >= target
        upper = middle
      else
        lower = middle + 1
      end
    end
    lower
  end

  # Map a (non-outlier) sample to its bucket index. In the logarithmic
  # histogram bucket[n] holds samples with 2**n <= data < 2**(n+1).
  def to_index(data)
    return find_bucket(0, bucket_count - 1, data) if linear?

    index = log2(data).to_i
    # The floating-point logarithm can land a hair on the wrong side of an
    # exact power of two; settle the boundary with exact integer powers.
    index += 1 if to_bucket(index + 1) <= data
    index -= 1 if to_bucket(index) > data
    index
  end

  # Base-2 logarithm of x as a Float
  def log2(x)
    Math.log(x) / Math.log(2)
  end
end
|
metadata
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: josephruscio-aggregate
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Joseph Ruscio
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-08-11 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
|
17
|
+
email: jruscio@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- LICENSE
|
24
|
+
files:
|
25
|
+
- aggregate.rb
|
26
|
+
- LICENSE
|
27
|
+
has_rdoc: true
|
28
|
+
homepage: http://github.com/josephruscio/aggregate
|
29
|
+
licenses:
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options:
|
32
|
+
- --inline-source
|
33
|
+
- --charset=UTF-8
|
34
|
+
require_paths:
|
35
|
+
- lib
|
36
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: "0"
|
41
|
+
version:
|
42
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: "0"
|
47
|
+
version:
|
48
|
+
requirements: []
|
49
|
+
|
50
|
+
rubyforge_project:
|
51
|
+
rubygems_version: 1.3.5
|
52
|
+
signing_key:
|
53
|
+
specification_version: 2
|
54
|
+
summary: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
|
55
|
+
test_files: []
|
56
|
+
|