josephruscio-aggregate 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/LICENSE +22 -0
  2. data/aggregate.rb +240 -0
  3. metadata +56 -0
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2009 Joseph Ruscio
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
data/aggregate.rb ADDED
@@ -0,0 +1,240 @@
# Implements aggregate statistics and maintains a configurable histogram
# for a set of given samples. Convenient for tracking high throughput data.
#
# Samples are added with #<<. Running statistics (count, sum, min, max,
# mean, std_dev) are updated for every sample; the histogram only counts
# samples that fall inside its bucket range, everything else is tallied in
# outliers_low / outliers_high.
class Aggregate
  # The current number of samples
  attr_reader :count

  # The maximum sample value (nil until the first sample is added)
  attr_reader :max

  # The minimum sample value (nil until the first sample is added)
  attr_reader :min

  # The sum of all samples
  attr_reader :sum

  # The number of samples falling below the lowest valued histogram bucket
  attr_reader :outliers_low

  # The number of samples falling above the highest valued histogram bucket
  attr_reader :outliers_high

  # The number of buckets in the binary logarithmic histogram
  # (lowest bucket => 2**0, highest bucket => 2**(@@LOG_BUCKETS - 1))
  @@LOG_BUCKETS = 128

  # Create a new Aggregate that maintains a binary logarithmic histogram.
  #
  # The low, high and width arguments are reserved for a linear histogram
  # with (high - low)/width buckets. That mode is still under test/dev and
  # is deliberately disabled below, so the arguments are currently ignored.
  def initialize(low = nil, high = nil, width = nil)
    @count = 0
    @sum = 0.0
    @sum2 = 0.0
    @min = nil
    @max = nil
    @outliers_low = 0
    @outliers_high = 0
    @width = nil

    # If the user asks we maintain a linear histogram
    # STILL UNDER TEST/DEV
    if false # (low && high && width)
      # This is a linear histogram
      raise ArgumentError, "High bucket must be > Low bucket" if high < low

      @low = low
      @high = high
      @width = width
    else
      @low = 1
      @high = to_bucket(@@LOG_BUCKETS - 1)
    end

    # Initialize all buckets to 0
    @buckets = Array.new(bucket_count, 0)
  end

  # Include a sample in the aggregate.
  #
  # Returns the new count of the bucket the sample landed in, or nil when
  # the sample was an outlier.
  def <<(data)
    # Update min/max
    if @count.zero?
      @min = data
      @max = data
    elsif data > @max
      @max = data
    elsif data < @min
      @min = data
    end

    # Update the running info
    @count += 1
    @sum += data
    @sum2 += (data * data)

    # Update the bucket; outlier? also tallies the outlier counters
    @buckets[to_index(data)] += 1 unless outlier?(data)
  end

  # The current average of all samples (NaN when no samples were added,
  # because the Float sum is divided by a zero count).
  def mean
    @sum / count
  end

  # The sample standard deviation (n - 1 denominator) of all samples,
  # computed from the running sum and sum of squares.
  #
  # Returns nil when fewer than two samples have been added.
  def std_dev
    return nil if @count < 2

    variance = (@sum2 - ((@sum * @sum) / @count)) / (@count - 1)

    # Floating point cancellation can leave a tiny negative variance,
    # which Math.sqrt would reject.
    Math.sqrt([variance, 0.0].max)
  end

  # Generate a pretty-printed ASCII representation of the histogram.
  # Only non-empty buckets are printed; an aggregate without any bucketed
  # samples yields just the header line.
  def to_s
    # Find the largest bucket and collect the rows we intend to print
    max_count = 0
    rows = []
    @buckets.each_with_index do |count, idx|
      next if count.zero?
      max_count = count if max_count < count
      rows << [idx, to_bucket(idx), count]
    end

    # Figure out how wide the value and count columns need to be based on
    # their largest respective numbers
    widest_value = rows.empty? ? 0 : rows.last[1].to_s.length
    value_width = [widest_value, "value".length].max
    count_width = [max_count.to_s.length, "count".length].max
    max_bar_width = 80 - (value_width + " |".length + " ".length + count_width)

    # Build the header, aligned with the columns of the rows below it
    header = sprintf("%#{value_width}s |", "value") +
             ("-" * max_bar_width) +
             sprintf(" %#{count_width}s", "count")

    # Determine the value of a '@'
    weight = [max_count.to_f / max_bar_width, 1.0].max

    # Loop through each bucket to be displayed and output its row
    lines = []
    prev_index = nil
    rows.each do |idx, value, count|
      # Denote skipped empty buckets with a ~
      lines << " ~\n" unless prev_index.nil? || prev_index == idx - 1
      prev_index = idx

      bar = "@" * (count / weight).to_i
      lines << sprintf("%#{value_width}d |", value) +
               bar.ljust(max_bar_width) +
               sprintf(" %#{count_width}d\n", count)
    end

    # Put the pieces together
    "\n" + header + "\n" + lines.join
  end

  # Iterate through each bucket in the histogram regardless of its
  # contents, yielding the bucket value and its count. Returns an
  # Enumerator when no block is given.
  def each
    return to_enum(:each) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count)
    end
  end

  # Iterate through only the buckets in the histogram that contain
  # samples, yielding the bucket value and its count. Returns an
  # Enumerator when no block is given.
  def each_nonzero
    return to_enum(:each_nonzero) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count) if count != 0
    end
  end

  private

  # True when the histogram uses fixed-width (linear) buckets
  def linear?
    !@width.nil?
  end

  # Returns true when data lies outside the histogram range and, as a side
  # effect, tallies it in the matching outlier counter.
  def outlier?(data)
    if data < @low
      @outliers_low += 1
      true
    elsif data > @high
      @outliers_high += 1
      true
    else
      false
    end
  end

  # The number of buckets in the histogram
  def bucket_count
    if linear?
      (@high - @low) / @width
    else
      @@LOG_BUCKETS
    end
  end

  # The value represented by the bucket at index
  def to_bucket(index)
    if linear?
      @low + ((index + 1) * @width)
    else
      2**index
    end
  end

  # True when data falls between the previous bucket's value and the value
  # of the bucket at index (both ends inclusive)
  def right_bucket?(index, data)
    bucket = to_bucket(index)

    # Edge case: the first bucket starts at the low end of the range
    prev_bucket = index.zero? ? @low : to_bucket(index - 1)

    prev_bucket <= data && data <= bucket
  end

  # Binary search over the bucket indexes lower..upper for the lowest
  # bucket whose value is >= target. target must not be an outlier.
  def find_bucket(lower, upper, target)
    while lower < upper
      middle = (lower + upper) / 2

      # Determine which half contains our value and narrow the range
      if to_bucket(middle) >= target
        upper = middle
      else
        lower = middle + 1
      end
    end

    upper
  end

  # Map a (non-outlier) data point to the index of its bucket.
  #
  # In the logarithmic histogram a data point belongs to bucket[n] when
  # 2**n <= data < 2**(n+1); the top bucket also takes data == 2**n.
  def to_index(data)
    return find_bucket(0, bucket_count - 1, data) if linear?

    index = log2(data).to_i

    # The logarithm is computed in floating point, so nudge the index back
    # into place if rounding pushed it across a bucket boundary.
    if index > 0 && data < to_bucket(index)
      index -= 1
    elsif index < bucket_count - 1 && data >= to_bucket(index + 1)
      index += 1
    end

    index
  end

  # Binary logarithm of x, as a Float
  def log2(x)
    Math.log(x) / Math.log(2)
  end
end
metadata ADDED
@@ -0,0 +1,56 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: josephruscio-aggregate
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Joseph Ruscio
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-11 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
17
+ email: jruscio@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ files:
25
+ - aggregate.rb
26
+ - LICENSE
27
+ has_rdoc: true
28
+ homepage: http://github.com/josephruscio/aggregate
29
+ licenses:
30
+ post_install_message:
31
+ rdoc_options:
32
+ - --inline-source
33
+ - --charset=UTF-8
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: "0"
41
+ version:
42
+ required_rubygems_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: "0"
47
+ version:
48
+ requirements: []
49
+
50
+ rubyforge_project:
51
+ rubygems_version: 1.3.5
52
+ signing_key:
53
+ specification_version: 2
54
+ summary: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
55
+ test_files: []
56
+