josephruscio-aggregate 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/LICENSE +22 -0
  2. data/aggregate.rb +240 -0
  3. metadata +56 -0
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2009 Joseph Ruscio
2
+
3
+ Permission is hereby granted, free of charge, to any person
4
+ obtaining a copy of this software and associated documentation
5
+ files (the "Software"), to deal in the Software without
6
+ restriction, including without limitation the rights to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ copies of the Software, and to permit persons to whom the
9
+ Software is furnished to do so, subject to the following
10
+ conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20
+ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
+ OTHER DEALINGS IN THE SOFTWARE.
data/aggregate.rb ADDED
@@ -0,0 +1,240 @@
1
# Implements aggregate statistics and maintains a histogram for a set of
# given samples. Convenient for tracking high throughput data.
#
# Only running totals and per-bucket counters are stored; the individual
# samples are not retained.
#
# The histogram is binary logarithmic: bucket n counts the samples in
# [2**n, 2**(n+1)). Samples below 1, or at/above 2**LOG_BUCKETS, are counted
# as outliers instead of being placed in a bucket.
class Aggregate
  # The number of buckets in the binary logarithmic histogram
  # (low => 2**0, high => 2**LOG_BUCKETS)
  LOG_BUCKETS = 128

  # The current number of samples (outliers included)
  attr_reader :count

  # The maximum sample value (nil until the first sample is added)
  attr_reader :max

  # The minimum sample value (nil until the first sample is added)
  attr_reader :min

  # The sum of all samples
  attr_reader :sum

  # The number of samples falling below the lowest valued histogram bucket
  attr_reader :outliers_low

  # The number of samples falling above the highest valued histogram bucket
  attr_reader :outliers_high

  # Create a new Aggregate that maintains a binary logarithmic histogram.
  #
  # low, high and width are reserved for configuring a linear histogram with
  # (high - low)/width buckets. That mode is still under test/development, so
  # the three arguments are currently accepted but ignored.
  def initialize(low = nil, high = nil, width = nil)
    @count = 0
    @sum = 0.0
    @sum2 = 0.0 # running sum of squared samples, used by std_dev
    @outliers_low = 0
    @outliers_high = 0
    @min = nil
    @max = nil

    # A nil width marks the histogram as logarithmic (see linear?).
    @width = nil
    @low = 1
    # Exclusive upper bound, so the last bucket covers its whole
    # [2**(LOG_BUCKETS - 1), 2**LOG_BUCKETS) range.
    @high = 2**LOG_BUCKETS

    # Initialize all buckets to 0
    @buckets = Array.new(bucket_count, 0)
  end

  # Include a sample in the aggregate.
  #
  # Outliers still contribute to count, sum, min and max; they are only
  # excluded from the histogram buckets.
  #
  # Returns the updated count of the bucket the sample landed in, or nil when
  # the sample was an outlier. Because the receiver is not returned, calls
  # cannot be chained (agg << 1 << 2 would shift an Integer).
  def <<(data)
    # Update min/max
    if @count.zero?
      @min = data
      @max = data
    elsif data > @max
      @max = data
    elsif data < @min
      @min = data
    end

    # Update the running info
    @count += 1
    @sum += data
    @sum2 += (data * data)

    # Update the bucket (outlier? also bumps the matching outlier counter)
    @buckets[to_index(data)] += 1 unless outlier?(data)
  end

  # The current average of all samples. NaN when no samples have been added.
  def mean
    @sum / @count
  end

  # The sample standard deviation (n - 1 denominator) of all samples, derived
  # from the running sum and sum of squares.
  #
  # Returns NaN when fewer than two samples have been added, since the value
  # is undefined in that case.
  def std_dev
    return Float::NAN if @count < 2

    variance = (@sum2 - ((@sum * @sum) / @count)) / (@count - 1)
    # Rounding error can push a near-zero variance slightly negative, which
    # would make Math.sqrt raise Math::DomainError.
    variance = 0.0 if variance < 0
    Math.sqrt(variance)
  end

  # Generate a pretty-printed ASCII representation of the histogram.
  #
  # One row is printed per non-empty bucket; runs of skipped empty buckets are
  # denoted by a " ~" row. When no bucket holds a sample only the header is
  # printed.
  def to_s
    # Collect the rows we intend to print: [index, bucket value, sample count]
    rows = []
    @buckets.each_with_index do |count, index|
      rows << [index, to_bucket(index), count] unless count.zero?
    end
    max_count = rows.map { |row| row[2] }.max || 0
    largest_value = rows.empty? ? 0 : rows.last[1]

    # Figure out how wide the value and count columns need to be based on
    # their largest respective numbers
    value_width = [largest_value.to_s.length, 'value'.length].max
    count_width = [max_count.to_s.length, 'count'.length].max
    max_bar_width = 80 - (value_width + ' |'.length + ' '.length + count_width)

    header = format("%#{value_width}s |%s count", 'value', '-' * max_bar_width)

    # Determine how many samples a single '@' stands for
    weight = [max_count.to_f / max_bar_width, 1.0].max

    row_format = "%#{value_width}d |%-#{max_bar_width}s %#{count_width}d\n"
    prev_index = nil
    lines = rows.map do |index, value, count|
      # Denote skipped empty buckets with a ~
      skipped = !prev_index.nil? && prev_index != index - 1
      prev_index = index

      bar = '@' * (count / weight).to_i
      (skipped ? " ~\n" : '') + format(row_format, value, bar, count)
    end

    # Put the pieces together
    "\n" + header + "\n" + lines.join
  end

  # Iterate through each bucket in the histogram regardless of its contents,
  # yielding the bucket's value and its sample count.
  # Returns an Enumerator when no block is given.
  def each
    return enum_for(:each) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count)
    end
  end

  # Iterate through only the buckets in the histogram that contain samples,
  # yielding the bucket's value and its sample count.
  # Returns an Enumerator when no block is given.
  def each_nonzero
    return enum_for(:each_nonzero) unless block_given?

    @buckets.each_with_index do |count, index|
      yield(to_bucket(index), count) unless count.zero?
    end
  end

  private

  # True when a bucket width has been configured (linear histogram).
  # Always false at present because linear mode is disabled in initialize.
  def linear?
    !@width.nil?
  end

  # Returns true and bumps the matching outlier counter when data falls
  # outside the histogram's range; returns false otherwise.
  def outlier?(data)
    if data < @low
      @outliers_low += 1
      true
    elsif beyond_high?(data)
      @outliers_high += 1
      true
    else
      false
    end
  end

  # @high is an inclusive bound for linear histograms, where to_bucket gives
  # each bucket's upper edge, and an exclusive bound for logarithmic ones.
  def beyond_high?(data)
    linear? ? data > @high : data >= @high
  end

  # The number of buckets in the histogram.
  def bucket_count
    linear? ? (@high - @low) / @width : LOG_BUCKETS
  end

  # The value labelling the bucket at index: the bucket's upper edge for a
  # linear histogram, its lower edge (2**index) for a logarithmic one.
  def to_bucket(index)
    linear? ? @low + ((index + 1) * @width) : 2**index
  end

  # Binary search over the indexes lower..upper for the first bucket whose
  # upper edge (to_bucket) is >= target. Only used by linear histograms.
  def find_bucket(lower, upper, target)
    while lower < upper
      middle = (lower + upper) / 2
      if to_bucket(middle) >= target
        upper = middle
      else
        lower = middle + 1
      end
    end
    upper
  end

  # The index of the bucket a (non-outlier) data point belongs to. For the
  # logarithmic histogram that is bucket n where 2**n <= data < 2**(n+1).
  def to_index(data)
    linear? ? find_bucket(0, bucket_count - 1, data) : log2_floor(data)
  end

  # floor(log2(value)) for value >= 1, computed without Float logarithms so
  # that large Integers are not rounded up into the next bucket.
  def log2_floor(value)
    return value.bit_length - 1 if value.is_a?(Integer)

    # frexp returns [fraction, exponent] with value == fraction * 2**exponent
    # and 0.5 <= fraction < 1, so the floored log2 is exponent - 1.
    Math.frexp(value)[1] - 1
  end
end
metadata ADDED
@@ -0,0 +1,56 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: josephruscio-aggregate
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Joseph Ruscio
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-11 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
17
+ email: jruscio@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - LICENSE
24
+ files:
25
+ - aggregate.rb
26
+ - LICENSE
27
+ has_rdoc: true
28
+ homepage: http://github.com/josephruscio/aggregate
29
+ licenses:
30
+ post_install_message:
31
+ rdoc_options:
32
+ - --inline-source
33
+ - --charset=UTF-8
34
+ require_paths:
35
+ - lib
36
+ required_ruby_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: "0"
41
+ version:
42
+ required_rubygems_version: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: "0"
47
+ version:
48
+ requirements: []
49
+
50
+ rubyforge_project:
51
+ rubygems_version: 1.3.5
52
+ signing_key:
53
+ specification_version: 2
54
+ summary: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
55
+ test_files: []
56
+