josephruscio-aggregate 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +2 -0
- data/{aggregate.rb → lib/aggregate.rb} +38 -25
- data/test/ts_aggregate.rb +134 -0
- metadata +7 -4
data/README
ADDED
@@ -37,13 +37,17 @@ class Aggregate
|
|
37
37
|
@outliers_high = 0
|
38
38
|
|
39
39
|
# If the user asks, we maintain a linear histogram
|
40
|
-
|
41
|
-
|
42
|
-
#
|
43
|
-
if high
|
40
|
+
if (nil != low && nil != high && nil != width)
|
41
|
+
|
42
|
+
#Validate linear specification
|
43
|
+
if high <= low
|
44
44
|
raise ArgumentError, "High bucket must be > Low bucket"
|
45
45
|
end
|
46
46
|
|
47
|
+
if high - low < width
|
48
|
+
raise ArgumentError, "Histogram width must be <= histogram range"
|
49
|
+
end
|
50
|
+
|
47
51
|
@low = low
|
48
52
|
@high = high
|
49
53
|
@width = width
|
@@ -63,10 +67,9 @@ class Aggregate
|
|
63
67
|
if 0 == @count
|
64
68
|
@min = data
|
65
69
|
@max = data
|
66
|
-
|
67
|
-
@max = data
|
68
|
-
|
69
|
-
@min = data
|
70
|
+
else
|
71
|
+
@max = [data, @max].max
|
72
|
+
@min = [data, @min].min
|
70
73
|
end
|
71
74
|
|
72
75
|
# Update the running info
|
@@ -85,6 +88,14 @@ class Aggregate
|
|
85
88
|
def std_dev
|
86
89
|
end
|
87
90
|
|
91
|
+
# Combine two aggregates
|
92
|
+
#def +(b)
|
93
|
+
# a = self
|
94
|
+
# c = Aggregate.new
|
95
|
+
|
96
|
+
# c.count = a.count + b.count
|
97
|
+
#end
|
98
|
+
|
88
99
|
#Generate a pretty-printed ASCII representation of the histogram
|
89
100
|
def to_s
|
90
101
|
#Find the largest bucket and create an array of the rows we intend to print
|
@@ -182,26 +193,24 @@ class Aggregate
|
|
182
193
|
|
183
194
|
def to_bucket(index)
|
184
195
|
if linear?
|
185
|
-
return @low + (
|
196
|
+
return @low + (index * @width)
|
186
197
|
else
|
187
198
|
return 2**(index)
|
188
199
|
end
|
189
200
|
end
|
190
201
|
|
191
|
-
def right_bucket?
|
192
|
-
bucket = to_bucket(index)
|
202
|
+
def right_bucket? index, data
|
193
203
|
|
194
|
-
#
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
prev_bucket = to_bucket(index - 1)
|
199
|
-
end
|
204
|
+
# check invariant
|
205
|
+
raise unless linear?
|
206
|
+
|
207
|
+
bucket = to_bucket(index)
|
200
208
|
|
201
|
-
#It's the right bucket if data falls between
|
202
|
-
|
209
|
+
#It's the right bucket if data falls between bucket and next bucket
|
210
|
+
bucket <= data && data < bucket + @width
|
203
211
|
end
|
204
212
|
|
213
|
+
=begin
|
205
214
|
def find_bucket(lower, upper, target)
|
206
215
|
#Classic binary search
|
207
216
|
return upper if right_bucket?(upper, target)
|
@@ -216,20 +225,24 @@ class Aggregate
|
|
216
225
|
return find_bucket(middle, upper, target)
|
217
226
|
end
|
218
227
|
end
|
228
|
+
=end
|
219
229
|
|
220
230
|
# A data point is added to the bucket[n] where the data point
|
221
231
|
# is at least the value represented by bucket[n], but less
|
222
232
|
# than the value represented by bucket[n+1]
|
223
233
|
def to_index (data)
|
224
234
|
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
235
|
+
# basic case is simple
|
236
|
+
return log2(data).to_i if !linear?
|
237
|
+
|
238
|
+
# Search for the right bucket in the linear case
|
239
|
+
@buckets.each_with_index do |count, idx|
|
240
|
+
return idx if right_bucket?(idx, data)
|
231
241
|
end
|
242
|
+
#find_bucket(0, bucket_count-1, data)
|
232
243
|
|
244
|
+
#Should not get here
|
245
|
+
raise "#{data}"
|
233
246
|
end
|
234
247
|
|
235
248
|
# log2(x) returns j, | i = j-1 and 2**i <= data < 2**j
|
@@ -0,0 +1,134 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'lib/aggregate'
|
3
|
+
|
4
|
+
class SimpleStatsTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
def setup
|
7
|
+
@stats = Aggregate.new
|
8
|
+
|
9
|
+
@@DATA.each do |x|
|
10
|
+
@stats << x
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
def test_stats_count
|
15
|
+
assert_equal @@DATA.length, @stats.count
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_stats_min_max
|
19
|
+
sorted_data = @@DATA.sort
|
20
|
+
|
21
|
+
assert_equal sorted_data[0], @stats.min
|
22
|
+
assert_equal sorted_data.last, @stats.max
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_stats_mean
|
26
|
+
sum = 0
|
27
|
+
@@DATA.each do |x|
|
28
|
+
sum += x
|
29
|
+
end
|
30
|
+
|
31
|
+
assert_equal sum.to_f/@@DATA.length.to_f, @stats.mean
|
32
|
+
end
|
33
|
+
|
34
|
+
def test_bucket_counts
|
35
|
+
|
36
|
+
#Test each iterator
|
37
|
+
total_bucket_sum = 0
|
38
|
+
i = 0
|
39
|
+
@stats.each do |bucket, count|
|
40
|
+
assert_equal 2**i, bucket
|
41
|
+
|
42
|
+
total_bucket_sum += count
|
43
|
+
i += 1
|
44
|
+
end
|
45
|
+
|
46
|
+
assert_equal total_bucket_sum, @@DATA.length
|
47
|
+
|
48
|
+
#Test each_nonzero iterator
|
49
|
+
prev_bucket = 0
|
50
|
+
total_bucket_sum = 0
|
51
|
+
@stats.each_nonzero do |bucket, count|
|
52
|
+
assert bucket > prev_bucket
|
53
|
+
assert_not_equal count, 0
|
54
|
+
|
55
|
+
total_bucket_sum += count
|
56
|
+
end
|
57
|
+
|
58
|
+
assert_equal total_bucket_sum, @@DATA.length
|
59
|
+
end
|
60
|
+
|
61
|
+
=begin
|
62
|
+
def test_addition
|
63
|
+
stats1 = Aggregate.new
|
64
|
+
stats2 = Aggregate.new
|
65
|
+
|
66
|
+
stats1 << 1
|
67
|
+
stats2 << 3
|
68
|
+
|
69
|
+
stats_sum = stats1 + stats2
|
70
|
+
|
71
|
+
assert_equal stats_sum.count, stats1.count + stats2.count
|
72
|
+
end
|
73
|
+
=end
|
74
|
+
|
75
|
+
#XXX: Update test_bucket_contents() if you muck with @@DATA
|
76
|
+
@@DATA = [ 1, 5, 4, 6, 1028, 1972, 16384, 16385, 16383 ]
|
77
|
+
def test_bucket_contents
|
78
|
+
#XXX: This is the only test so far that cares about the actual contents
|
79
|
+
# of @@DATA, so if you update that array ... update this method too
|
80
|
+
expected_buckets = [1, 4, 1024, 8192, 16384]
|
81
|
+
expected_counts = [1, 3, 2, 1, 2]
|
82
|
+
|
83
|
+
i = 0
|
84
|
+
@stats.each_nonzero do |bucket, count|
|
85
|
+
assert_equal expected_buckets[i], bucket
|
86
|
+
assert_equal expected_counts[i], count
|
87
|
+
# Increment for the next test
|
88
|
+
i += 1
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def test_histogram
|
93
|
+
puts @stats.to_s
|
94
|
+
end
|
95
|
+
|
96
|
+
def test_outlier
|
97
|
+
@stats << -1
|
98
|
+
@stats << 2**129
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
class LinearHistogramTest < Test::Unit::TestCase
|
103
|
+
def setup
|
104
|
+
@stats = Aggregate.new(0, 32768, 1024)
|
105
|
+
|
106
|
+
@@DATA.each do |x|
|
107
|
+
@stats << x
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_validation
|
112
|
+
assert_raise(ArgumentError) {bad_stats = Aggregate.new(32,32,4)}
|
113
|
+
assert_raise(ArgumentError) {bad_stats = Aggregate.new(32,16,4)}
|
114
|
+
assert_raise(ArgumentError) {bad_stats = Aggregate.new(16,32,17)}
|
115
|
+
end
|
116
|
+
|
117
|
+
#XXX: Update test_bucket_contents() if you muck with @@DATA
|
118
|
+
@@DATA = [ 1, 5, 4, 6, 1028, 1972, 16384, 16385, 16383 ]
|
119
|
+
def test_bucket_contents
|
120
|
+
#XXX: This is the only test so far that cares about the actual contents
|
121
|
+
# of @@DATA, so if you update that array ... update this method too
|
122
|
+
expected_buckets = [0, 1024, 15360, 16384]
|
123
|
+
expected_counts = [4, 2, 1, 2]
|
124
|
+
|
125
|
+
i = 0
|
126
|
+
@stats.each_nonzero do |bucket, count|
|
127
|
+
assert_equal expected_buckets[i], bucket
|
128
|
+
assert_equal expected_counts[i], count
|
129
|
+
# Increment for the next test
|
130
|
+
i += 1
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: josephruscio-aggregate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joseph Ruscio
|
@@ -20,10 +20,13 @@ executables: []
|
|
20
20
|
extensions: []
|
21
21
|
|
22
22
|
extra_rdoc_files:
|
23
|
+
- README
|
23
24
|
- LICENSE
|
24
25
|
files:
|
25
|
-
-
|
26
|
+
- README
|
26
27
|
- LICENSE
|
28
|
+
- lib/aggregate.rb
|
29
|
+
- test/ts_aggregate.rb
|
27
30
|
has_rdoc: true
|
28
31
|
homepage: http://github.com/josephruscio/aggregate
|
29
32
|
licenses:
|
@@ -52,5 +55,5 @@ rubygems_version: 1.3.5
|
|
52
55
|
signing_key:
|
53
56
|
specification_version: 2
|
54
57
|
summary: Aggregate is a Ruby library accumulating aggregate statistics (including histograms) in an object oriented manner.
|
55
|
-
test_files:
|
56
|
-
|
58
|
+
test_files:
|
59
|
+
- test/ts_aggregate.rb
|