splashy 0.0.2 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +6 -0
- data/VERSION +1 -1
- data/lib/splashy/bucket.rb +6 -2
- data/lib/splashy/buckets.rb +112 -39
- data/splashy.gemspec +1 -1
- data/test/test_splashy_buckets.rb +30 -3
- metadata +2 -2
data/README.markdown
CHANGED
@@ -46,6 +46,12 @@ buckets.select
|
|
46
46
|
Changelog
|
47
47
|
=========
|
48
48
|
|
49
|
+
* 0.1.0 - Several bug fixes, add "neediest_buckets" method to Buckets to allow
|
50
|
+
you to choose which buckets to add to first if an element can be put in
|
51
|
+
multiple buckets, final distributions can now have empty buckets if it means
|
52
|
+
we meet the wanted distribution better (e.g. a 99% / 1% distribution with 5
|
53
|
+
and 1 elements, respectively, which will now select 4 and 0 elements if your
|
54
|
+
wanted count is 4).
|
49
55
|
* 0.0.2 - Raise `ArgumentError` when trying to add to a bucket that doesn't
|
50
56
|
exist, don't consider an empty bucket "satisfied".
|
51
57
|
* 0.0.1 - Initial release.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.2
|
1
|
+
0.1.0
|
data/lib/splashy/bucket.rb
CHANGED
data/lib/splashy/buckets.rb
CHANGED
@@ -17,13 +17,13 @@ module Splashy
|
|
17
17
|
@total_count = 0
|
18
18
|
end
|
19
19
|
|
20
|
-
# Public: Put elements into buckets.
|
20
|
+
# Public: Put elements into buckets with a block.
|
21
21
|
#
|
22
22
|
# bucket_name - If supplied, all yielded elements will be added to that
|
23
23
|
# bucket.
|
24
|
-
# &block
|
25
|
-
#
|
26
|
-
#
|
24
|
+
# &block - A block that returns (if `bucket_name` is not supplied) an
|
25
|
+
# Array: [bucket_name, element]. If `bucket_name` is supplied, only
|
26
|
+
# the element needs to be returned.
|
27
27
|
#
|
28
28
|
# Examples
|
29
29
|
#
|
@@ -50,7 +50,10 @@ module Splashy
|
|
50
50
|
@total_count += 1
|
51
51
|
end
|
52
52
|
|
53
|
-
#
|
53
|
+
# Public
|
54
|
+
#
|
55
|
+
# Returns true if the conditions (distribution and, optionally, count) are
|
56
|
+
# satisfied enough to do a final selection of elements.
|
54
57
|
def satisfied?
|
55
58
|
begin
|
56
59
|
self.assert_satisfied!
|
@@ -64,61 +67,123 @@ module Splashy
|
|
64
67
|
# distribution. If a satisfactory distribution is not possible, a
|
65
68
|
# DistributionUnsatisfiedError is raised.
|
66
69
|
#
|
67
|
-
# Returns a Hash of elements
|
70
|
+
# Returns a Hash of elements matching the desired distribution, keyed by
|
68
71
|
# the bucket names.
|
69
72
|
def select
|
70
73
|
self.assert_satisfied!
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
selected
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
74
|
+
selected = self._select_wanted
|
75
|
+
# Sometimes we need to fudge by a few to meet the `@wanted_count`
|
76
|
+
selected = self.trim( selected, @wanted_count ) if @wanted_count
|
77
|
+
selected
|
78
|
+
end
|
79
|
+
|
80
|
+
# Array of the buckets that need more elements to match the desired
|
81
|
+
# distribution, sorted descending by how much more they need.
|
82
|
+
def neediest_buckets
|
83
|
+
multipliers = self.needed_multipliers( self._select_all, @wanted_distribution ).to_a
|
84
|
+
multipliers.sort! { |a, b| b[1] <=> a[1] } # Sort on multiplier descending
|
85
|
+
multipliers.map{ |bucket_name, multiplier| bucket_name }
|
86
|
+
end
|
87
|
+
|
88
|
+
protected
|
89
|
+
|
90
|
+
# Protected
|
91
|
+
#
|
92
|
+
# Returns Hash of all bucket elements, keyed by bucket name.
|
93
|
+
def _select_all
|
94
|
+
@buckets.values.inject({}) do |memo, bucket|
|
95
|
+
memo[bucket.name] = bucket.elements
|
79
96
|
memo
|
80
97
|
end
|
98
|
+
end
|
99
|
+
|
100
|
+
# Protected
|
101
|
+
#
|
102
|
+
# Returns Hash of bucket elements, matching the wanted distribution as
|
103
|
+
# closely as possible.
|
104
|
+
def _select_wanted
|
105
|
+
final_count = self.estimated_final_count
|
81
106
|
|
82
|
-
|
83
|
-
|
107
|
+
@buckets.values.inject({}) do |memo, bucket|
|
108
|
+
count = ( final_count * @wanted_distribution[bucket.name] ).round
|
109
|
+
count = [1, count].max # Ensure every bucket has at least one element
|
110
|
+
memo[bucket.name] = bucket.elements( count )
|
111
|
+
memo
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
# Protected: Trim a given Hash of Arrays -- keyed by bucket names -- until
|
116
|
+
# it satisfies @wanted_count.
|
117
|
+
#
|
118
|
+
# selected - A Hash of selected elements, keyed by the bucket names. All
|
119
|
+
# values must be Arrays (or respond to `size`).
|
120
|
+
# size - The desired total size of `selected`.
|
121
|
+
def trim( selected, size )
|
122
|
+
raise ArgumentError.new( "Can't trim to a nil size" ) unless size
|
123
|
+
while self.class.elements_count( selected ) > size
|
124
|
+
candidates = self.trim_candidates( selected, @wanted_distribution )
|
125
|
+
selected[candidates.first].pop
|
126
|
+
end
|
84
127
|
|
85
128
|
selected
|
86
129
|
end
|
87
130
|
|
88
|
-
|
131
|
+
# Protected
|
132
|
+
#
|
133
|
+
# current_selections - Hash of element Arrays, keyed by bucket name.
|
134
|
+
# wanted_distribution - The wanted distribution as a hash of percentage
|
135
|
+
# Floats.
|
136
|
+
#
|
137
|
+
# Returns Array of bucket names for buckets that are good trim candidates,
|
138
|
+
# ordered by best candidates first.
|
139
|
+
def trim_candidates( current_selections, wanted_distribution )
|
140
|
+
multipliers = self.needed_multipliers( current_selections, wanted_distribution ).to_a
|
141
|
+
multipliers.select do |bucket_name, multiplier|
|
142
|
+
# Can't trim empty buckets
|
143
|
+
@buckets[bucket_name].count != 0
|
144
|
+
end
|
145
|
+
return multipliers if multipliers.empty?
|
146
|
+
multipliers.sort! { |a, b| a[1] <=> b[1] } # Sort on multiplier ascending
|
147
|
+
multipliers.map{ |bucket_name, multiplier| bucket_name }
|
148
|
+
end
|
89
149
|
|
90
|
-
#
|
91
|
-
#
|
92
|
-
|
93
|
-
|
150
|
+
# Protected
|
151
|
+
#
|
152
|
+
# current_selections - Hash of element Arrays, keyed by bucket name.
|
153
|
+
# wanted_distribution - The wanted distribution as a hash of percentage
|
154
|
+
# Floats.
|
155
|
+
#
|
156
|
+
# Returns Hash of multipliers needed for each bucket to reach its current
|
157
|
+
# wanted distribution.
|
158
|
+
def needed_multipliers( current_selections, wanted_distribution )
|
159
|
+
total_size = self.class.elements_count( current_selections )
|
94
160
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
memo << [bucket_name, variance]
|
104
|
-
end
|
105
|
-
memo
|
161
|
+
current_selections.keys.inject({}) do |memo, bucket_name|
|
162
|
+
bucket_size = current_selections[bucket_name].size
|
163
|
+
desired_pct = wanted_distribution[bucket_name]
|
164
|
+
current_pct = bucket_size.to_f / total_size
|
165
|
+
if current_pct > 0
|
166
|
+
memo[bucket_name] = desired_pct / current_pct
|
167
|
+
else
|
168
|
+
memo[bucket_name] = 1 / 0.0 # Infinity
|
106
169
|
end
|
107
|
-
|
108
|
-
trim_bucket_name = variances.sort{ |a, b| a[1] }[0][0] # Smallest variance
|
109
|
-
selected[trim_bucket_name].pop
|
170
|
+
memo
|
110
171
|
end
|
111
|
-
|
112
|
-
selected
|
113
172
|
end
|
114
173
|
|
174
|
+
# Protected
|
175
|
+
#
|
176
|
+
# hash - Hash of Objects that respond to `count` (usually Arrays).
|
177
|
+
#
|
115
178
|
# Returns count of all elements in the Hash's Array values.
|
116
179
|
def self.elements_count( hash )
|
117
180
|
hash.values.inject(0){ |memo, array| memo + array.count }
|
118
181
|
end
|
119
182
|
|
183
|
+
# Protected
|
184
|
+
#
|
120
185
|
# Returns projected final number of elements that will be returned to
|
121
|
-
# satisfy the requirements. If this is less than `@wanted_count`,
|
186
|
+
# satisfy the requirements. If `@wanted_count` was supplied and this is
|
122
187
|
# less than it, we can't meet the requirements.
|
123
188
|
def estimated_final_count
|
124
189
|
limiter_bucket = self.limiter_bucket
|
@@ -127,6 +192,10 @@ module Splashy
|
|
127
192
|
final_count
|
128
193
|
end
|
129
194
|
|
195
|
+
# Protected
|
196
|
+
#
|
197
|
+
# Raises a DistributionUnsatisfiedError if we can't meet the wanted
|
198
|
+
# distribution or count (or both).
|
130
199
|
def assert_satisfied!
|
131
200
|
if @total_count < @wanted_distribution.size
|
132
201
|
raise DistributionUnsatisfiedError.new(
|
@@ -156,7 +225,11 @@ module Splashy
|
|
156
225
|
end
|
157
226
|
end
|
158
227
|
|
159
|
-
#
|
228
|
+
# Protected
|
229
|
+
#
|
230
|
+
# Return the Bucket that is the current limiter in the distribution. In
|
231
|
+
# other words, this bucket is limiting the total size of the final
|
232
|
+
# selection.
|
160
233
|
def limiter_bucket
|
161
234
|
# Smallest value of "count / desired percent" is the limiter.
|
162
235
|
@buckets.values.map do |bucket|
|
data/splashy.gemspec
CHANGED
@@ -227,7 +227,7 @@ describe Splashy::Buckets do
|
|
227
227
|
fill_with_counts( 10, 2, 40 )
|
228
228
|
assert @buckets.satisfied?
|
229
229
|
assert_equal(
|
230
|
-
{:a=>[
|
230
|
+
{:a=>[], :b=>["20", "21"], :c=>["30", "31", "32", "33", "34", "35"]},
|
231
231
|
@buckets.select
|
232
232
|
)
|
233
233
|
end
|
@@ -237,15 +237,41 @@ describe Splashy::Buckets do
|
|
237
237
|
fill_with_counts( 3, 3, 3 )
|
238
238
|
assert @buckets.satisfied?
|
239
239
|
assert_equal(
|
240
|
-
{:a=>[
|
240
|
+
{:a=>[], :b=>[], :c=>["30", "31", "32"]},
|
241
241
|
@buckets.select
|
242
242
|
)
|
243
243
|
end
|
244
244
|
end
|
245
245
|
|
246
|
+
describe "variances" do
|
247
|
+
it "reports on a pool with an even distribution" do
|
248
|
+
@buckets = Splashy::Buckets.new( {:a => 0.33, :b => 0.33, :c => 0.34} )
|
249
|
+
fill_with_counts( 10, 2, 40 )
|
250
|
+
assert_equal( [:b, :a, :c], @buckets.neediest_buckets )
|
251
|
+
end
|
252
|
+
|
253
|
+
it "reports on a pool with an uneven distribution" do
|
254
|
+
@buckets = Splashy::Buckets.new( {:a => 0.33, :b => 0.33, :c => 0.34}, 3 )
|
255
|
+
fill_with_counts( 10, 2, 40 )
|
256
|
+
assert_equal( [:b, :a, :c], @buckets.neediest_buckets )
|
257
|
+
end
|
258
|
+
|
259
|
+
it "reports on a pool with a skewed distribution" do
|
260
|
+
@buckets = Splashy::Buckets.new( {:a => 0.01, :b => 0.19, :c => 0.80} )
|
261
|
+
fill_with_counts( 10, 2, 1 )
|
262
|
+
assert_equal( [:c, :b, :a], @buckets.neediest_buckets )
|
263
|
+
end
|
264
|
+
|
265
|
+
it "reports on a pool with a wacky distribution" do
|
266
|
+
@buckets = Splashy::Buckets.new( {:a => 0.01, :b => 0.01, :c => 0.98} )
|
267
|
+
fill_with_counts( 3, 3, 3 )
|
268
|
+
assert_equal( [:c, :a, :b], @buckets.neediest_buckets )
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
246
272
|
describe "performance" do
|
247
273
|
it "grows linearly with more elements" do
|
248
|
-
puts
|
274
|
+
puts # Formatting...
|
249
275
|
assert_performance_linear 0.999 do |n|
|
250
276
|
@buckets = Splashy::Buckets.new( :a => 0.20, :b => 0.30, :c => 0.50 )
|
251
277
|
n.times do |i|
|
@@ -257,3 +283,4 @@ describe Splashy::Buckets do
|
|
257
283
|
end
|
258
284
|
end
|
259
285
|
end
|
286
|
+
|