splashy 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +6 -0
- data/VERSION +1 -1
- data/lib/splashy/bucket.rb +6 -2
- data/lib/splashy/buckets.rb +112 -39
- data/splashy.gemspec +1 -1
- data/test/test_splashy_buckets.rb +30 -3
- metadata +2 -2
data/README.markdown
CHANGED
@@ -46,6 +46,12 @@ buckets.select
|
|
46
46
|
Changelog
|
47
47
|
=========
|
48
48
|
|
49
|
+
* 0.1.0 - Several bug fixes, add "neediest_buckets" method to Buckets to allow
|
50
|
+
you to choose which buckets to add to first if an element can be put in
|
51
|
+
multiple buckets, final distributions can now have empty buckets if it means
|
52
|
+
we meet the wanted distribution better (e.g. a 99% / 1% distribution with 5
|
53
|
+
and 1 elements, respectively, which will now select 4 and 0 elements if your
|
54
|
+
wanted count is 4).
|
49
55
|
* 0.0.2 - Raise `ArgumentError` when trying to add to a bucket that doesn't
|
50
56
|
exist, don't consider an empty bucket "satisfied".
|
51
57
|
* 0.0.1 - Initial release.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0
|
1
|
+
0.1.0
|
data/lib/splashy/bucket.rb
CHANGED
data/lib/splashy/buckets.rb
CHANGED
@@ -17,13 +17,13 @@ module Splashy
|
|
17
17
|
@total_count = 0
|
18
18
|
end
|
19
19
|
|
20
|
-
# Public: Put elements into buckets.
|
20
|
+
# Public: Put elements into buckets with a block.
|
21
21
|
#
|
22
22
|
# bucket_name - If supplied, all yielded elements will be added to that
|
23
23
|
# bucket.
|
24
|
-
# &block
|
25
|
-
#
|
26
|
-
#
|
24
|
+
# &block - A block that returns (if `bucket_name` is not supplied) an
|
25
|
+
# Array: [bucket_name, element]. If `bucket_name` is supplied, only
|
26
|
+
# the element needs to be returned.
|
27
27
|
#
|
28
28
|
# Examples
|
29
29
|
#
|
@@ -50,7 +50,10 @@ module Splashy
|
|
50
50
|
@total_count += 1
|
51
51
|
end
|
52
52
|
|
53
|
-
#
|
53
|
+
# Public
|
54
|
+
#
|
55
|
+
# Returns true if the conditions (distribution and, optionally, count) are
|
56
|
+
# satisfied enough to do a final selection of elements.
|
54
57
|
def satisfied?
|
55
58
|
begin
|
56
59
|
self.assert_satisfied!
|
@@ -64,61 +67,123 @@ module Splashy
|
|
64
67
|
# distribution. If a satisfactory distribution is not possible, a
|
65
68
|
# DistributionUnsatisfiedError is raised.
|
66
69
|
#
|
67
|
-
# Returns a Hash of elements
|
70
|
+
# Returns a Hash of elements matching the desired distribution, keyed by
|
68
71
|
# the bucket names.
|
69
72
|
def select
|
70
73
|
self.assert_satisfied!
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
selected
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
74
|
+
selected = self._select_wanted
|
75
|
+
# Sometimes we need to fudge by a few to meet the `@wanted_count`
|
76
|
+
selected = self.trim( selected, @wanted_count ) if @wanted_count
|
77
|
+
selected
|
78
|
+
end
|
79
|
+
|
80
|
+
# Array of the buckets that need more elements to match the desired
|
81
|
+
# distribution, sorted descending by how much more they need.
|
82
|
+
def neediest_buckets
|
83
|
+
multipliers = self.needed_multipliers( self._select_all, @wanted_distribution ).to_a
|
84
|
+
multipliers.sort! { |a, b| b[1] <=> a[1] } # Sort on multiplier descending
|
85
|
+
multipliers.map{ |bucket_name, multiplier| bucket_name }
|
86
|
+
end
|
87
|
+
|
88
|
+
protected
|
89
|
+
|
90
|
+
# Protected
|
91
|
+
#
|
92
|
+
# Returns Hash of all bucket elements, keyed by bucket name.
|
93
|
+
def _select_all
|
94
|
+
@buckets.values.inject({}) do |memo, bucket|
|
95
|
+
memo[bucket.name] = bucket.elements
|
79
96
|
memo
|
80
97
|
end
|
98
|
+
end
|
99
|
+
|
100
|
+
# Protected
|
101
|
+
#
|
102
|
+
# Returns Hash of bucket elements, matching the wanted distribution as
|
103
|
+
# closely as possible.
|
104
|
+
def _select_wanted
|
105
|
+
final_count = self.estimated_final_count
|
81
106
|
|
82
|
-
|
83
|
-
|
107
|
+
@buckets.values.inject({}) do |memo, bucket|
|
108
|
+
count = ( final_count * @wanted_distribution[bucket.name] ).round
|
109
|
+
count = [1, count].max # Ensure every bucket has at least one element
|
110
|
+
memo[bucket.name] = bucket.elements( count )
|
111
|
+
memo
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
# Protected: Trim a given Hash of Arrays -- keyed by bucket names -- until
|
116
|
+
# it satisfies @wanted_count.
|
117
|
+
#
|
118
|
+
# selected - A Hash of selected elements, keyed by the bucket names. All
|
119
|
+
# values must be Arrays (or respond to `size`).
|
120
|
+
# size - The desired total size of `selected`.
|
121
|
+
def trim( selected, size )
|
122
|
+
raise ArgumentError.new( "Can't trim to a nil size" ) unless size
|
123
|
+
while self.class.elements_count( selected ) > size
|
124
|
+
candidates = self.trim_candidates( selected, @wanted_distribution )
|
125
|
+
selected[candidates.first].pop
|
126
|
+
end
|
84
127
|
|
85
128
|
selected
|
86
129
|
end
|
87
130
|
|
88
|
-
|
131
|
+
# Protected
|
132
|
+
#
|
133
|
+
# current_selections - Hash of element Arrays, keyed by bucket name.
|
134
|
+
# wanted_distribution - The wanted distribution as a hash of percentage
|
135
|
+
# Floats.
|
136
|
+
#
|
137
|
+
# Returns Array of bucket names for buckets that are good trim candidates,
|
138
|
+
# ordered by best candidates first.
|
139
|
+
def trim_candidates( current_selections, wanted_distribution )
|
140
|
+
multipliers = self.needed_multipliers( current_selections, wanted_distribution ).to_a
|
141
|
+
multipliers.select do |bucket_name, multiplier|
|
142
|
+
# Can't trim empty buckets
|
143
|
+
@buckets[bucket_name].count != 0
|
144
|
+
end
|
145
|
+
return multipliers if multipliers.empty?
|
146
|
+
multipliers.sort! { |a, b| a[1] <=> b[1] } # Sort on multiplier ascending
|
147
|
+
multipliers.map{ |bucket_name, multiplier| bucket_name }
|
148
|
+
end
|
89
149
|
|
90
|
-
#
|
91
|
-
#
|
92
|
-
|
93
|
-
|
150
|
+
# Protected
|
151
|
+
#
|
152
|
+
# current_selections - Hash of element Arrays, keyed by bucket name.
|
153
|
+
# wanted_distribution - The wanted distribution as a hash of percentage
|
154
|
+
# Floats.
|
155
|
+
#
|
156
|
+
# Returns Hash of multipliers needed for each bucket to reach its current
|
157
|
+
# wanted distribution.
|
158
|
+
def needed_multipliers( current_selections, wanted_distribution )
|
159
|
+
total_size = self.class.elements_count( current_selections )
|
94
160
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
memo << [bucket_name, variance]
|
104
|
-
end
|
105
|
-
memo
|
161
|
+
current_selections.keys.inject({}) do |memo, bucket_name|
|
162
|
+
bucket_size = current_selections[bucket_name].size
|
163
|
+
desired_pct = wanted_distribution[bucket_name]
|
164
|
+
current_pct = bucket_size.to_f / total_size
|
165
|
+
if current_pct > 0
|
166
|
+
memo[bucket_name] = desired_pct / current_pct
|
167
|
+
else
|
168
|
+
memo[bucket_name] = 1 / 0.0 # Infinity
|
106
169
|
end
|
107
|
-
|
108
|
-
trim_bucket_name = variances.sort{ |a, b| a[1] }[0][0] # Smallest variance
|
109
|
-
selected[trim_bucket_name].pop
|
170
|
+
memo
|
110
171
|
end
|
111
|
-
|
112
|
-
selected
|
113
172
|
end
|
114
173
|
|
174
|
+
# Protected
|
175
|
+
#
|
176
|
+
# hash - Hash of Objects that respond to `count` (usually Arrays).
|
177
|
+
#
|
115
178
|
# Returns count of all elements in the Hash's Array values.
|
116
179
|
def self.elements_count( hash )
|
117
180
|
hash.values.inject(0){ |memo, array| memo + array.count }
|
118
181
|
end
|
119
182
|
|
183
|
+
# Protected
|
184
|
+
#
|
120
185
|
# Returns projected final number of elements that will be returned to
|
121
|
-
# satisfy the requirements. If this is less than `@wanted_count`,
|
186
|
+
# satisfy the requirements. If this is less than `@wanted_count`, when
|
122
187
|
# supplied, we can't meet the requirements.
|
123
188
|
def estimated_final_count
|
124
189
|
limiter_bucket = self.limiter_bucket
|
@@ -127,6 +192,10 @@ module Splashy
|
|
127
192
|
final_count
|
128
193
|
end
|
129
194
|
|
195
|
+
# Protected
|
196
|
+
#
|
197
|
+
# Raises a DistributionUnsatisfiedError if we can't meet the wanted
|
198
|
+
# distribution or count (or both).
|
130
199
|
def assert_satisfied!
|
131
200
|
if @total_count < @wanted_distribution.size
|
132
201
|
raise DistributionUnsatisfiedError.new(
|
@@ -156,7 +225,11 @@ module Splashy
|
|
156
225
|
end
|
157
226
|
end
|
158
227
|
|
159
|
-
#
|
228
|
+
# Protected
|
229
|
+
#
|
230
|
+
# Return the Bucket that is the current limiter in the distribution. In
|
231
|
+
# other words, this bucket is limiting the total size of the final
|
232
|
+
# selection.
|
160
233
|
def limiter_bucket
|
161
234
|
# Smallest value of "count / desired percent" is the limiter.
|
162
235
|
@buckets.values.map do |bucket|
|
data/splashy.gemspec
CHANGED
@@ -227,7 +227,7 @@ describe Splashy::Buckets do
|
|
227
227
|
fill_with_counts( 10, 2, 40 )
|
228
228
|
assert @buckets.satisfied?
|
229
229
|
assert_equal(
|
230
|
-
{:a=>[
|
230
|
+
{:a=>[], :b=>["20", "21"], :c=>["30", "31", "32", "33", "34", "35"]},
|
231
231
|
@buckets.select
|
232
232
|
)
|
233
233
|
end
|
@@ -237,15 +237,41 @@ describe Splashy::Buckets do
|
|
237
237
|
fill_with_counts( 3, 3, 3 )
|
238
238
|
assert @buckets.satisfied?
|
239
239
|
assert_equal(
|
240
|
-
{:a=>[
|
240
|
+
{:a=>[], :b=>[], :c=>["30", "31", "32"]},
|
241
241
|
@buckets.select
|
242
242
|
)
|
243
243
|
end
|
244
244
|
end
|
245
245
|
|
246
|
+
describe "variances" do
|
247
|
+
it "reports on a pool with an even distribution" do
|
248
|
+
@buckets = Splashy::Buckets.new( {:a => 0.33, :b => 0.33, :c => 0.34} )
|
249
|
+
fill_with_counts( 10, 2, 40 )
|
250
|
+
assert_equal( [:b, :a, :c], @buckets.neediest_buckets )
|
251
|
+
end
|
252
|
+
|
253
|
+
it "reports on a pool with an uneven distribution" do
|
254
|
+
@buckets = Splashy::Buckets.new( {:a => 0.33, :b => 0.33, :c => 0.34}, 3 )
|
255
|
+
fill_with_counts( 10, 2, 40 )
|
256
|
+
assert_equal( [:b, :a, :c], @buckets.neediest_buckets )
|
257
|
+
end
|
258
|
+
|
259
|
+
it "reports on a pool with a skewed distribution" do
|
260
|
+
@buckets = Splashy::Buckets.new( {:a => 0.01, :b => 0.19, :c => 0.80} )
|
261
|
+
fill_with_counts( 10, 2, 1 )
|
262
|
+
assert_equal( [:c, :b, :a], @buckets.neediest_buckets )
|
263
|
+
end
|
264
|
+
|
265
|
+
it "reports on a pool with a wacky distribution" do
|
266
|
+
@buckets = Splashy::Buckets.new( {:a => 0.01, :b => 0.01, :c => 0.98} )
|
267
|
+
fill_with_counts( 3, 3, 3 )
|
268
|
+
assert_equal( [:c, :a, :b], @buckets.neediest_buckets )
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
246
272
|
describe "performance" do
|
247
273
|
it "grows linearly with more elements" do
|
248
|
-
puts
|
274
|
+
puts # Formatting...
|
249
275
|
assert_performance_linear 0.999 do |n|
|
250
276
|
@buckets = Splashy::Buckets.new( :a => 0.20, :b => 0.30, :c => 0.50 )
|
251
277
|
n.times do |i|
|
@@ -257,3 +283,4 @@ describe Splashy::Buckets do
|
|
257
283
|
end
|
258
284
|
end
|
259
285
|
end
|
286
|
+
|