huge_enumerable 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +20 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +17 -0
- data/huge_enumerable.gemspec +28 -0
- data/lib/huge_enumerable/huge_collection.rb +83 -0
- data/lib/huge_enumerable/huge_combination.rb +118 -0
- data/lib/huge_enumerable/huge_permutation.rb +67 -0
- data/lib/huge_enumerable/huge_product.rb +77 -0
- data/lib/huge_enumerable/version.rb +4 -0
- data/lib/huge_enumerable.rb +357 -0
- data/spec/lib/huge_enumerable/huge_collection_spec.rb +33 -0
- data/spec/lib/huge_enumerable/huge_combination_spec.rb +38 -0
- data/spec/lib/huge_enumerable/huge_permutation_spec.rb +38 -0
- data/spec/lib/huge_enumerable/huge_product_spec.rb +35 -0
- data/spec/lib/huge_enumerable_spec.rb +642 -0
- data/spec/spec_helper.rb +1 -0
- metadata +175 -0
@@ -0,0 +1,357 @@
|
|
1
|
+
require "huge_enumerable/version"
|
2
|
+
|
3
|
+
require 'backports' if RUBY_VERSION < '1.9'
|
4
|
+
require 'prime'
|
5
|
+
require 'prime_miller_rabin'
|
6
|
+
|
7
|
+
Prime::MillerRabin.speed_intercept
|
8
|
+
|
9
|
+
# HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite)
|
10
|
+
# data sets without requiring them to be in memory.
|
11
|
+
# In addition to enumerable abilities, it also allows for shuffling, sampling, shifting, and popping as if it were
|
12
|
+
# an array. These actions do not require the entire data set to be in memory, nor do they alter the original
|
13
|
+
# data set in any fashion.
|
14
|
+
#
|
15
|
+
# To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch.
|
16
|
+
# collection_size should return the size of the full data set.
|
17
|
+
# fetch should return the value at the given index.
|
18
|
+
# It is guaranteed that fetch will always be called with values in the range of (0...collection_size)
|
19
|
+
# It will never be called with a negative index or with an index >= collection_size
|
20
|
+
class HugeEnumerable

  include Enumerable

  # Default cap on the number of elements yielded by #each (and therefore #to_a)
  # when max_array_size has not been set explicitly. Currently 100,000 elements.
  DEFAULT_MAX_ARRAY_SIZE = 100_000

  # The maximum number of elements to be returned when to_a is called.
  # If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller.
  # (The reader is defined below so that the default can be computed lazily.)
  attr_writer :max_array_size

  # The random number generator to use for shuffles and samples. Defaults to self#rand.
  # It is invoked as rng.call(upper_bound).
  attr_accessor :rng

  # Create a new HugeEnumerable
  #
  # ==== Attributes
  #
  # * +max_array_size+ - The default size of arrays when #to_a is called. Coerced with to_i when given.
  # * +rng+ - The random number generator to use. Defaults to self#rand.
  def initialize(max_array_size = nil, rng = nil)
    @max_array_size = max_array_size ? max_array_size.to_i : nil
    @rng = rng || method(:rand)
    @collection_increment = 1
    @start_of_sequence = 0
    @shuffle_head = 0
  end

  # Element Reference — Returns the element at index, or returns a subarray starting at the start index and
  # continuing for length elements, or returns a subarray specified by range of indices.
  # Negative indices count backward from the end of the collection (-1 is the last element).
  # For start and range cases the starting index is just before an element.
  # Additionally, an empty array is returned when the starting index for an element range is at the end of the collection.
  # Returns nil if the index (or starting index) are out of range.
  # Both inclusive (a..b) and exclusive (a...b) ranges are honoured.
  # ==== Attributes
  #
  # * +index_or_range+ - Either an integer for single element selection or length selection, or a range.
  #
  # ==== Options
  #
  # * +:length+ - The number of elements to return if index_or_range is not a range.
  def [](index_or_range, length = nil)
    # TODO: Consider changing this to return HugeCollection
    if index_or_range.is_a?(Range)
      range_slice(index_or_range)
    elsif length
      length_slice(index_or_range.to_i, length)
    else
      _fetch(index_or_range.to_i)
    end
  end

  # Calls the given block once for each element remaining in the collection, passing that element as a parameter.
  # Unlike #each this is not capped by max_array_size.
  # Returns an Enumerator if no block is given.
  def collection_each(&block) # :yields: element
    return enum_for(:collection_each) unless block_given?
    size.times { |i| yield _fetch(i) }
  end

  # When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself.
  # If no block is given, a HugeCombination is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugeCombination. This may change in the future.
  def combination(n) # :yields: element
    combo = HugeCombination.new(dup.reset!, n, max_array_size, inherited_rng)
    return combo unless block_given?
    combo.each { |x| yield x }
    self
  end

  # Calls the given block once for each element in the next array of the collection, passing that element as a parameter.
  # At most max_array_size elements are yielded.
  # Returns an Enumerator if no block is given.
  def each # :yields: element
    return enum_for(:each) unless block_given?
    remaining_or(max_array_size).times { |i| yield _fetch(i) }
  end

  def max_array_size #:nodoc:
    @max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min
  end

  # Shifts max_array_size elements and returns the following array from to_a.
  def next_array
    shift(max_array_size)
    to_a
  end

  # Returns true if the collection contains no more elements.
  def empty?
    # Go through the lazy end_of_sequence reader: the raw instance variable is nil
    # until the size has been asked for, which made a fresh empty collection look non-empty.
    start_of_sequence >= end_of_sequence
  end

  # When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself.
  # If no block is given, a HugePermutation is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugePermutation. This may change in the future.
  def permutation(n) # :yields: element
    perm = HugePermutation.new(dup.reset!, n, max_array_size, inherited_rng)
    return perm unless block_given?
    perm.each { |x| yield x }
    self
  end

  # Removes the last element from the collection and returns it, or nil if the collection is empty.
  # If a number n is given, returns an array of the last n elements (or less).
  def pop(n = nil)
    result = element_or_array(n) { pop1 }
    # Elements are popped back-to-front, so restore collection order for the array form.
    n ? result.reverse : result
  end

  # When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself.
  # If no block is given, a HugeProduct is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugeProduct. This may change in the future.
  # other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future.
  def product(other_enumerable) # :yields: element
    other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable)
    prod = HugeProduct.new(dup.reset!, other_enumerable, max_array_size, inherited_rng)
    return prod unless block_given?
    prod.each { |x| yield x }
    self
  end

  # Choose a random element or n random elements from the collection.
  # The elements are chosen by using random and unique indices into the array in order to ensure
  # that an element does not repeat itself unless the collection already contained duplicate elements.
  # If the collection is empty the first form returns nil and the second form returns an empty array.
  # The optional rng argument will be used as the random number generator; when it is omitted
  # the collection's own rng is used.
  #
  # Accepted forms: sample, sample(n), sample(rng), sample(n, rng).
  # Note that sample(rng) returns a one element array.
  # Raises ArgumentError when more than two arguments are given.
  def sample(*args)
    raise ArgumentError, "wrong number of arguments (#{args.size} for 2)" if args.size > 2

    count, generator = args
    if args.size == 1 && (count.is_a?(Proc) || count.is_a?(Method))
      # A lone callable argument is the generator, not a count.
      generator = count
      count = 1
    end
    generator ||= rng

    element_or_array(count) { sample1(generator) }
  end

  # Removes the first element of the collection and returns it (shifting all other elements down by one).
  # Returns nil if the collection is empty.
  # If a number n is given, returns an array of the first n elements (or less).
  # With collection containing only the remainder elements, not including what was shifted to returned array.
  def shift(n = nil)
    element_or_array(n) { shift1 }
  end

  # Returns a new HugeEnumerable with the order of the elements of the new collection randomized.
  # ==== Options
  # * +rng+ - The random number generator to use. Defaults to self#rng.
  # ==== Side Effects
  # The new collection is reset to the current collection's original size and elements before shuffling.
  def shuffle(rng = nil)
    dup.shuffle!(rng)
  end

  # Randomly reorders the elements of the collection.
  # ==== Options
  # * +rng+ - The random number generator to use. Defaults to self#rng.
  # ==== Side Effects
  # The collection is reset to its original size and elements before shuffling
  def shuffle!(rng = nil)
    rng ||= self.rng
    reset!
    @shuffle_head = rng.call(collection_size)
    @collection_increment = full_cycle_increment(collection_size)
    self
  end

  # Returns the current size of the collection.
  # Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array.
  def size
    end_of_sequence - start_of_sequence
  end

  protected

  # Restores the window to the full collection. The end of the window is cleared so that it is
  # recomputed from collection_size on next use. Returns self.
  def reset!
    @start_of_sequence = 0
    @end_of_sequence = nil
    self
  end

  private

  attr_reader :shuffle_head, :start_of_sequence, :collection_increment

  # Subclass hook: the size of the full data set.
  def collection_size
    raise NotImplementedError, "not implemented for #{self.class.name}"
  end

  # Absolute index one past the last remaining element; lazily initialised from collection_size.
  def end_of_sequence
    @end_of_sequence ||= collection_size
  end

  # Subclass hook: the value at absolute index x of the full data set.
  def fetch(x)
    raise NotImplementedError, "not implemented for #{self.class.name}"
  end

  # The rng to hand to derived collections, or nil when this collection still uses its own default rand.
  def inherited_rng
    rng != method(:rand) ? rng : nil
  end

  # Memoized Prime::MillerRabin instance, passed to Prime.prime? as the generator.
  def miller_rabin
    @miller_rabin ||= Prime::MillerRabin.new
  end

  # Returns the smallest prime strictly greater than x.
  def next_prime(x)
    if x < 2
      2
    elsif x < 3
      3
    elsif x < 5
      5
    else
      # Only odd candidates are tried; a candidate ending in 3 steps by 4 so that the
      # multiple of five that follows it is skipped.
      x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2))
      x += (x % 10 == 3 ? 4 : 2) until Prime.prime?(x, miller_rabin)
      x
    end
  end

  # Removes and returns the last remaining element.
  def pop1
    result = _fetch(size - 1)
    @end_of_sequence -= 1
    result
  end

  # x capped at the number of remaining elements.
  def remaining_or(x)
    [x, size].min
  end

  # Maps an absolute index to its position in the shuffled order (identity until shuffle! is called,
  # because the head is 0 and the increment is 1). Passes nil through.
  def shuffle_index(index)
    index ? (shuffle_head + collection_increment * index) % collection_size : nil
  end

  # Converts an index relative to the remaining elements (negative counts from the end) into an
  # absolute index, or nil when it falls outside the remaining elements.
  def relative_index(index)
    # Normalise against the remaining size first; adding start_of_sequence to an
    # end-relative value would count the shifted-off elements twice.
    index += size if index < 0
    return nil if index < 0

    absolute = index + start_of_sequence
    absolute < end_of_sequence ? absolute : nil
  end

  # Removes and returns the first remaining element.
  def shift1
    result = _fetch(0)
    @start_of_sequence += 1
    result
  end

  # Fetches the element at a relative index, honouring shifts, pops and shuffling.
  # Returns nil when the index is out of range.
  def _fetch(index)
    index = shuffle_index(relative_index(index))
    index ? fetch(index) : nil
  end

  # Returns one sampled element. The first position comes from rng; subsequent positions step
  # by a prime increment modulo the current size, which is recomputed whenever the size changes.
  def sample1(rng)
    if @sample_position.nil? || @sample_position >= size
      @sample_position = rng.call(size)
    else
      if @last_sample_size != size
        @last_sample_size = size
        @sample_increment = full_cycle_increment(size)
      end
      @sample_position = (@sample_position + @sample_increment) % size
    end
    _fetch(@sample_position)
  end

  # Picks a prime step close to domain_size * 2 / (1 + sqrt(5)) (about 0.618 of the domain).
  # A step equal to domain_size is replaced by the next prime, since it would never advance.
  # Presumably chosen so that repeated stepping modulo domain_size visits every index.
  def full_cycle_increment(domain_size)
    increment = next_prime((2 * domain_size / (1 + Math.sqrt(5))).to_i)
    increment == domain_size ? next_prime(increment + 1) : increment
  end

  # Slice described by a Range, following Array#[] conventions: nil when the start is out of
  # bounds, [] when the range is empty, otherwise an array clipped to the remaining elements.
  def range_slice(range)
    first = range.first
    first += size if first < 0
    return nil if first < 0 || first > size

    stop = range.last
    stop += size if stop < 0
    stop += 1 unless range.exclude_end? # stop is one past the last wanted element
    stop = size if stop > size

    count = stop - first
    count < 0 ? [] : element_or_array(count) { |i| _fetch(i + first) }
  end

  # Slice described by a start index and a length; nil when either ends up negative.
  def length_slice(index, length)
    index += size if index < 0
    length = size - index if index + length > size
    return nil if index < 0 || length < 0

    element_or_array(length) { |i| _fetch(i + index) }
  end

  # Shared driver for the "one element or an array of n" methods.
  # With n nil the block is called once and its value returned (nil when empty).
  # Otherwise the block is called with 0, 1, ... for at most n remaining elements and the
  # results are returned as an array ([] when empty).
  # Raises ArgumentError when n is negative.
  def element_or_array(n = nil)
    unless n.nil?
      n = n.to_i
      raise ArgumentError, 'negative array size' if n < 0
    end

    if empty?
      n.nil? ? nil : []
    elsif n
      (0...remaining_or(n)).map { |x| yield(x) }
    else
      yield
    end
  end

end
|
350
|
+
|
351
|
+
require 'huge_enumerable/huge_collection'
|
352
|
+
require 'huge_enumerable/huge_combination'
|
353
|
+
require 'huge_enumerable/huge_permutation'
|
354
|
+
require 'huge_enumerable/huge_product'
|
355
|
+
|
356
|
+
|
357
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe HugeCollection do

  let(:enumerable) { ('a'..'z').to_a }

  # The hooks under test are made public on the class so the examples can call them directly.
  subject(:collection) do
    [:collection_size, :fetch].each { |hook| HugeCollection.send(:public, hook) }
    HugeCollection.new(enumerable)
  end

  context "#collection_size" do

    it "is equal to the original enumerable size" do
      collection.collection_size.should eql(enumerable.size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as enumerable[]" do
      # Walk both sources index by index and compare the resulting sequences.
      expected = (0...enumerable.size).map { |i| enumerable[i] }
      actual = (0...collection.collection_size).map { |i| collection.fetch(i) }
      actual.should eql(expected)
    end

  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe HugeCombination do

  let(:enumerable) { ('a'..'z').to_a }

  # The hooks under test are made public on the class so the examples can call them directly.
  subject(:combination) do
    [:collection_size, :fetch].each { |hook| HugeCombination.send(:public, hook) }
    HugeCombination.new(enumerable, 2)
  end

  # Reference result from Array#combination, cached per combination length.
  def enum_combo(x)
    @cache ||= {}
    key = x.to_i
    @cache[key] ||= enumerable.combination(x).to_a
  end

  context "#collection_size" do

    it "is equal to array#combination.to_a.size" do
      combination.collection_size.should eql(enum_combo(2).size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#combination.to_a[]" do
      # Walk both sources index by index and compare the resulting sequences.
      expected = (0...enum_combo(2).size).map { |i| enum_combo(2)[i] }
      actual = (0...combination.collection_size).map { |i| combination.fetch(i) }
      actual.should eql(expected)
    end

  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe HugePermutation do

  let(:enumerable) { ('a'..'z').to_a }

  # The hooks under test are made public on the class so the examples can call them directly.
  subject(:permutation) do
    [:collection_size, :fetch].each { |hook| HugePermutation.send(:public, hook) }
    HugePermutation.new(enumerable, 2)
  end

  # Reference result from Array#permutation, cached per permutation length.
  def enum_perm(x)
    @cache ||= {}
    key = x.to_i
    @cache[key] ||= enumerable.permutation(x).to_a
  end

  context "#collection_size" do

    it "is equal to array#permutation.to_a.size" do
      permutation.collection_size.should eql(enum_perm(2).size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#permutation.to_a[]" do
      # Walk both sources index by index and compare the resulting sequences.
      expected = (0...enum_perm(2).size).map { |i| enum_perm(2)[i] }
      actual = (0...permutation.collection_size).map { |i| permutation.fetch(i) }
      actual.should eql(expected)
    end

  end

end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe HugeProduct do

  let(:enumerable_a) { ('a'..'z').to_a }
  let(:enumerable_b) { ('A'..'Z').to_a }
  # Reference result from Array#product.
  let(:enum_prod) { enumerable_a.product(enumerable_b) }

  # The hooks under test are made public on the class so the examples can call them directly.
  subject(:product) do
    [:collection_size, :fetch].each { |hook| HugeProduct.send(:public, hook) }
    HugeProduct.new(enumerable_a, enumerable_b)
  end

  context "#collection_size" do

    it "is equal to array#product(other_ary).size" do
      product.collection_size.should eql(enum_prod.size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#product(other_ary)[]" do
      # Walk both sources index by index and compare the resulting sequences.
      expected = (0...enum_prod.size).map { |i| enum_prod[i] }
      actual = (0...product.collection_size).map { |i| product.fetch(i) }
      actual.should eql(expected)
    end

  end

end
|