huge_enumerable 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +17 -0
- data/huge_enumerable.gemspec +28 -0
- data/lib/huge_enumerable/huge_collection.rb +83 -0
- data/lib/huge_enumerable/huge_combination.rb +118 -0
- data/lib/huge_enumerable/huge_permutation.rb +67 -0
- data/lib/huge_enumerable/huge_product.rb +77 -0
- data/lib/huge_enumerable/version.rb +4 -0
- data/lib/huge_enumerable.rb +357 -0
- data/spec/lib/huge_enumerable/huge_collection_spec.rb +33 -0
- data/spec/lib/huge_enumerable/huge_combination_spec.rb +38 -0
- data/spec/lib/huge_enumerable/huge_permutation_spec.rb +38 -0
- data/spec/lib/huge_enumerable/huge_product_spec.rb +35 -0
- data/spec/lib/huge_enumerable_spec.rb +642 -0
- data/spec/spec_helper.rb +1 -0
- metadata +175 -0
@@ -0,0 +1,357 @@
|
|
1
|
+
require "huge_enumerable/version"
|
2
|
+
|
3
|
+
require 'backports' if RUBY_VERSION < '1.9'
|
4
|
+
require 'prime'
|
5
|
+
require 'prime_miller_rabin'
|
6
|
+
|
7
|
+
Prime::MillerRabin.speed_intercept
|
8
|
+
|
9
|
+
# HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite)
|
10
|
+
# data sets without requiring them to be in memory.
|
11
|
+
# In addition to enumerable abilities, it also allows for shuffling, sampling, shifting, and popping as if it were
|
12
|
+
# an array. These actions also do not require the entire data set to be in memory. Nor do they alter the original
|
13
|
+
# data set in any fashion.
|
14
|
+
#
|
15
|
+
# To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch.
|
16
|
+
# collection_size should return the size of the full data set.
|
17
|
+
# fetch should return the value at the given index.
|
18
|
+
# It is guaranteed that fetch will always be called with values in the range of (0...collection_size)
|
19
|
+
# It will never be called with a negative index or with an index >= collection_size
|
20
|
+
class HugeEnumerable

  include Enumerable

  # Currently 100,000 elements
  DEFAULT_MAX_ARRAY_SIZE = 100000

  # The maximum number of elements to be returned when to_a is called.
  # If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller.
  # Only the writer is generated here; the reader is defined explicitly below so it can apply that default lazily.
  attr_writer :max_array_size

  # The random number generator to use for shuffles and samples. Defaults to self#rand.
  attr_accessor :rng

  # Create a new HugeEnumerable
  #
  # ==== Options
  #
  # * +:max_array_size+ - The default size of arrays when #to_a is called.
  # * +:rng+ - The random number generator to use.
  def initialize(max_array_size = nil, rng = nil)
    @max_array_size = max_array_size ? max_array_size.to_i : nil
    @rng = rng || method(:rand)
    @collection_increment = 1
    @start_of_sequence = 0
    @shuffle_head = 0
  end

  # Element Reference — Returns the element at index, or returns a subarray starting at the start index and
  # continuing for length elements, or returns a subarray specified by range of indices.
  # Negative indices count backward from the end of the collection (-1 is the last element).
  # For start and range cases the starting index is just before an element.
  # Additionally, an empty array is returned when the starting index for an element range is at the end of the collection.
  # Returns nil if the index (or starting index) are out of range.
  # Ranges that exclude their end (a...b) omit the final element, and ranges with an open begin or end
  # extend to the corresponding edge of the collection.
  # ==== Attributes
  #
  # * +index_or_range+ - Either an integer for single element selection or length selection, or a range.
  #
  # ==== Options
  #
  # * +:length+ - The number of elements to return if index_or_range is not a range.
  def [](index_or_range, length = nil)
    # TODO: Consider changing this to return HugeCollection
    if index_or_range.is_a?(Range)
      slice_range(index_or_range)
    elsif length
      slice_length(index_or_range.to_i, length)
    else
      _fetch(index_or_range.to_i)
    end
  end

  # Calls the given block once for each element remaining in the collection, passing that element as a parameter.
  # If no block is given, an Enumerator over the same elements is returned instead.
  def collection_each(&block) # :yields: element
    return enum_for(:collection_each) unless block_given?
    size.times { |i| yield _fetch(i) }
  end

  # When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself.
  # If no block is given, an HugeCombination is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugeCombination. This may change in the future.
  def combination(n) # :yields: element
    combo = HugeCombination.new(dup.reset!, n, max_array_size, custom_rng)
    return combo unless block_given?
    combo.each { |x| yield x }
    self
  end

  # Calls the given block once for each element in the next array of the collection, passing that element as a parameter.
  # At most max_array_size elements are yielded.
  # If no block is given, an Enumerator over the same elements is returned instead.
  def each # :yields: element
    return enum_for(:each) unless block_given?
    remaining_or(max_array_size).times { |i| yield _fetch(i) }
  end

  def max_array_size #:nodoc:
    @max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min
  end

  # Shifts max_array_size elements and returns the following array from to_a.
  def next_array
    shift(max_array_size)
    to_a
  end

  # Returns true if the collection contains no more elements.
  def empty?
    # Go through the end_of_sequence reader: the raw instance variable is nil until it is first
    # read, which would make a zero-length collection look non-empty.
    start_of_sequence == end_of_sequence
  end

  # When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself.
  # If no block is given, a HugePermutation is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugePermutation. This may change in the future.
  def permutation(n) # :yields: element
    perm = HugePermutation.new(dup.reset!, n, max_array_size, custom_rng)
    return perm unless block_given?
    perm.each { |x| yield x }
    self
  end

  # Removes the last element from the collection and returns it, or nil if the collection is empty.
  # If a number n is given, returns an array of the last n elements (or less).
  def pop(n = nil)
    result = element_or_array(n) { pop1 }
    # Elements are popped last-to-first, so flip them back into collection order.
    n ? result.reverse : result
  end

  # When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself.
  # If no block is given, a HugeProduct is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugeProduct. This may change in the future.
  # other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future.
  def product(other_enumerable) # :yields: element
    other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable)
    prod = HugeProduct.new(dup.reset!, other_enumerable, max_array_size, custom_rng)
    return prod unless block_given?
    prod.each { |x| yield x }
    self
  end

  # Choose a random element or n random elements from the collection.
  # The elements are chosen by using random and unique indices into the array in order to ensure
  # that an element does not repeat itself unless the collection already contained duplicate elements.
  # If the collection is empty the first form returns nil and the second form returns an empty array.
  # The optional rng argument will be used as the random number generator. When it is omitted, self#rng is used.
  # A lone Proc or Method argument is treated as the rng with n = 1, so an array of one element is returned.
  def sample(*args)
    raise ArgumentError, "wrong number of arguments (#{args.size} for 2)" if args.size > 2

    if args.size == 2
      n, generator = args
    elsif args.size == 1 && (args.first.is_a?(Proc) || args.first.is_a?(Method))
      n = 1
      generator = args.first
    else
      n = args.first
      generator = nil
    end

    generator ||= rng
    element_or_array(n) { sample1(generator) }
  end

  # Removes the first element of the collection and returns it (shifting all other elements down by one).
  # Returns nil if the collection is empty.
  # If a number n is given, returns an array of the first n elements (or less).
  # With collection containing only the remainder elements, not including what was shifted to returned array.
  def shift(n = nil)
    element_or_array(n) { shift1 }
  end

  # Returns a new HugeEnumerable with the order of the elements of the new collection randomized.
  # ==== Options
  # * +rng+ - The random number generator to use. Defaults to self#rng.
  # ==== Side Effects
  # The new collection is reset to the current collection's original size and elements before shuffling.
  def shuffle(rng = nil)
    dup.shuffle!(rng)
  end

  # Randomly reorders the elements of the collection.
  # ==== Options
  # * +rng+ - The random number generator to use. Defaults to self#rng.
  # ==== Side Effects
  # The collection is reset to its original size and elements before shuffling
  def shuffle!(rng = nil)
    rng ||= self.rng
    reset!
    @shuffle_head = rng.call(collection_size)
    @collection_increment = full_cycle_increment(collection_size)
    self
  end

  # Returns the current size of the collection.
  # Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array.
  def size
    end_of_sequence - start_of_sequence
  end

  protected

  # Restores the collection to its full, unshifted and unpopped range. Returns self.
  def reset!
    @start_of_sequence = 0
    @end_of_sequence = nil
    self
  end

  private

  # end_of_sequence is deliberately absent from this list: it has an explicit lazy reader below.
  attr_reader :shuffle_head, :start_of_sequence, :collection_increment

  # Subclass hook: must return the size of the full data set.
  def collection_size
    raise NotImplementedError, "not implemented for #{self.class.name}"
  end

  # Returns the rng to hand to derived collections, or nil when the default (self#rand) is in use.
  def custom_rng
    rng != method(:rand) ? rng : nil
  end

  # Index one past the last remaining element; defaults to collection_size until a pop changes it.
  def end_of_sequence
    @end_of_sequence ||= collection_size
  end

  # Subclass hook: must return the value at the given index of the full data set.
  def fetch(x)
    raise NotImplementedError, "not implemented for #{self.class.name}"
  end

  def miller_rabin
    @miller_rabin ||= Prime::MillerRabin.new
  end

  # Returns the smallest prime strictly greater than x.
  def next_prime(x)
    if x < 2
      2
    elsif x < 3
      3
    elsif x < 5
      5
    else
      # Step over odd candidates only; a candidate ending in 3 jumps by 4 so that the
      # following value ending in 5 (a multiple of five) is never tested.
      x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2))
      x += (x % 10 == 3 ? 4 : 2) until Prime.prime?(x, miller_rabin)
      x
    end
  end

  # Removes and returns the last remaining element.
  def pop1
    result = _fetch(size - 1)
    @end_of_sequence -= 1
    result
  end

  # Returns x, capped at the number of elements remaining.
  def remaining_or(x)
    [x, size].min
  end

  # Maps a position in the full sequence to its shuffled position. Passes nil through.
  def shuffle_index(index)
    index ? (shuffle_head + collection_increment * index) % collection_size : nil
  end

  # Translates an index relative to the remaining elements (negative counts from the end)
  # into a position in the full sequence. Returns nil when the index is out of range.
  def relative_index(index)
    # Negative indices are resolved against the current size, not end_of_sequence, so that
    # they keep working after elements have been shifted off the front.
    index += size if index < 0
    index >= 0 && index < size ? index + start_of_sequence : nil
  end

  # Removes and returns the first remaining element.
  def shift1
    result = _fetch(0)
    @start_of_sequence += 1
    result
  end

  # Handles the range form of #[].
  def slice_range(range)
    index = range.begin || 0
    index += size if index < 0

    last = range.end
    if last.nil?
      last = size - 1
    else
      last += size if last < 0
      last -= 1 if range.exclude_end?
    end

    length = last - index + 1
    length = size - index if index + length > size

    if index < 0 || index > size
      nil
    elsif length < 0
      []
    else
      element_or_array(length) { |i| _fetch(i + index) }
    end
  end

  # Handles the (start, length) form of #[].
  def slice_length(index, length)
    index += size if index < 0
    length = size - index if index + length > size
    return nil if index < 0 || length < 0
    element_or_array(length) { |i| _fetch(i + index) }
  end

  # Fetches the element at an index relative to the remaining elements, or nil if out of range.
  def _fetch(index)
    index = shuffle_index(relative_index(index))
    index ? fetch(index) : nil
  end

  # Returns one sampled element, advancing an internal sample position on every call.
  def sample1(rng)
    if @sample_position.nil? || @sample_position >= size
      @sample_position = rng.call(size)
    else
      # Recompute the step whenever the size has changed since the last sample.
      if @last_sample_size != size
        @last_sample_size = size
        @sample_increment = full_cycle_increment(size)
      end
      @sample_position = (@sample_position + @sample_increment) % size
    end
    _fetch(@sample_position)
  end

  # Picks a prime step size for walking a domain of domain_size positions with modular arithmetic.
  # The starting point is domain_size divided by the golden ratio; a prime equal to domain_size is skipped.
  def full_cycle_increment(domain_size)
    increment = next_prime((2 * domain_size / (1 + Math.sqrt(5))).to_i)
    increment == domain_size ? next_prime(increment + 1) : increment
  end

  # Runs the block once and returns its value when n is nil, otherwise runs it up to n times
  # (bounded by the remaining size) and returns the results as an array.
  # An empty collection yields nothing: nil is returned for the single form and [] for the array form.
  # Raises ArgumentError if n is negative.
  def element_or_array(n = nil)
    unless n.nil?
      n = n.to_i
      raise ArgumentError, 'negative array size' if n < 0
    end

    if empty?
      n.nil? ? nil : []
    elsif n
      (0...remaining_or(n)).map { |x| yield(x) }
    else
      yield
    end
  end

end
|
350
|
+
|
351
|
+
require 'huge_enumerable/huge_collection'
|
352
|
+
require 'huge_enumerable/huge_combination'
|
353
|
+
require 'huge_enumerable/huge_permutation'
|
354
|
+
require 'huge_enumerable/huge_product'
|
355
|
+
|
356
|
+
|
357
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe HugeCollection do

  let(:enumerable) { ('a'..'z').to_a }

  # The hooks under test are made public so the examples can call them directly.
  subject(:collection) do
    [:collection_size, :fetch].each { |hook| HugeCollection.send(:public, hook) }
    HugeCollection.new(enumerable)
  end

  context "#collection_size" do

    it "is equal to the original enumerable size" do
      collection.collection_size.should eql(enumerable.size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as enumerable[]" do
      expected = (0...enumerable.size).map { |position| enumerable[position] }
      actual = (0...collection.collection_size).map { |position| collection.fetch(position) }
      actual.should eql(expected)
    end

  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe HugeCombination do

  let(:enumerable) { ('a'..'z').to_a }

  # The hooks under test are made public so the examples can call them directly.
  subject(:combination) do
    [:collection_size, :fetch].each { |hook| HugeCombination.send(:public, hook) }
    HugeCombination.new(enumerable, 2)
  end

  # Reference combinations from Array#combination, cached per length.
  def enum_combo(x)
    @cache ||= {}
    @cache[x.to_i] ||= enumerable.combination(x).to_a
  end

  context "#collection_size" do

    it "is equal to array#combination.to_a.size" do
      combination.collection_size.should eql(enum_combo(2).size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#combination.to_a[]" do
      expected = (0...enum_combo(2).size).map { |position| enum_combo(2)[position] }
      actual = (0...combination.collection_size).map { |position| combination.fetch(position) }
      actual.should eql(expected)
    end

  end

end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe HugePermutation do

  let(:enumerable) { ('a'..'z').to_a }

  # The hooks under test are made public so the examples can call them directly.
  subject(:permutation) do
    [:collection_size, :fetch].each { |hook| HugePermutation.send(:public, hook) }
    HugePermutation.new(enumerable, 2)
  end

  # Reference permutations from Array#permutation, cached per length.
  def enum_perm(x)
    @cache ||= {}
    @cache[x.to_i] ||= enumerable.permutation(x).to_a
  end

  context "#collection_size" do

    it "is equal to array#permutation.to_a.size" do
      permutation.collection_size.should eql(enum_perm(2).size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#permutation.to_a[]" do
      expected = (0...enum_perm(2).size).map { |position| enum_perm(2)[position] }
      actual = (0...permutation.collection_size).map { |position| permutation.fetch(position) }
      actual.should eql(expected)
    end

  end

end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe HugeProduct do

  let(:enumerable_a) { ('a'..'z').to_a }
  let(:enumerable_b) { ('A'..'Z').to_a }
  let(:enum_prod) { enumerable_a.product(enumerable_b) }

  # The hooks under test are made public so the examples can call them directly.
  subject(:product) do
    [:collection_size, :fetch].each { |hook| HugeProduct.send(:public, hook) }
    HugeProduct.new(enumerable_a, enumerable_b)
  end

  context "#collection_size" do

    it "is equal to array#product(other_ary).size" do
      product.collection_size.should eql(enum_prod.size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#product(other_ary)[]" do
      expected = (0...enum_prod.size).map { |position| enum_prod[position] }
      actual = (0...product.collection_size).map { |position| product.fetch(position) }
      actual.should eql(expected)
    end

  end

end
|