huge_enumerable 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,357 @@
1
+ require "huge_enumerable/version"
2
+
3
+ require 'backports' if RUBY_VERSION < '1.9'
4
+ require 'prime'
5
+ require 'prime_miller_rabin'
6
+
7
+ Prime::MillerRabin.speed_intercept
8
+
9
+ # HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite)
10
+ # data sets without requiring them to be in memory.
11
+ # In addition to enumerable abilities, it also allows for shuffling, sampling, shifting, and popping as if it were
12
+ # an array. These actions also do not require the entire data set to be in memory, nor do they alter the original
13
+ # data set in any fashion.
14
+ #
15
+ # To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch.
16
+ # collection_size should return the size of the full data set.
17
+ # fetch should return the value at the given index.
18
+ # It is guaranteed that fetch will always be called with values in the range of (0...collection_size)
19
+ # It will never be called with a negative index or with an index >= collection_size
20
class HugeEnumerable

  include Enumerable

  # Currently 100,000 elements
  DEFAULT_MAX_ARRAY_SIZE = 100000

  # The maximum number of elements to be returned when to_a is called.
  # If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller.
  attr_writer :max_array_size

  # The random number generator to use for shuffles and samples. Defaults to self#rand.
  # It is invoked as rng.call(upper_bound) and must return an integer in (0...upper_bound).
  attr_accessor :rng

  # Create a new HugeEnumerable
  #
  # ==== Options
  #
  # * +:max_array_size+ - The default size of arrays when #to_a is called.
  # * +:rng+ - The random number generator to use.
  def initialize(max_array_size = nil, rng = nil)
    @max_array_size = max_array_size ? max_array_size.to_i : nil
    @rng = rng || method(:rand)
    @collection_increment = 1
    @start_of_sequence = 0
    # Resolved lazily from collection_size the first time it is needed.
    @end_of_sequence = nil
    @shuffle_head = 0
  end

  # Element Reference — Returns the element at index, or returns a subarray starting at the start index and
  # continuing for length elements, or returns a subarray specified by range of indices.
  # Negative indices count backward from the end of the collection (-1 is the last element).
  # For start and range cases the starting index is just before an element.
  # Additionally, an empty array is returned when the starting index for an element range is at the end of
  # the collection.
  # Returns nil if the index (or starting index) are out of range.
  # Ranges that exclude their end (a...b) leave out the final element, as Array#[] does.
  # ==== Attributes
  #
  # * +index_or_range+ - Either an integer for single element selection or length selection, or a range.
  #
  # ==== Options
  #
  # * +:length+ - The number of elements to return if index_or_range is not a range.
  def [](index_or_range, length = nil)
    # TODO: Consider changing this to return HugeCollection
    if index_or_range.is_a?(Range)
      slice_range(index_or_range)
    elsif length
      slice_length(index_or_range.to_i, length)
    else
      _fetch(index_or_range.to_i)
    end
  end

  # Calls the given block once for each element remaining in the collection, passing that element as a parameter.
  # Returns an Enumerator if no block is given.
  def collection_each # :yields: element
    return to_enum(:collection_each) unless block_given?
    size.times { |i| yield _fetch(i) }
  end

  # When invoked with a block, yields all combinations of length n of elements from the collection and then
  # returns the collection itself.
  # If no block is given, an HugeCombination is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugeCombination. This may change in the future.
  def combination(n) # :yields: element
    combo = HugeCombination.new(self.dup.reset!, n, max_array_size, custom_rng)
    return combo unless block_given?
    combo.each { |x| yield x }
    self
  end

  # Calls the given block once for each element in the next array of the collection, passing that element
  # as a parameter. At most max_array_size elements are yielded.
  # Returns an Enumerator if no block is given.
  def each # :yields: element
    return to_enum(:each) unless block_given?
    remaining_or(max_array_size).times { |i| yield _fetch(i) }
  end

  def max_array_size #:nodoc:
    @max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min
  end

  # Shifts max_array_size elements and returns the following array from to_a.
  def next_array
    shift(max_array_size)
    to_a
  end

  # Returns true if the collection contains no more elements.
  def empty?
    # Go through size (and therefore the lazy end_of_sequence reader) rather than the raw instance
    # variable, which is still nil on a fresh or freshly reset collection.
    size <= 0
  end

  # When invoked with a block, yields all permutations of length n of elements from the collection and then
  # returns the collection itself.
  # If no block is given, a HugePermutation is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugePermutation. This may change in the future.
  def permutation(n) # :yields: element
    perm = HugePermutation.new(self.dup.reset!, n, max_array_size, custom_rng)
    return perm unless block_given?
    perm.each { |x| yield x }
    self
  end

  # Removes the last element from the collection and returns it, or nil if the collection is empty.
  # If a number n is given, returns an array of the last n elements (or less).
  def pop(n = nil)
    result = element_or_array(n) { pop1 }
    # Elements are popped back to front; restore collection order for the array form.
    n ? result.reverse : result
  end

  # When invoked with a block, yields all combinations of elements from the collection and the other enumerable
  # and then returns the collection itself.
  # If no block is given, a HugeProduct is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugeProduct. This may change in the future.
  # other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future.
  def product(other_enumerable) # :yields: element
    other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable)
    prod = HugeProduct.new(self.dup.reset!, other_enumerable, max_array_size, custom_rng)
    return prod unless block_given?
    prod.each { |x| yield x }
    self
  end

  # Choose a random element or n random elements from the collection.
  # The elements are chosen by using random and unique indices into the array in order to ensure
  # that an element does not repeat itself unless the collection already contained duplicate elements.
  # If the collection is empty the first form returns nil and the second form returns an empty array.
  # The optional rng argument will be used as the random number generator; otherwise self#rng is used.
  #
  # Accepted forms: sample, sample(n), sample(rng), sample(n, rng).
  # Note that sample(rng) returns a one element array.
  # Raises ArgumentError when more than two arguments are given.
  def sample(*args)
    case args.size
    when 0
      n = nil
      generator = nil
    when 1
      arg = args.first
      if arg.is_a?(Proc) || arg.is_a?(Method)
        n = 1
        generator = arg
      else
        n = arg
        generator = nil
      end
    when 2
      n, generator = args
    else
      raise ArgumentError, "wrong number of arguments (#{args.size} for 2)"
    end

    # Fall back to the configured generator so that a custom rng is honored by samples as well as shuffles.
    generator ||= rng
    element_or_array(n) { sample1(generator) }
  end

  # Removes the first element of the collection and returns it (shifting all other elements down by one).
  # Returns nil if the collection is empty.
  # If a number n is given, returns an array of the first n elements (or less).
  # With collection containing only the remainder elements, not including what was shifted to returned array.
  def shift(n = nil)
    element_or_array(n) { shift1 }
  end

  # Returns a new HugeEnumerable with the order of the elements of the new collection randomized.
  # ==== Options
  # * +rng+ - The random number generator to use. Defaults to self#rng.
  # ==== Side Effects
  # The new collection is reset to the current collection's original size and elements before shuffling.
  def shuffle(rng = nil)
    self.dup.shuffle!(rng)
  end

  # Randomly reorders the elements of the collection.
  # ==== Options
  # * +rng+ - The random number generator to use. Defaults to self#rng.
  # ==== Side Effects
  # The collection is reset to its original size and elements before shuffling
  def shuffle!(rng = nil)
    rng ||= self.rng
    reset!
    @shuffle_head = rng.call(collection_size)
    @collection_increment = full_cycle_increment(collection_size)
    self
  end

  # Returns the current size of the collection.
  # Unlike collection_size, this tracks size changes caused by pop, shift, and next_array.
  def size
    end_of_sequence - start_of_sequence
  end

  protected

  # Restores the collection to its full, unshifted and unpopped extent. Shuffle state is left untouched.
  def reset!
    @start_of_sequence = 0
    @end_of_sequence = nil
    self
  end

  private

  attr_reader :shuffle_head, :start_of_sequence, :collection_increment

  # Size of the full data set. Must be provided by subclasses.
  def collection_size
    raise NotImplementedError, "not implemented for #{self.class.name}"
  end

  # Exclusive upper bound of the remaining window, initialized from collection_size on first use.
  def end_of_sequence
    @end_of_sequence ||= collection_size
  end

  # Value at absolute index x of the full data set. Must be provided by subclasses.
  def fetch(x)
    raise NotImplementedError, "not implemented for #{self.class.name}"
  end

  def miller_rabin
    @miller_rabin ||= Prime::MillerRabin.new
  end

  # Returns the configured rng unless it is the default self#rand, in which case nil is returned
  # so that derived collections fall back to their own default.
  def custom_rng
    rng != method(:rand) ? rng : nil
  end

  # Returns the smallest prime strictly greater than x.
  def next_prime(x)
    if x < 2
      2
    elsif x < 3
      3
    elsif x < 5
      5
    else
      # Step over even numbers, and from a trailing 3 straight to 7 to skip the multiple of 5 in between.
      x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2))
      x += (x % 10 == 3 ? 4 : 2) until Prime.prime?(x, miller_rabin)
      x
    end
  end

  def pop1
    result = _fetch(size - 1)
    @end_of_sequence -= 1
    result
  end

  def remaining_or(x)
    [x, size].min
  end

  # Maps a position in the (possibly shuffled) ordering to an index into the full data set.
  def shuffle_index(index)
    index ? (shuffle_head + collection_increment * index) % collection_size : nil
  end

  # Translates an index relative to the remaining window into a position in the full ordering.
  # Negative indices count back from the end of the remaining window.
  # Returns nil when the index falls outside the remaining window.
  def relative_index(index)
    index += size if index < 0
    return nil if index < 0 || index >= size
    index + start_of_sequence
  end

  def shift1
    result = _fetch(0)
    @start_of_sequence += 1
    result
  end

  def _fetch(index)
    index = shuffle_index(relative_index(index))
    index ? fetch(index) : nil
  end

  def sample1(rng)
    if @sample_position.nil? || @sample_position >= size
      @sample_position = rng.call(size)
    else
      # The stride only depends on the window size, so recompute it only when the size has changed.
      if @last_sample_size != size
        @last_sample_size = size
        @sample_increment = full_cycle_increment(size)
      end
      @sample_position = (@sample_position + @sample_increment) % size
    end
    _fetch(@sample_position)
  end

  # Picks a prime stride near domain_size divided by the golden ratio, avoiding a stride equal to domain_size.
  def full_cycle_increment(domain_size)
    increment = next_prime((2 * domain_size / (1 + Math.sqrt(5))).to_i)
    increment == domain_size ? next_prime(increment + 1) : increment
  end

  # Range form of #[]. Returns nil when the start is out of bounds, otherwise an array.
  def slice_range(range)
    first = range.begin || 0
    first += size if first < 0
    return nil if first < 0 || first > size

    if range.end.nil?
      # Open ended range: run through the last remaining element.
      last = size - 1
    else
      last = range.end
      last += size if last < 0
      last -= 1 if range.exclude_end?
    end

    count = last - first + 1
    count = size - first if first + count > size
    count = 0 if count < 0
    element_or_array(count) { |i| _fetch(i + first) }
  end

  # Start and length form of #[]. Returns nil when the start is out of bounds or the length is negative.
  def slice_length(index, length)
    index += size if index < 0
    length = size - index if index + length > size
    return nil if index < 0 || length < 0
    element_or_array(length) { |i| _fetch(i + index) }
  end

  # Yields once and returns the block's value when n is nil; otherwise yields up to n times
  # (bounded by the remaining size), passing the iteration number, and returns the results as an array.
  # On an empty collection returns nil for the single form and [] for the array form.
  # Raises ArgumentError if n is negative.
  def element_or_array(n = nil)
    unless n.nil?
      n = n.to_i
      raise ArgumentError, 'negative array size' if n < 0
    end
    return(n ? [] : nil) if empty?
    n ? (0...remaining_or(n)).map { |x| yield(x) } : yield
  end

end
350
+
351
+ require 'huge_enumerable/huge_collection'
352
+ require 'huge_enumerable/huge_combination'
353
+ require 'huge_enumerable/huge_permutation'
354
+ require 'huge_enumerable/huge_product'
355
+
356
+
357
+
@@ -0,0 +1,33 @@
1
+ require 'spec_helper'
2
+
3
# Checks that HugeCollection reports the same size and element order as the enumerable it is built from.
describe HugeCollection do

  let(:enumerable) { ('a'..'z').to_a }

  # collection_size and fetch are switched to public on the class so the examples can call them directly.
  subject(:collection) do
    [:collection_size, :fetch].each { |method_name| HugeCollection.send(:public, method_name) }
    HugeCollection.new(enumerable)
  end

  context "#collection_size" do

    it "is equal to the original enumerable size" do
      expected_size = enumerable.size
      collection.collection_size.should eql(expected_size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as enumerable[]" do
      # Walk both sources index by index and compare the collected results.
      enumerable_fetches = (0...enumerable.size).map { |i| enumerable[i] }
      collection_fetches = (0...collection.collection_size).map { |i| collection.fetch(i) }
      collection_fetches.should eql(enumerable_fetches)
    end

  end

end
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+
3
# Checks that HugeCombination agrees with Array#combination on size and element order.
describe HugeCombination do

  let(:enumerable) { ('a'..'z').to_a }

  # collection_size and fetch are switched to public on the class so the examples can call them directly.
  subject(:combination) do
    [:collection_size, :fetch].each { |method_name| HugeCombination.send(:public, method_name) }
    HugeCombination.new(enumerable, 2)
  end

  # Reference combinations of length x from Array#combination, memoized per length.
  def enum_combo(x)
    @cache ||= {}
    key = x.to_i
    @cache[key] ||= enumerable.combination(x).to_a
  end

  context "#collection_size" do

    it "is equal to array#combination.to_a.size" do
      expected_size = enum_combo(2).size
      combination.collection_size.should eql(expected_size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#combination.to_a[]" do
      # Walk both sources index by index and compare the collected results.
      enum_combo_fetches = (0...enum_combo(2).size).map { |i| enum_combo(2)[i] }
      combination_fetches = (0...combination.collection_size).map { |i| combination.fetch(i) }
      combination_fetches.should eql(enum_combo_fetches)
    end

  end

end
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+
3
# Checks that HugePermutation agrees with Array#permutation on size and element order.
describe HugePermutation do

  let(:enumerable) { ('a'..'z').to_a }

  # collection_size and fetch are switched to public on the class so the examples can call them directly.
  subject(:permutation) do
    [:collection_size, :fetch].each { |method_name| HugePermutation.send(:public, method_name) }
    HugePermutation.new(enumerable, 2)
  end

  # Reference permutations of length x from Array#permutation, memoized per length.
  def enum_perm(x)
    @cache ||= {}
    key = x.to_i
    @cache[key] ||= enumerable.permutation(x).to_a
  end

  context "#collection_size" do

    it "is equal to array#permutation.to_a.size" do
      expected_size = enum_perm(2).size
      permutation.collection_size.should eql(expected_size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#permutation.to_a[]" do
      # Walk both sources index by index and compare the collected results.
      enum_perm_fetches = (0...enum_perm(2).size).map { |i| enum_perm(2)[i] }
      permutation_fetches = (0...permutation.collection_size).map { |i| permutation.fetch(i) }
      permutation_fetches.should eql(enum_perm_fetches)
    end

  end

end
@@ -0,0 +1,35 @@
1
+ require 'spec_helper'
2
+
3
# Checks that HugeProduct agrees with Array#product on size and element order.
describe HugeProduct do

  let(:enumerable_a) { ('a'..'z').to_a }
  let(:enumerable_b) { ('A'..'Z').to_a }
  # Reference result computed with Array#product.
  let(:enum_prod) { enumerable_a.product(enumerable_b) }

  # collection_size and fetch are switched to public on the class so the examples can call them directly.
  subject(:product) do
    [:collection_size, :fetch].each { |method_name| HugeProduct.send(:public, method_name) }
    HugeProduct.new(enumerable_a, enumerable_b)
  end

  context "#collection_size" do

    it "is equal to array#product(other_ary).size" do
      expected_size = enum_prod.size
      product.collection_size.should eql(expected_size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#product(other_ary)[]" do
      # Walk both sources index by index and compare the collected results.
      enum_prod_fetches = (0...enum_prod.size).map { |i| enum_prod[i] }
      product_fetches = (0...product.collection_size).map { |i| product.fetch(i) }
      product_fetches.should eql(enum_prod_fetches)
    end

  end

end