huge_enumerable 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,357 @@
1
+ require "huge_enumerable/version"
2
+
3
+ require 'backports' if RUBY_VERSION < '1.9'
4
+ require 'prime'
5
+ require 'prime_miller_rabin'
6
+
7
+ Prime::MillerRabin.speed_intercept
8
+
9
+ # HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite)
10
+ # data sets without requiring them to be in memory.
11
+ # In addition to enumerable abilities, it also allows for shuffling, sampling, shifting, and popping as if it were
12
+ # an array. These actions also do not require the entire data set to be in memory, nor do they alter the original
13
+ # data set in any fashion.
14
+ #
15
+ # To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch.
16
+ # collection_size should return the size of the full data set.
17
+ # fetch should return the value at the given index.
18
+ # It is guaranteed that fetch will always be called with values in the range of (0...collection_size)
19
+ # It will never be called with a negative index or with an index >= collection_size
20
class HugeEnumerable

  include Enumerable

  # Currently 100,000 elements
  DEFAULT_MAX_ARRAY_SIZE = 100000

  # The maximum number of elements to be returned when to_a is called.
  # If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller.
  # (The reader is defined explicitly below so that it can compute that default lazily.)
  attr_writer :max_array_size

  # The random number generator to use for shuffles and samples. Defaults to self#rand.
  attr_accessor :rng

  # Create a new HugeEnumerable
  #
  # ==== Options
  #
  # Both options are optional positional arguments.
  #
  # * +:max_array_size+ - The default size of arrays when #to_a is called.
  # * +:rng+ - The random number generator to use. Must respond to +call(limit)+.
  def initialize(max_array_size = nil, rng = nil)
    @max_array_size = max_array_size ? max_array_size.to_i : nil
    @rng = rng || method(:rand)
    @collection_increment = 1
    @start_of_sequence = 0
    @shuffle_head = 0
  end

  # Element Reference — Returns the element at index, or returns a subarray starting at the start index and
  # continuing for length elements, or returns a subarray specified by range of indices.
  # Negative indices count backward from the end of the collection (-1 is the last element).
  # For start and range cases the starting index is just before an element.
  # Additionally, an empty array is returned when the starting index for an element range is at the end of the collection.
  # Returns nil if the index (or starting index) are out of range.
  # Ranges that exclude their end (a...b) do not include the element at b.
  # ==== Attributes
  #
  # * +index_or_range+ - Either an integer for single element selection or length selection, or a range.
  #
  # ==== Options
  #
  # * +:length+ - The number of elements to return if index_or_range is not a range.
  def [](index_or_range, length = nil)
    # TODO: Consider changing this to return HugeCollection
    if index_or_range.is_a?(Range)
      slice_range(index_or_range)
    elsif length
      slice_length(index_or_range.to_i, length)
    else
      _fetch(index_or_range.to_i)
    end
  end

  # Calls the given block once for each element remaining in the collection, passing that element as a parameter.
  # Returns an Enumerator if no block is given.
  def collection_each(&block) # :yields: element
    return to_enum(:collection_each) unless block_given?
    size.times { |i| yield _fetch(i) }
  end

  # When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself.
  # If no block is given, an HugeCombination is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugeCombination. This may change in the future.
  def combination(n) # :yields: element
    combo = HugeCombination.new(self.dup.reset!, n, max_array_size, explicit_rng)
    return combo unless block_given?
    combo.each { |x| yield x }
    self
  end

  # Calls the given block once for each element in the next array of the collection, passing that element as a parameter.
  # At most max_array_size elements are yielded. Returns an Enumerator if no block is given.
  def each # :yields: element
    return to_enum(:each) unless block_given?
    remaining_or(max_array_size).times { |i| yield _fetch(i) }
  end

  # Reader for max_array_size. When no value has been set, the smaller of collection_size and
  # DEFAULT_MAX_ARRAY_SIZE is computed on first use and remembered.
  def max_array_size
    @max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min
  end

  # Shifts max_array_size elements and returns the following array from to_a.
  def next_array
    shift(max_array_size)
    to_a
  end

  # Returns true if the collection contains no more elements.
  def empty?
    # Go through #size rather than the raw instance variables: @end_of_sequence is only
    # initialised lazily, so comparing the variables directly misreports a zero-length collection.
    size.zero?
  end

  # When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself.
  # If no block is given, a HugePermutation is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugePermutation. This may change in the future.
  def permutation(n) # :yields: element
    perm = HugePermutation.new(self.dup.reset!, n, max_array_size, explicit_rng)
    return perm unless block_given?
    perm.each { |x| yield x }
    self
  end

  # Removes the last element from the collection and returns it, or nil if the collection is empty.
  # If a number n is given, returns an array of the last n elements (or less).
  # Raises ArgumentError if n is negative.
  def pop(n = nil)
    result = element_or_array(n) { pop1 }
    # Elements are popped last-to-first; reverse so the array keeps collection order.
    n ? result.reverse : result
  end

  # When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself.
  # If no block is given, a HugeProduct is returned instead.
  # === Caveat
  # max_array_size is currently inherited by the generated HugeProduct. This may change in the future.
  # other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future.
  def product(other_enumerable) # :yields: element
    other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable)
    prod = HugeProduct.new(self.dup.reset!, other_enumerable, max_array_size, explicit_rng)
    return prod unless block_given?
    prod.each { |x| yield x }
    self
  end

  # Choose a random element or n random elements from the collection.
  # The elements are chosen by using random and unique indices into the array in order to ensure
  # that an element does not repeat itself unless the collection already contained duplicate elements.
  # If the collection is empty the first form returns nil and the second form returns an empty array.
  # The optional rng argument will be used as the random number generator. Defaults to self#rng.
  #
  # Accepted forms: sample, sample(n), sample(rng), sample(n, rng).
  # Note that sample(rng) behaves like sample(1, rng) and therefore returns an array.
  # Raises ArgumentError when more than two arguments are given or n is negative.
  def sample(*args)
    if args.size > 2
      raise ArgumentError, "wrong number of arguments (#{args.size} for 2)"
    end

    count, generator =
      case args.size
      when 2 then args
      when 1
        # A lone callable argument is a generator; anything else is the element count.
        args.first.respond_to?(:call) ? [1, args.first] : [args.first, nil]
      else
        [nil, nil]
      end
    generator ||= rng

    element_or_array(count) { sample1(generator) }
  end

  # Removes the first element of the collection and returns it (shifting all other elements down by one).
  # Returns nil if the collection is empty.
  # If a number n is given, returns an array of the first n elements (or less).
  # With collection containing only the remainder elements, not including what was shifted to returned array.
  # Raises ArgumentError if n is negative.
  def shift(n = nil)
    element_or_array(n) { shift1 }
  end

  # Returns a new HugeEnumerable with the order of the elements of the new collection randomized.
  # ==== Options
  # * +rng+ - The random number generator to use. Defaults to self#rng.
  # ==== Side Effects
  # The new collection is reset to the current collection's original size and elements before shuffling.
  def shuffle(rng = nil)
    self.dup.shuffle!(rng)
  end

  # Randomly reorders the elements of the collection.
  # ==== Options
  # * +rng+ - The random number generator to use. Defaults to self#rng.
  # ==== Side Effects
  # The collection is reset to its original size and elements before shuffling
  def shuffle!(rng = nil)
    rng ||= self.rng
    reset!
    @shuffle_head = rng.call(collection_size)
    @collection_increment = full_cycle_increment(collection_size)
    self
  end

  # Returns the current size of the collection.
  # Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array.
  def size
    end_of_sequence - start_of_sequence
  end

  protected

  # Restores the full window over the underlying data set (undoing pops and shifts). Returns self.
  # The end marker is cleared so that it is re-read from collection_size on next use.
  def reset!
    @start_of_sequence = 0
    @end_of_sequence = nil
    self
  end

  private

  attr_reader :shuffle_head, :start_of_sequence, :collection_increment

  # Template method: subclasses must return the size of the full data set.
  def collection_size
    raise NotImplementedError, "not implemented for #{self.class.name}"
  end

  # Exclusive upper bound of the current window; initialised from collection_size on first use.
  def end_of_sequence
    @end_of_sequence ||= collection_size
  end

  # Template method: subclasses must return the value stored at absolute index x.
  def fetch(x)
    raise NotImplementedError, "not implemented for #{self.class.name}"
  end

  # The generator to hand to derived collections: the configured rng when it differs
  # from the built-in rand, otherwise nil so the derived collection picks its own default.
  def explicit_rng
    rng != method(:rand) ? rng : nil
  end

  # Memoized Prime::MillerRabin instance passed to Prime.prime? by next_prime.
  def miller_rabin
    @miller_rabin ||= Prime::MillerRabin.new
  end

  # Returns the smallest prime strictly greater than x.
  def next_prime(x)
    if x < 2
      2
    elsif x < 3
      3
    elsif x < 5
      5
    else
      # Only odd candidates are tested. A candidate ending in 3 is followed by one ending
      # in 5 (a multiple of 5), so that one is stepped over by advancing 4 instead of 2.
      x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2))
      x += (x % 10 == 3 ? 4 : 2) until Prime.prime?(x, miller_rabin)
      x
    end
  end

  # Removes and returns the last element of the current window.
  def pop1
    result = _fetch(size - 1)
    @end_of_sequence -= 1
    result
  end

  # The smaller of x and the number of elements remaining.
  def remaining_or(x)
    [x, size].min
  end

  # Maps an absolute index to its position in the (possibly shuffled) data set. nil passes through.
  def shuffle_index(index)
    index ? (shuffle_head + collection_increment * index) % collection_size : nil
  end

  # Translates an index relative to the current window into an absolute index.
  # Negative indices count back from the end of the window.
  # Returns nil when the index falls outside the window.
  def relative_index(index)
    index += size if index < 0
    return nil if index < 0 || index >= size
    index + start_of_sequence
  end

  # Removes and returns the first element of the current window.
  def shift1
    result = _fetch(0)
    @start_of_sequence += 1
    result
  end

  # Fetches the element at a window-relative index, or nil when the index is out of range.
  def _fetch(index)
    index = shuffle_index(relative_index(index))
    index ? fetch(index) : nil
  end

  # Returns one sampled element. The first call (or a call after the window shrank past the
  # remembered position) asks rng for a position; later calls advance the remembered position
  # by a step recomputed whenever the size changes.
  def sample1(rng)
    if @sample_position.nil? || @sample_position >= size
      @sample_position = rng.call(size)
    else
      if @last_sample_size != size
        @last_sample_size = size
        @sample_increment = full_cycle_increment(size)
      end
      @sample_position = (@sample_position + @sample_increment) % size
    end
    _fetch(@sample_position)
  end

  # Picks a prime step just above domain_size divided by the golden ratio, avoiding a step
  # equal to domain_size itself. As the name indicates, stepping by this increment modulo
  # domain_size is meant to visit every index before repeating.
  def full_cycle_increment(domain_size)
    increment = next_prime((2 * domain_size / (1 + Math.sqrt(5))).to_i)
    increment == domain_size ? next_prime(increment + 1) : increment
  end

  # Shared driver for the "one element or an array of n" methods.
  # With n nil the block is called once (without arguments) and its value returned, or nil when empty.
  # With n given, the block is called with 0, 1, ... for min(n, size) iterations and the
  # results are returned as an array ([] when empty).
  # Raises ArgumentError if n is negative.
  def element_or_array(n = nil)
    unless n.nil?
      n = n.to_i
      raise ArgumentError, 'negative array size' if n < 0
    end
    return (n.nil? ? nil : []) if empty?
    # The iteration count is fixed up front, so blocks that shrink the collection are safe.
    n ? (0...remaining_or(n)).map { |x| yield(x) } : yield
  end

  # Implements self[range]. Both ends may be negative; an exclusive range drops its end element.
  def slice_range(range)
    first = range.first
    first += size if first < 0

    last = range.last
    last += size if last < 0
    last -= 1 if range.exclude_end?

    count = last - first + 1
    count = size - first if first + count > size

    return nil if first < 0 || first > size
    return [] if count < 0
    element_or_array(count) { |i| _fetch(i + first) }
  end

  # Implements self[index, length].
  def slice_length(index, length)
    index += size if index < 0
    length = size - index if index + length > size
    return nil if index < 0 || length < 0
    element_or_array(length) { |i| _fetch(i + index) }
  end

end
350
+
351
+ require 'huge_enumerable/huge_collection'
352
+ require 'huge_enumerable/huge_combination'
353
+ require 'huge_enumerable/huge_permutation'
354
+ require 'huge_enumerable/huge_product'
355
+
356
+
357
+
@@ -0,0 +1,33 @@
1
+ require 'spec_helper'
2
+
3
# Checks that HugeCollection reports the wrapped enumerable's size and fetches its elements in order.
describe HugeCollection do

  let(:enumerable) { ('a'..'z').to_a }

  subject(:collection) do
    # Make the two methods under test callable from the examples.
    HugeCollection.send(:public, :collection_size, :fetch)
    HugeCollection.new(enumerable)
  end

  context "#collection_size" do

    it "is equal to the original enumerable size" do
      collection.collection_size.should eql(enumerable.size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as enumerable[]" do
      # Build both lists index by index so the comparison is on fetch order.
      expected = Array.new(enumerable.size) { |i| enumerable[i] }
      actual = Array.new(collection.collection_size) { |i| collection.fetch(i) }
      actual.should eql(expected)
    end

  end

end
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+
3
# Checks HugeCombination against Array#combination for size and element order.
describe HugeCombination do

  let(:enumerable) { ('a'..'z').to_a }

  subject(:combination) do
    # Make the two methods under test callable from the examples.
    HugeCombination.send(:public, :collection_size, :fetch)
    HugeCombination.new(enumerable, 2)
  end

  # Reference result from Array#combination, memoized per combination length.
  def enum_combo(x)
    cache = (@cache ||= {})
    key = x.to_i
    cache[key] ||= enumerable.combination(x).to_a
  end

  context "#collection_size" do

    it "is equal to array#combination.to_a.size" do
      combination.collection_size.should eql(enum_combo(2).size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#combination.to_a[]" do
      reference = enum_combo(2)
      expected = Array.new(reference.size) { |i| reference[i] }
      actual = Array.new(combination.collection_size) { |i| combination.fetch(i) }
      actual.should eql(expected)
    end

  end

end
@@ -0,0 +1,38 @@
1
+ require 'spec_helper'
2
+
3
# Checks HugePermutation against Array#permutation for size and element order.
describe HugePermutation do

  let(:enumerable) { ('a'..'z').to_a }

  subject(:permutation) do
    # Make the two methods under test callable from the examples.
    HugePermutation.send(:public, :collection_size, :fetch)
    HugePermutation.new(enumerable, 2)
  end

  # Reference result from Array#permutation, memoized per permutation length.
  def enum_perm(x)
    cache = (@cache ||= {})
    key = x.to_i
    cache[key] ||= enumerable.permutation(x).to_a
  end

  context "#collection_size" do

    it "is equal to array#permutation.to_a.size" do
      permutation.collection_size.should eql(enum_perm(2).size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#permutation.to_a[]" do
      reference = enum_perm(2)
      expected = Array.new(reference.size) { |i| reference[i] }
      actual = Array.new(permutation.collection_size) { |i| permutation.fetch(i) }
      actual.should eql(expected)
    end

  end

end
@@ -0,0 +1,35 @@
1
+ require 'spec_helper'
2
+
3
# Checks HugeProduct against Array#product for size and element order.
describe HugeProduct do

  let(:enumerable_a) { ('a'..'z').to_a }
  let(:enumerable_b) { ('A'..'Z').to_a }
  # Reference result from Array#product.
  let(:enum_prod) { enumerable_a.product(enumerable_b) }

  subject(:product) do
    # Make the two methods under test callable from the examples.
    HugeProduct.send(:public, :collection_size, :fetch)
    HugeProduct.new(enumerable_a, enumerable_b)
  end

  context "#collection_size" do

    it "is equal to array#product(other_ary).size" do
      product.collection_size.should eql(enum_prod.size)
    end

  end

  context "#fetch" do

    it "returns values in the same order as array#product(other_ary)[]" do
      expected = Array.new(enum_prod.size) { |i| enum_prod[i] }
      actual = Array.new(product.collection_size) { |i| product.fetch(i) }
      actual.should eql(expected)
    end

  end

end