huge_enumerable 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: bdd8ece0dcce67e7e8af6652859584c88b568715
4
+ data.tar.gz: cb41eb56dee65cab7d174e5283dd3d7d06a75e84
5
+ SHA512:
6
+ metadata.gz: 1449cbb6b13a47bc3b1853a8f8cccb0392983e42dc48dc4d596b84e61b71b7e1abb4fb6b6b6454aebe45ffc6a73b0bcb6dded3b5963956a2c50030df226941be
7
+ data.tar.gz: 64bdc6abf362ca79c8d4538792c293e81122a23ae95126e60e359edafa05f6e593ff910d440c85325f44f238f162e893afd4c715031d3e3df5b2ec2a5368537f
data/.gitignore CHANGED
@@ -1,20 +1,21 @@
1
- *.gem
2
- *.rbc
3
- .bundle
4
- .config
5
- Gemfile.lock
6
- InstalledFiles
7
- coverage
8
- InstalledFiles
9
- lib/bundler/man
10
- pkg
11
- rdoc
12
- spec/reports
13
- test/tmp
14
- test/version_tmp
15
- tmp
16
-
17
- # YARD artifacts
18
- .yardoc
19
- _yardoc
20
- doc/
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ Gemfile.lock
6
+ InstalledFiles
7
+ coverage
8
+ InstalledFiles
9
+ lib/bundler/man
10
+ pkg
11
+ rdoc
12
+ spec/reports
13
+ test/tmp
14
+ test/version_tmp
15
+ tmp
16
+
17
+ # YARD artifacts
18
+ .yardoc
19
+ _yardoc
20
+ doc/
21
+ .idea
data/Gemfile CHANGED
@@ -1,3 +1,3 @@
1
- source 'https://rubygems.org'
2
-
3
- gemspec
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -1,22 +1,22 @@
1
- Copyright (c) 2013 Frank Hall
2
-
3
- MIT License
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
12
-
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
15
-
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1
+ Copyright (c) 2013 Frank Hall
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,29 +1,29 @@
1
- # HugeEnumerable
2
-
3
- Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory
4
-
5
- ## Installation
6
-
7
- Add this line to your application's Gemfile:
8
-
9
- gem 'huge_enumerable'
10
-
11
- And then execute:
12
-
13
- $ bundle
14
-
15
- Or install it yourself as:
16
-
17
- $ gem install huge_enumerable
18
-
19
- ## Usage
20
-
21
- See HugeCollection, HugeCombination, HugePermutation, and HugeProduct for ways of utilizing HugeEnumerable.
22
-
23
- ## Contributing
24
-
25
- 1. Fork it
26
- 2. Create your feature branch (`git checkout -b my-new-feature`)
27
- 3. Commit your changes (`git commit -am 'Add some feature'`)
28
- 4. Push to the branch (`git push origin my-new-feature`)
29
- 5. Create new Pull Request
1
+ # HugeEnumerable
2
+
3
+ Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'huge_enumerable'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install huge_enumerable
18
+
19
+ ## Usage
20
+
21
+ See HugeCollection, HugeCombination, HugePermutation, and HugeProduct for ways of utilizing HugeEnumerable.
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile CHANGED
@@ -1,17 +1,17 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
3
- require 'rdoc/task'
4
-
5
- RSpec::Core::RakeTask.new
6
-
7
- task :default => :spec
8
- task :test => :spec
9
-
10
- RDoc::Task.new do |rdoc|
11
- rdoc.rdoc_dir = 'doc'
12
- rdoc.main = 'README.md'
13
- rdoc.rdoc_files.include 'README.md', "lib/**/*\.rb"
14
-
15
- rdoc.options << '--line-numbers'
16
- end
17
-
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+ require 'rdoc/task'
4
+
5
+ RSpec::Core::RakeTask.new
6
+
7
+ task :default => :spec
8
+ task :test => :spec
9
+
10
+ RDoc::Task.new do |rdoc|
11
+ rdoc.rdoc_dir = 'doc'
12
+ rdoc.main = 'README.md'
13
+ rdoc.rdoc_files.include 'README.md', "lib/**/*\.rb"
14
+
15
+ rdoc.options << '--line-numbers'
16
+ end
17
+
@@ -1,28 +1,28 @@
1
- # coding: utf-8
2
- lib = File.expand_path('../lib', __FILE__)
3
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
- require 'huge_enumerable/version'
5
-
6
- Gem::Specification.new do |spec|
7
- spec.name = "huge_enumerable"
8
- spec.version = HugeEnumerable::VERSION
9
- spec.authors = ["Frank Hall"]
10
- spec.email = ["ChapterHouse.Dune@gmail.com"]
11
- spec.description = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
12
- spec.summary = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
13
- spec.homepage = "https://github.com/ChapterHouse/huge_enumerable.git"
14
- spec.license = "MIT"
15
-
16
- spec.files = `git ls-files`.split($/)
17
- spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
- spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
- spec.require_paths = ["lib"]
20
-
21
- spec.add_development_dependency "bundler", "~> 1.3"
22
- spec.add_development_dependency "rake"
23
- spec.add_development_dependency 'rspec', '~> 2.13'
24
- spec.add_development_dependency 'rdoc'
25
- spec.add_runtime_dependency "backports" # Wish this could be conditional. It is only used for ruby 1.8 for as long as I support it.
26
- spec.add_runtime_dependency "prime_miller_rabin", ">= 0.0.2"
27
-
28
- end
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'huge_enumerable/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "huge_enumerable"
8
+ spec.version = HugeEnumerable::VERSION
9
+ spec.authors = ["Frank Hall"]
10
+ spec.email = ["ChapterHouse.Dune@gmail.com"]
11
+ spec.description = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
12
+ spec.summary = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
13
+ spec.homepage = "https://github.com/ChapterHouse/huge_enumerable.git"
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "bundler", "~> 1.3"
22
+ spec.add_development_dependency "rake"
23
+ spec.add_development_dependency 'rspec', '~> 2.13'
24
+ spec.add_development_dependency 'rdoc'
25
+ spec.add_runtime_dependency "backports" # Wish this could be conditional. It is only used for ruby 1.8 for as long as I support it.
26
+ spec.add_runtime_dependency "prime_miller_rabin", ">= 0.0.2"
27
+
28
+ end
@@ -1,357 +1,362 @@
1
- require "huge_enumerable/version"
2
-
3
- require 'backports' if RUBY_VERSION < '1.9'
4
- require 'prime'
5
- require 'prime_miller_rabin'
6
-
7
- Prime::MillerRabin.speed_intercept
8
-
9
- # HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite)
10
- # data sets without requiring them to be in memory.
11
- # In addition to enumerable, abilities it also allows for shuffling, sampling, shifting, and popping as if it were
12
- # an array. These actions also do not require for the entire data set to be in memory. Nor do they alter the original
13
- # data set in any fashion.
14
- #
15
- # To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch.
16
- # collection_size should return the size of the full data set.
17
- # fetch should return the value at the given index.
18
- # It is guaranteed that fetch will always be called with values in the range of (0...collection_size)
19
- # It will never be called with a negative index or with an index >= collection_size
20
- class HugeEnumerable
21
-
22
- include Enumerable
23
-
24
- # Currently 100,000 elements
25
- DEFAULT_MAX_ARRAY_SIZE=100000
26
-
27
- # The maximum number of elements to be returned when to_a is called.
28
- # If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller.
29
- attr_accessor :max_array_size
30
-
31
- # The random number generator to use for shuffles and samples. Defaults to self#rand.
32
- attr_accessor :rng
33
-
34
- # Create a new HugeEnumerable
35
- #
36
- # ==== Options
37
- #
38
- # * +:max_array_size+ - The default size of arrays when #to_a is called.
39
- # * +:rng+ - The random number generator to use.
40
- def initialize(max_array_size = nil, rng = nil)
41
- @max_array_size = max_array_size ? max_array_size.to_i : nil
42
- @rng = rng || self.method(:rand)
43
- @collection_increment = 1
44
- @start_of_sequence = 0
45
- @shuffle_head = 0
46
- end
47
-
48
- # Element Reference — Returns the element at index, or returns a subarray starting at the start index and continuing for length elements, or returns a subarray specified by range of indices.
49
- # Negative indices count backward from the end of the collection (-1 is the last element).
50
- # For start and range cases the starting index is just before an element.
51
- # Additionally, an empty array is returned when the starting index for an element range is at the end of the collection.
52
- # Returns nil if the index (or starting index) are out of range.
53
- # ==== Attributes
54
- #
55
- # * +index_or_range+ - Either an integer for single element selection or length selection, or a range.
56
- #
57
- # ==== Options
58
- #
59
- # * +:length+ - The number of elements to return if index_or_range is not a range.
60
- def [](index_or_range, length=nil)
61
- # TODO: Consider changing this to return HugeCollection
62
- if index_or_range.is_a?(Range)
63
- range = index_or_range
64
- index = nil
65
- else
66
- index = index_or_range.to_i
67
- range = nil
68
- end
69
-
70
- if range
71
- index = range.first
72
- index += size if index < 0
73
-
74
- length = range.last - index + 1
75
- length += size if range.last < 0
76
- length = size - index if index + length > size
77
-
78
- if index < 0 || index > size
79
- nil
80
- elsif length < 0
81
- []
82
- else
83
- element_or_array(length) { |i| _fetch(i + index) }
84
- end
85
- elsif length
86
- index += size if index < 0
87
- length = size - index if index + length > size
88
- if index < 0 || length < 0
89
- nil
90
- else
91
- element_or_array(length) { |i| _fetch(i + index) }
92
- end
93
- else
94
- _fetch(index)
95
- end
96
-
97
- end
98
-
99
- # Calls the given block once for each element remaining in the collection, passing that element as a parameter.
100
- def collection_each(&block) # :yields: element
101
- # TODO: Return an Enumerator if no block is given
102
- size.times { |i| yield _fetch(i) }
103
- end
104
-
105
- # When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself.
106
- # If no block is given, an HugeCombination is returned instead.
107
- # === Caveat
108
- # max_array_size is currently inherited by the generated HugeCombination. This may change in the future.
109
- def combination(n) # :yields: element
110
- random_number_generator = rng != self.method(:rand) ? rng : nil
111
- combo = HugeCombination.new(self.dup.reset!, n, max_array_size, random_number_generator)
112
- if block_given?
113
- combo.each { |x| yield x }
114
- self
115
- else
116
- combo
117
- end
118
- end
119
-
120
- # Calls the given block once for each element in the next array of the collection, passing that element as a parameter.
121
- def each # :yields: element
122
- # TODO: Return an Enumerator if no block is given
123
- remaining_or(max_array_size).times { |i| yield _fetch(i) }
124
- end
125
-
126
- def max_array_size #:nodoc:
127
- @max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min
128
- end
129
-
130
- # Shifts max_array_size elements and returns the following array from to_a.
131
- def next_array
132
- shift(max_array_size)
133
- to_a
134
- end
135
-
136
- # Returns true of the collection contains no more elements.
137
- def empty?
138
- @start_of_sequence == @end_of_sequence
139
- end
140
-
141
- # When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself.
142
- # If no block is given, a HugePermutation is returned instead.
143
- # === Caveat
144
- # max_array_size is currently inherited by the generated HugePermutation. This may change in the future.
145
- def permutation(n) # :yields: element
146
- random_number_generator = rng != self.method(:rand) ? rng : nil
147
- perm = HugePermutation.new(self.dup.reset!, n, max_array_size, random_number_generator)
148
- if block_given?
149
- perm.each { |x| yield x }
150
- self
151
- else
152
- perm
153
- end
154
- end
155
-
156
- # Removes the last element from the collection and returns it, or nil if the collection is empty.
157
- # If a number n is given, returns an array of the last n elements (or less).
158
- def pop(n = nil)
159
- result = element_or_array(n) { pop1 }
160
- n ? result.reverse : result
161
- end
162
-
163
- # When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself.
164
- # If no block is given, a HugeProduct is returned instead.
165
- # === Caveat
166
- # max_array_size is currently inherited by the generated HugeProduct. This may change in the future.
167
- # other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future.
168
- def product(other_enumerable) # :yields: element
169
- other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable)
170
- random_number_generator = rng != self.method(:rand) ? rng : nil
171
- prod = HugeProduct.new(self.dup.reset!, other_enumerable, max_array_size, random_number_generator)
172
- if block_given?
173
- prod.each { |x| yield x }
174
- self
175
- else
176
- prod
177
- end
178
- end
179
-
180
- # Choose a random element or n random elements from the collection.
181
- # The elements are chosen by using random and unique indices into the array in order to ensure
182
- # that an element does not repeat itself unless the collection already contained duplicate elements.
183
- # If the collection is empty the first form returns nil and the second form returns an empty array.
184
- # The optional rng argument will be used as the random number generator.
185
- def sample(*args)
186
- if args.size > 2
187
- raise ArgumentError, "wrong number of arguments (#{args.size} for 2)"
188
- elsif args.size == 2
189
- n = args.first
190
- rng = args.last
191
- elsif args.size == 1
192
- arg = args.first
193
- if arg.is_a?(Proc) || arg.is_a?(Method)
194
- n = 1
195
- rng = arg
196
- else
197
- n = arg
198
- rng = method(:rand)
199
- end
200
- else
201
- n = nil
202
- rng = method(:rand)
203
- end
204
-
205
- element_or_array(n) { sample1(rng) }
206
- end
207
-
208
- # Removes the first element of the collection and returns it (shifting all other elements down by one).
209
- # Returns nil if the collection is empty.
210
- # If a number n is given, returns an array of the first n elements (or less).
211
- # With collection containing only the remainder elements, not including what was shifted to returned array.
212
- # ==== Options
213
- # * +rng+ - The random number generator to use. Defaults to self#rng.
214
- def shift(n = nil)
215
- element_or_array(n) { shift1 }
216
- end
217
-
218
- # Returns a new HugeEnumerable with the order of the elements of the new collection randomized.
219
- # ==== Options
220
- # * +rng+ - The random number generator to use. Defaults to self#rng.
221
- # ==== Side Effects
222
- # The new collection is reset to the current collection's original size and elements before shuffling.
223
- def shuffle(rng=nil)
224
- self.dup.shuffle!(rng)
225
- end
226
-
227
- # Randomly reorders the elements of the collection.
228
- # ==== Options
229
- # * +rng+ - The random number generator to use. Defaults to self#rng.
230
- # ==== Side Effects
231
- # The collection is reset to its original size and elements before shuffling
232
- def shuffle!(rng=nil)
233
- rng ||= self.rng
234
- reset!
235
- @shuffle_head = rng.call(collection_size)
236
- @collection_increment = full_cycle_increment(collection_size)
237
- self
238
- end
239
-
240
- # Returns the current size of the collection.
241
- # Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array.
242
- def size
243
- end_of_sequence - start_of_sequence
244
- end
245
-
246
- protected
247
-
248
- def reset!
249
- @start_of_sequence = 0
250
- @end_of_sequence = nil
251
- self
252
- end
253
-
254
- private
255
-
256
- attr_reader :shuffle_head, :start_of_sequence, :end_of_sequence, :collection_increment
257
-
258
- def collection_size
259
- raise NotImplementedError, "not implemented for #{self.class.name}"
260
- end
261
-
262
- def end_of_sequence
263
- @end_of_sequence ||= collection_size
264
- end
265
-
266
- def fetch(x)
267
- raise NotImplementedError, "not implemented for #{self.class.name}"
268
- end
269
-
270
- def miller_rabin
271
- @miller_rabin ||= Prime::MillerRabin.new
272
- end
273
-
274
- def next_prime(x)
275
- if x < 2
276
- 2
277
- elsif x < 3
278
- 3
279
- elsif x < 5
280
- 5
281
- else
282
- x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2 ))
283
- x += (x % 10 == 3 ? 4 : 2 ) until Prime.prime?(x, miller_rabin)
284
- x
285
- end
286
- end
287
-
288
- def pop1
289
- result = _fetch(end_of_sequence - start_of_sequence - 1)
290
- @end_of_sequence -= 1
291
- result
292
- end
293
-
294
- def remaining_or(x)
295
- [x, size].min
296
- end
297
-
298
- def shuffle_index(index)
299
- index ? (shuffle_head + collection_increment * index) % collection_size : nil
300
- end
301
-
302
- def relative_index(index)
303
- index = end_of_sequence + index if index < 0
304
- index += start_of_sequence
305
- index >= 0 && index < end_of_sequence ? index : nil
306
- end
307
-
308
- def shift1
309
- result = _fetch(0)
310
- @start_of_sequence += 1
311
- result
312
- end
313
-
314
- def _fetch(index)
315
- index = shuffle_index(relative_index(index))
316
- index ? fetch(index) : nil
317
- end
318
-
319
- def sample1(rng)
320
- if @sample_position.nil? || @sample_position >= size
321
- @sample_position = rng.call(size)
322
- else
323
- if @last_sample_size != size
324
- @last_sample_size = size
325
- @sample_increment = full_cycle_increment(size)
326
- end
327
- @sample_position = (@sample_position + @sample_increment) % size
328
- end
329
- _fetch(@sample_position)
330
- end
331
-
332
- def full_cycle_increment(domain_size)
333
- increment = next_prime(( 2 * domain_size / (1 + Math.sqrt(5)) ).to_i)
334
- increment == domain_size ? next_prime(increment + 1) : increment
335
- end
336
-
337
- def element_or_array(n = nil)
338
- unless n.nil?
339
- n = n.to_i
340
- raise ArgumentError, 'negative array size' if n < 0
341
- end
342
- unless empty?
343
- n ? (0...remaining_or(n)).map { |x| yield(x) } : yield
344
- else
345
- n.nil? ? nil : []
346
- end
347
- end
348
-
349
- end
350
-
351
- require 'huge_enumerable/huge_collection'
352
- require 'huge_enumerable/huge_combination'
353
- require 'huge_enumerable/huge_permutation'
354
- require 'huge_enumerable/huge_product'
355
-
356
-
357
-
1
+ require "huge_enumerable/version"
2
+
3
+ require 'backports' if RUBY_VERSION < '1.9'
4
+ require 'prime'
5
+ require 'prime_miller_rabin'
6
+
7
+ Prime::MillerRabin.speed_intercept
8
+
9
+ # HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite)
10
+ # data sets without requiring them to be in memory.
11
+ # In addition to enumerable, abilities it also allows for shuffling, sampling, shifting, and popping as if it were
12
+ # an array. These actions also do not require for the entire data set to be in memory. Nor do they alter the original
13
+ # data set in any fashion.
14
+ #
15
+ # To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch.
16
+ # collection_size should return the size of the full data set.
17
+ # fetch should return the value at the given index.
18
+ # It is guaranteed that fetch will always be called with values in the range of (0...collection_size)
19
+ # It will never be called with a negative index or with an index >= collection_size
20
+ class HugeEnumerable
21
+
22
+ include Enumerable
23
+
24
+ # Currently 100,000 elements
25
+ DEFAULT_MAX_ARRAY_SIZE=100000
26
+
27
+ # The maximum number of elements to be returned when to_a is called.
28
+ # If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller.
29
+ attr_accessor :max_array_size
30
+
31
+ # The random number generator to use for shuffles and samples. Defaults to self#rand.
32
+ attr_accessor :rng
33
+
34
+ # Create a new HugeEnumerable
35
+ #
36
+ # ==== Options
37
+ #
38
+ # * +:max_array_size+ - The default size of arrays when #to_a is called.
39
+ # * +:rng+ - The random number generator to use.
40
+ def initialize(max_array_size = nil, rng = nil)
41
+ @max_array_size = max_array_size ? max_array_size.to_i : nil
42
+ @rng = rng || self.method(:rand)
43
+ @collection_increment = 1
44
+ @start_of_sequence = 0
45
+ @shuffle_head = 0
46
+ end
47
+
48
+ # Element Reference — Returns the element at index, or returns a subarray starting at the start index and continuing for length elements, or returns a subarray specified by range of indices.
49
+ # Negative indices count backward from the end of the collection (-1 is the last element).
50
+ # For start and range cases the starting index is just before an element.
51
+ # Additionally, an empty array is returned when the starting index for an element range is at the end of the collection.
52
+ # Returns nil if the index (or starting index) are out of range.
53
+ # ==== Attributes
54
+ #
55
+ # * +index_or_range+ - Either an integer for single element selection or length selection, or a range.
56
+ #
57
+ # ==== Options
58
+ #
59
+ # * +:length+ - The number of elements to return if index_or_range is not a range.
60
+ def [](index_or_range, length=nil)
61
+ # TODO: Consider changing this to return HugeCollection
62
+ if index_or_range.is_a?(Range)
63
+ range = index_or_range
64
+ index = nil
65
+ else
66
+ index = index_or_range.to_i
67
+ range = nil
68
+ end
69
+
70
+ if range
71
+ index = range.first
72
+ index += size if index < 0
73
+
74
+ length = range.last - index + 1
75
+ length += size if range.last < 0
76
+ length = size - index if index + length > size
77
+
78
+ if index < 0 || index > size
79
+ nil
80
+ elsif length < 0
81
+ []
82
+ else
83
+ element_or_array(length) { |i| _fetch(i + index) }
84
+ end
85
+ elsif length
86
+ index += size if index < 0
87
+ length = size - index if index + length > size
88
+ if index < 0 || length < 0
89
+ nil
90
+ else
91
+ element_or_array(length) { |i| _fetch(i + index) }
92
+ end
93
+ else
94
+ _fetch(index)
95
+ end
96
+
97
+ end
98
+
99
+ # Calls the given block once for each element remaining in the collection, passing that element as a parameter.
100
+ def collection_each(&block) # :yields: element
101
+ # TODO: Return an Enumerator if no block is given
102
+ size.times { |i| yield _fetch(i) }
103
+ end
104
+
105
+ # When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself.
106
+ # If no block is given, an HugeCombination is returned instead.
107
+ # === Caveat
108
+ # max_array_size is currently inherited by the generated HugeCombination. This may change in the future.
109
+ def combination(n) # :yields: element
110
+ random_number_generator = rng != self.method(:rand) ? rng : nil
111
+ combo = HugeCombination.new(self.dup.reset!, n, max_array_size, random_number_generator)
112
+ if block_given?
113
+ combo.each { |x| yield x }
114
+ self
115
+ else
116
+ combo
117
+ end
118
+ end
119
+
120
+ # Calls the given block once for each element in the next array of the collection, passing that element as a parameter.
121
+ def each # :yields: element
122
+ # TODO: Return an Enumerator if no block is given
123
+ remaining_or(max_array_size).times { |i| yield _fetch(i) }
124
+ end
125
+
126
+ def max_array_size #:nodoc:
127
+ @max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min
128
+ end
129
+
130
+ # Shifts max_array_size elements and returns the following array from to_a.
131
+ def next_array
132
+ shift(max_array_size)
133
+ to_a
134
+ end
135
+
136
+ # Returns true of the collection contains no more elements.
137
+ def empty?
138
+ @start_of_sequence == @end_of_sequence
139
+ end
140
+
141
+ # When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself.
142
+ # If no block is given, a HugePermutation is returned instead.
143
+ # === Caveat
144
+ # max_array_size is currently inherited by the generated HugePermutation. This may change in the future.
145
+ def permutation(n) # :yields: element
146
+ random_number_generator = rng != self.method(:rand) ? rng : nil
147
+ perm = HugePermutation.new(self.dup.reset!, n, max_array_size, random_number_generator)
148
+ if block_given?
149
+ perm.each { |x| yield x }
150
+ self
151
+ else
152
+ perm
153
+ end
154
+ end
155
+
156
+ # Removes the last element from the collection and returns it, or nil if the collection is empty.
157
+ # If a number n is given, returns an array of the last n elements (or less).
158
+ def pop(n = nil)
159
+ result = element_or_array(n) { pop1 }
160
+ n ? result.reverse : result
161
+ end
162
+
163
+ # When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself.
164
+ # If no block is given, a HugeProduct is returned instead.
165
+ # === Caveat
166
+ # max_array_size is currently inherited by the generated HugeProduct. This may change in the future.
167
+ # other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future.
168
+ def product(other_enumerable) # :yields: element
169
+ other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable)
170
+ random_number_generator = rng != self.method(:rand) ? rng : nil
171
+ prod = HugeProduct.new(self.dup.reset!, other_enumerable, max_array_size, random_number_generator)
172
+ if block_given?
173
+ prod.each { |x| yield x }
174
+ self
175
+ else
176
+ prod
177
+ end
178
+ end
179
+
180
+ # Choose a random element or n random elements from the collection.
181
+ # The elements are chosen by using random and unique indices into the array in order to ensure
182
+ # that an element does not repeat itself unless the collection already contained duplicate elements.
183
+ # If the collection is empty the first form returns nil and the second form returns an empty array.
184
+ # The optional rng argument will be used as the random number generator.
185
+ def sample(*args)
186
+ if args.size > 2
187
+ raise ArgumentError, "wrong number of arguments (#{args.size} for 2)"
188
+ elsif args.size == 2
189
+ n = args.first
190
+ rng = args.last
191
+ elsif args.size == 1
192
+ arg = args.first
193
+ if arg.is_a?(Proc) || arg.is_a?(Method)
194
+ n = 1
195
+ rng = arg
196
+ else
197
+ n = arg
198
+ rng = method(:rand)
199
+ end
200
+ else
201
+ n = nil
202
+ rng = method(:rand)
203
+ end
204
+
205
+ element_or_array(n) { sample1(rng) }
206
+ end
207
+
208
+ # Removes the first element of the collection and returns it (shifting all other elements down by one).
209
+ # Returns nil if the collection is empty.
210
+ # If a number n is given, returns an array of the first n elements (or less).
211
+ # With collection containing only the remainder elements, not including what was shifted to returned array.
212
+ # ==== Options
213
+ # * +rng+ - The random number generator to use. Defaults to self#rng.
214
+ def shift(n = nil)
215
+ element_or_array(n) { shift1 }
216
+ end
217
+
218
+ # Returns a new HugeEnumerable with the order of the elements of the new collection randomized.
219
+ # ==== Options
220
+ # * +rng+ - The random number generator to use. Defaults to self#rng.
221
+ # ==== Side Effects
222
+ # The new collection is reset to the current collection's original size and elements before shuffling.
223
+ def shuffle(rng=nil)
224
+ self.dup.shuffle!(rng)
225
+ end
226
+
227
+ # Randomly reorders the elements of the collection.
228
+ # ==== Options
229
+ # * +rng+ - The random number generator to use. Defaults to self#rng.
230
+ # ==== Side Effects
231
+ # The collection is reset to its original size and elements before shuffling
232
+ def shuffle!(rng=nil)
233
+ rng ||= self.rng
234
+ reset!
235
+ @shuffle_head = rng.call(collection_size)
236
+ @collection_increment = full_cycle_increment(collection_size)
237
+ self
238
+ end
239
+
240
+ # Returns the current size of the collection.
241
+ # Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array.
242
+ def size
243
+ end_of_sequence - start_of_sequence
244
+ end
245
+
246
+ protected
247
+
248
+ def reset!
249
+ @start_of_sequence = 0
250
+ @end_of_sequence = nil
251
+ self
252
+ end
253
+
254
+ private
255
+
256
+ attr_reader :shuffle_head, :start_of_sequence, :end_of_sequence, :collection_increment
257
+
258
+ def collection_size
259
+ raise NotImplementedError, "not implemented for #{self.class.name}"
260
+ end
261
+
262
+ def end_of_sequence
263
+ @end_of_sequence ||= collection_size
264
+ end
265
+
266
+ def fetch(x)
267
+ raise NotImplementedError, "not implemented for #{self.class.name}"
268
+ end
269
+
270
+ def miller_rabin
271
+ @miller_rabin ||= Prime::MillerRabin.new
272
+ end
273
+
274
+ def next_prime(x)
275
+ if x < 2
276
+ 2
277
+ elsif x < 3
278
+ 3
279
+ elsif x < 5
280
+ 5
281
+ else
282
+ x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2 ))
283
+ x += (x % 10 == 3 ? 4 : 2 ) until Prime.prime?(x, miller_rabin)
284
+ x
285
+ end
286
+ end
287
+
288
+ def pop1
289
+ result = _fetch(end_of_sequence - start_of_sequence - 1)
290
+ @end_of_sequence -= 1
291
+ result
292
+ end
293
+
294
+ def remaining_or(x)
295
+ [x, size].min
296
+ end
297
+
298
+ def shuffle_index(index)
299
+ index ? (shuffle_head + collection_increment * index) % collection_size : nil
300
+ end
301
+
302
+ def relative_index(index)
303
+ index = end_of_sequence + index if index < 0
304
+ index += start_of_sequence
305
+ index >= 0 && index < end_of_sequence ? index : nil
306
+ end
307
+
308
+ def shift1
309
+ result = _fetch(0)
310
+ @start_of_sequence += 1
311
+ result
312
+ end
313
+
314
+ def _fetch(index)
315
+ index = shuffle_index(relative_index(index))
316
+ index ? fetch(index) : nil
317
+ end
318
+
319
+ def sample1(rng)
320
+ if @sample_position.nil? || @sample_position >= size
321
+ @sample_position = rng.call(size)
322
+ else
323
+ if @last_sample_size != size
324
+ @last_sample_size = size
325
+ @sample_increment = full_cycle_increment(size)
326
+ end
327
+ @sample_position = (@sample_position + @sample_increment) % size
328
+ end
329
+ _fetch(@sample_position)
330
+ end
331
+
332
+ def full_cycle_increment(domain_size)
333
+ increment = next_prime(( 2 * domain_size / (1 + Math.sqrt(5)) ).to_i)
334
+ increment == domain_size ? next_prime(increment + 1) : increment
335
+ end
336
+
337
+ def factorial(x)
338
+ x == 0 ? 1 : (1..x).reduce(:*)
339
+ end
340
+
341
+
342
+ def element_or_array(n = nil)
343
+ unless n.nil?
344
+ n = n.to_i
345
+ raise ArgumentError, 'negative array size' if n < 0
346
+ end
347
+ unless empty?
348
+ n ? (0...remaining_or(n)).map { |x| yield(x) } : yield
349
+ else
350
+ n.nil? ? nil : []
351
+ end
352
+ end
353
+
354
+ end
355
+
356
+ require 'huge_enumerable/huge_collection'
357
+ require 'huge_enumerable/huge_combination'
358
+ require 'huge_enumerable/huge_permutation'
359
+ require 'huge_enumerable/huge_product'
360
+
361
+
362
+