huge_enumerable 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +21 -20
- data/Gemfile +3 -3
- data/LICENSE.txt +22 -22
- data/README.md +29 -29
- data/Rakefile +17 -17
- data/huge_enumerable.gemspec +28 -28
- data/lib/huge_enumerable.rb +362 -357
- data/lib/huge_enumerable/huge_collection.rb +83 -83
- data/lib/huge_enumerable/huge_combination.rb +112 -118
- data/lib/huge_enumerable/huge_permutation.rb +80 -67
- data/lib/huge_enumerable/huge_product.rb +77 -77
- data/lib/huge_enumerable/version.rb +4 -4
- data/spec/lib/huge_enumerable/huge_collection_spec.rb +32 -32
- data/spec/lib/huge_enumerable/huge_combination_spec.rb +38 -37
- data/spec/lib/huge_enumerable/huge_permutation_spec.rb +38 -37
- data/spec/lib/huge_enumerable/huge_product_spec.rb +34 -34
- data/spec/lib/huge_enumerable_spec.rb +641 -641
- metadata +97 -118
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: bdd8ece0dcce67e7e8af6652859584c88b568715
|
4
|
+
data.tar.gz: cb41eb56dee65cab7d174e5283dd3d7d06a75e84
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1449cbb6b13a47bc3b1853a8f8cccb0392983e42dc48dc4d596b84e61b71b7e1abb4fb6b6b6454aebe45ffc6a73b0bcb6dded3b5963956a2c50030df226941be
|
7
|
+
data.tar.gz: 64bdc6abf362ca79c8d4538792c293e81122a23ae95126e60e359edafa05f6e593ff910d440c85325f44f238f162e893afd4c715031d3e3df5b2ec2a5368537f
|
data/.gitignore
CHANGED
@@ -1,20 +1,21 @@
|
|
1
|
-
*.gem
|
2
|
-
*.rbc
|
3
|
-
.bundle
|
4
|
-
.config
|
5
|
-
Gemfile.lock
|
6
|
-
InstalledFiles
|
7
|
-
coverage
|
8
|
-
InstalledFiles
|
9
|
-
lib/bundler/man
|
10
|
-
pkg
|
11
|
-
rdoc
|
12
|
-
spec/reports
|
13
|
-
test/tmp
|
14
|
-
test/version_tmp
|
15
|
-
tmp
|
16
|
-
|
17
|
-
# YARD artifacts
|
18
|
-
.yardoc
|
19
|
-
_yardoc
|
20
|
-
doc/
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
Gemfile.lock
|
6
|
+
InstalledFiles
|
7
|
+
coverage
|
8
|
+
InstalledFiles
|
9
|
+
lib/bundler/man
|
10
|
+
pkg
|
11
|
+
rdoc
|
12
|
+
spec/reports
|
13
|
+
test/tmp
|
14
|
+
test/version_tmp
|
15
|
+
tmp
|
16
|
+
|
17
|
+
# YARD artifacts
|
18
|
+
.yardoc
|
19
|
+
_yardoc
|
20
|
+
doc/
|
21
|
+
.idea
|
data/Gemfile
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
source 'https://rubygems.org'
|
2
|
-
|
3
|
-
gemspec
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
gemspec
|
data/LICENSE.txt
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
Copyright (c) 2013 Frank Hall
|
2
|
-
|
3
|
-
MIT License
|
4
|
-
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
-
a copy of this software and associated documentation files (the
|
7
|
-
"Software"), to deal in the Software without restriction, including
|
8
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
-
permit persons to whom the Software is furnished to do so, subject to
|
11
|
-
the following conditions:
|
12
|
-
|
13
|
-
The above copyright notice and this permission notice shall be
|
14
|
-
included in all copies or substantial portions of the Software.
|
15
|
-
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
1
|
+
Copyright (c) 2013 Frank Hall
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
CHANGED
@@ -1,29 +1,29 @@
|
|
1
|
-
# HugeEnumerable
|
2
|
-
|
3
|
-
Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory
|
4
|
-
|
5
|
-
## Installation
|
6
|
-
|
7
|
-
Add this line to your application's Gemfile:
|
8
|
-
|
9
|
-
gem 'huge_enumerable'
|
10
|
-
|
11
|
-
And then execute:
|
12
|
-
|
13
|
-
$ bundle
|
14
|
-
|
15
|
-
Or install it yourself as:
|
16
|
-
|
17
|
-
$ gem install huge_enumerable
|
18
|
-
|
19
|
-
## Usage
|
20
|
-
|
21
|
-
See HugeCollection, HugeCombination, HugePermutation, and HugeProduct for ways of utilizing HugeEnumerable.
|
22
|
-
|
23
|
-
## Contributing
|
24
|
-
|
25
|
-
1. Fork it
|
26
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
-
5. Create new Pull Request
|
1
|
+
# HugeEnumerable
|
2
|
+
|
3
|
+
Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'huge_enumerable'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install huge_enumerable
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
See HugeCollection, HugeCombination, HugePermutation, and HugeProduct for ways of utilizing HugeEnumerable.
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
CHANGED
@@ -1,17 +1,17 @@
|
|
1
|
-
require "bundler/gem_tasks"
|
2
|
-
require "rspec/core/rake_task"
|
3
|
-
require 'rdoc/task'
|
4
|
-
|
5
|
-
RSpec::Core::RakeTask.new
|
6
|
-
|
7
|
-
task :default => :spec
|
8
|
-
task :test => :spec
|
9
|
-
|
10
|
-
RDoc::Task.new do |rdoc|
|
11
|
-
rdoc.rdoc_dir = 'doc'
|
12
|
-
rdoc.main = 'README.md'
|
13
|
-
rdoc.rdoc_files.include 'README.md', "lib/**/*\.rb"
|
14
|
-
|
15
|
-
rdoc.options << '--line-numbers'
|
16
|
-
end
|
17
|
-
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
require 'rdoc/task'
|
4
|
+
|
5
|
+
RSpec::Core::RakeTask.new
|
6
|
+
|
7
|
+
task :default => :spec
|
8
|
+
task :test => :spec
|
9
|
+
|
10
|
+
RDoc::Task.new do |rdoc|
|
11
|
+
rdoc.rdoc_dir = 'doc'
|
12
|
+
rdoc.main = 'README.md'
|
13
|
+
rdoc.rdoc_files.include 'README.md', "lib/**/*\.rb"
|
14
|
+
|
15
|
+
rdoc.options << '--line-numbers'
|
16
|
+
end
|
17
|
+
|
data/huge_enumerable.gemspec
CHANGED
@@ -1,28 +1,28 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
3
|
-
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
-
require 'huge_enumerable/version'
|
5
|
-
|
6
|
-
Gem::Specification.new do |spec|
|
7
|
-
spec.name = "huge_enumerable"
|
8
|
-
spec.version = HugeEnumerable::VERSION
|
9
|
-
spec.authors = ["Frank Hall"]
|
10
|
-
spec.email = ["ChapterHouse.Dune@gmail.com"]
|
11
|
-
spec.description = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
|
12
|
-
spec.summary = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
|
13
|
-
spec.homepage = "https://github.com/ChapterHouse/huge_enumerable.git"
|
14
|
-
spec.license = "MIT"
|
15
|
-
|
16
|
-
spec.files = `git ls-files`.split($/)
|
17
|
-
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
-
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
-
spec.require_paths = ["lib"]
|
20
|
-
|
21
|
-
spec.add_development_dependency "bundler", "~> 1.3"
|
22
|
-
spec.add_development_dependency "rake"
|
23
|
-
spec.add_development_dependency 'rspec', '~> 2.13'
|
24
|
-
spec.add_development_dependency 'rdoc'
|
25
|
-
spec.add_runtime_dependency "backports" # Wish this could be conditional. It is only used for ruby 1.8 for as long as I support it.
|
26
|
-
spec.add_runtime_dependency "prime_miller_rabin", ">= 0.0.2"
|
27
|
-
|
28
|
-
end
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'huge_enumerable/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "huge_enumerable"
|
8
|
+
spec.version = HugeEnumerable::VERSION
|
9
|
+
spec.authors = ["Frank Hall"]
|
10
|
+
spec.email = ["ChapterHouse.Dune@gmail.com"]
|
11
|
+
spec.description = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
|
12
|
+
spec.summary = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
|
13
|
+
spec.homepage = "https://github.com/ChapterHouse/huge_enumerable.git"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_development_dependency "bundler", "~> 1.3"
|
22
|
+
spec.add_development_dependency "rake"
|
23
|
+
spec.add_development_dependency 'rspec', '~> 2.13'
|
24
|
+
spec.add_development_dependency 'rdoc'
|
25
|
+
spec.add_runtime_dependency "backports" # Wish this could be conditional. It is only used for ruby 1.8 for as long as I support it.
|
26
|
+
spec.add_runtime_dependency "prime_miller_rabin", ">= 0.0.2"
|
27
|
+
|
28
|
+
end
|
data/lib/huge_enumerable.rb
CHANGED
@@ -1,357 +1,362 @@
|
|
1
|
-
require "huge_enumerable/version"
|
2
|
-
|
3
|
-
require 'backports' if RUBY_VERSION < '1.9'
|
4
|
-
require 'prime'
|
5
|
-
require 'prime_miller_rabin'
|
6
|
-
|
7
|
-
Prime::MillerRabin.speed_intercept
|
8
|
-
|
9
|
-
# HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite)
|
10
|
-
# data sets without requiring them to be in memory.
|
11
|
-
# In addition to enumerable abilities, it also allows for shuffling, sampling, shifting, and popping as if it were
|
12
|
-
# an array. These actions also do not require the entire data set to be in memory, nor do they alter the original
|
13
|
-
# data set in any fashion.
|
14
|
-
#
|
15
|
-
# To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch.
|
16
|
-
# collection_size should return the size of the full data set.
|
17
|
-
# fetch should return the value at the given index.
|
18
|
-
# It is guaranteed that fetch will always be called with values in the range of (0...collection_size)
|
19
|
-
# It will never be called with a negative index or with an index >= collection_size
|
20
|
-
class HugeEnumerable
|
21
|
-
|
22
|
-
include Enumerable
|
23
|
-
|
24
|
-
# Currently 100,000 elements
|
25
|
-
DEFAULT_MAX_ARRAY_SIZE=100000
|
26
|
-
|
27
|
-
# The maximum number of elements to be returned when to_a is called.
|
28
|
-
# If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller.
|
29
|
-
attr_accessor :max_array_size
|
30
|
-
|
31
|
-
# The random number generator to use for shuffles and samples. Defaults to self#rand.
|
32
|
-
attr_accessor :rng
|
33
|
-
|
34
|
-
# Create a new HugeEnumerable
|
35
|
-
#
|
36
|
-
# ==== Options
|
37
|
-
#
|
38
|
-
# * +:max_array_size+ - The default size of arrays when #to_a is called.
|
39
|
-
# * +:rng+ - The random number generator to use.
|
40
|
-
def initialize(max_array_size = nil, rng = nil)
|
41
|
-
@max_array_size = max_array_size ? max_array_size.to_i : nil
|
42
|
-
@rng = rng || self.method(:rand)
|
43
|
-
@collection_increment = 1
|
44
|
-
@start_of_sequence = 0
|
45
|
-
@shuffle_head = 0
|
46
|
-
end
|
47
|
-
|
48
|
-
# Element Reference — Returns the element at index, or returns a subarray starting at the start index and continuing for length elements, or returns a subarray specified by range of indices.
|
49
|
-
# Negative indices count backward from the end of the collection (-1 is the last element).
|
50
|
-
# For start and range cases the starting index is just before an element.
|
51
|
-
# Additionally, an empty array is returned when the starting index for an element range is at the end of the collection.
|
52
|
-
# Returns nil if the index (or starting index) are out of range.
|
53
|
-
# ==== Attributes
|
54
|
-
#
|
55
|
-
# * +index_or_range+ - Either an integer for single element selection or length selection, or a range.
|
56
|
-
#
|
57
|
-
# ==== Options
|
58
|
-
#
|
59
|
-
# * +:length+ - The number of elements to return if index_or_range is not a range.
|
60
|
-
def [](index_or_range, length=nil)
|
61
|
-
# TODO: Consider changing this to return HugeCollection
|
62
|
-
if index_or_range.is_a?(Range)
|
63
|
-
range = index_or_range
|
64
|
-
index = nil
|
65
|
-
else
|
66
|
-
index = index_or_range.to_i
|
67
|
-
range = nil
|
68
|
-
end
|
69
|
-
|
70
|
-
if range
|
71
|
-
index = range.first
|
72
|
-
index += size if index < 0
|
73
|
-
|
74
|
-
length = range.last - index + 1
|
75
|
-
length += size if range.last < 0
|
76
|
-
length = size - index if index + length > size
|
77
|
-
|
78
|
-
if index < 0 || index > size
|
79
|
-
nil
|
80
|
-
elsif length < 0
|
81
|
-
[]
|
82
|
-
else
|
83
|
-
element_or_array(length) { |i| _fetch(i + index) }
|
84
|
-
end
|
85
|
-
elsif length
|
86
|
-
index += size if index < 0
|
87
|
-
length = size - index if index + length > size
|
88
|
-
if index < 0 || length < 0
|
89
|
-
nil
|
90
|
-
else
|
91
|
-
element_or_array(length) { |i| _fetch(i + index) }
|
92
|
-
end
|
93
|
-
else
|
94
|
-
_fetch(index)
|
95
|
-
end
|
96
|
-
|
97
|
-
end
|
98
|
-
|
99
|
-
# Calls the given block once for each element remaining in the collection, passing that element as a parameter.
|
100
|
-
def collection_each(&block) # :yields: element
|
101
|
-
# TODO: Return an Enumerator if no block is given
|
102
|
-
size.times { |i| yield _fetch(i) }
|
103
|
-
end
|
104
|
-
|
105
|
-
# When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself.
|
106
|
-
# If no block is given, a HugeCombination is returned instead.
|
107
|
-
# === Caveat
|
108
|
-
# max_array_size is currently inherited by the generated HugeCombination. This may change in the future.
|
109
|
-
def combination(n) # :yields: element
|
110
|
-
random_number_generator = rng != self.method(:rand) ? rng : nil
|
111
|
-
combo = HugeCombination.new(self.dup.reset!, n, max_array_size, random_number_generator)
|
112
|
-
if block_given?
|
113
|
-
combo.each { |x| yield x }
|
114
|
-
self
|
115
|
-
else
|
116
|
-
combo
|
117
|
-
end
|
118
|
-
end
|
119
|
-
|
120
|
-
# Calls the given block once for each element in the next array of the collection, passing that element as a parameter.
|
121
|
-
def each # :yields: element
|
122
|
-
# TODO: Return an Enumerator if no block is given
|
123
|
-
remaining_or(max_array_size).times { |i| yield _fetch(i) }
|
124
|
-
end
|
125
|
-
|
126
|
-
def max_array_size #:nodoc:
|
127
|
-
@max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min
|
128
|
-
end
|
129
|
-
|
130
|
-
# Shifts max_array_size elements and returns the following array from to_a.
|
131
|
-
def next_array
|
132
|
-
shift(max_array_size)
|
133
|
-
to_a
|
134
|
-
end
|
135
|
-
|
136
|
-
# Returns true if the collection contains no more elements.
|
137
|
-
def empty?
|
138
|
-
@start_of_sequence == @end_of_sequence
|
139
|
-
end
|
140
|
-
|
141
|
-
# When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself.
|
142
|
-
# If no block is given, a HugePermutation is returned instead.
|
143
|
-
# === Caveat
|
144
|
-
# max_array_size is currently inherited by the generated HugePermutation. This may change in the future.
|
145
|
-
def permutation(n) # :yields: element
|
146
|
-
random_number_generator = rng != self.method(:rand) ? rng : nil
|
147
|
-
perm = HugePermutation.new(self.dup.reset!, n, max_array_size, random_number_generator)
|
148
|
-
if block_given?
|
149
|
-
perm.each { |x| yield x }
|
150
|
-
self
|
151
|
-
else
|
152
|
-
perm
|
153
|
-
end
|
154
|
-
end
|
155
|
-
|
156
|
-
# Removes the last element from the collection and returns it, or nil if the collection is empty.
|
157
|
-
# If a number n is given, returns an array of the last n elements (or less).
|
158
|
-
def pop(n = nil)
|
159
|
-
result = element_or_array(n) { pop1 }
|
160
|
-
n ? result.reverse : result
|
161
|
-
end
|
162
|
-
|
163
|
-
# When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself.
|
164
|
-
# If no block is given, a HugeProduct is returned instead.
|
165
|
-
# === Caveat
|
166
|
-
# max_array_size is currently inherited by the generated HugeProduct. This may change in the future.
|
167
|
-
# other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future.
|
168
|
-
def product(other_enumerable) # :yields: element
|
169
|
-
other_enumerable = other_enumerable.dup.reset! if other_enumerable.is_a?(HugeEnumerable)
|
170
|
-
random_number_generator = rng != self.method(:rand) ? rng : nil
|
171
|
-
prod = HugeProduct.new(self.dup.reset!, other_enumerable, max_array_size, random_number_generator)
|
172
|
-
if block_given?
|
173
|
-
prod.each { |x| yield x }
|
174
|
-
self
|
175
|
-
else
|
176
|
-
prod
|
177
|
-
end
|
178
|
-
end
|
179
|
-
|
180
|
-
# Choose a random element or n random elements from the collection.
|
181
|
-
# The elements are chosen by using random and unique indices into the array in order to ensure
|
182
|
-
# that an element does not repeat itself unless the collection already contained duplicate elements.
|
183
|
-
# If the collection is empty the first form returns nil and the second form returns an empty array.
|
184
|
-
# The optional rng argument will be used as the random number generator.
|
185
|
-
def sample(*args)
|
186
|
-
if args.size > 2
|
187
|
-
raise ArgumentError, "wrong number of arguments (#{args.size} for 2)"
|
188
|
-
elsif args.size == 2
|
189
|
-
n = args.first
|
190
|
-
rng = args.last
|
191
|
-
elsif args.size == 1
|
192
|
-
arg = args.first
|
193
|
-
if arg.is_a?(Proc) || arg.is_a?(Method)
|
194
|
-
n = 1
|
195
|
-
rng = arg
|
196
|
-
else
|
197
|
-
n = arg
|
198
|
-
rng = method(:rand)
|
199
|
-
end
|
200
|
-
else
|
201
|
-
n = nil
|
202
|
-
rng = method(:rand)
|
203
|
-
end
|
204
|
-
|
205
|
-
element_or_array(n) { sample1(rng) }
|
206
|
-
end
|
207
|
-
|
208
|
-
# Removes the first element of the collection and returns it (shifting all other elements down by one).
|
209
|
-
# Returns nil if the collection is empty.
|
210
|
-
# If a number n is given, returns an array of the first n elements (or less).
|
211
|
-
# With collection containing only the remainder elements, not including what was shifted to returned array.
|
212
|
-
# ==== Options
|
213
|
-
# * +rng+ - The random number generator to use. Defaults to self#rng.
|
214
|
-
def shift(n = nil)
|
215
|
-
element_or_array(n) { shift1 }
|
216
|
-
end
|
217
|
-
|
218
|
-
# Returns a new HugeEnumerable with the order of the elements of the new collection randomized.
|
219
|
-
# ==== Options
|
220
|
-
# * +rng+ - The random number generator to use. Defaults to self#rng.
|
221
|
-
# ==== Side Effects
|
222
|
-
# The new collection is reset to the current collection's original size and elements before shuffling.
|
223
|
-
def shuffle(rng=nil)
|
224
|
-
self.dup.shuffle!(rng)
|
225
|
-
end
|
226
|
-
|
227
|
-
# Randomly reorders the elements of the collection.
|
228
|
-
# ==== Options
|
229
|
-
# * +rng+ - The random number generator to use. Defaults to self#rng.
|
230
|
-
# ==== Side Effects
|
231
|
-
# The collection is reset to its original size and elements before shuffling
|
232
|
-
def shuffle!(rng=nil)
|
233
|
-
rng ||= self.rng
|
234
|
-
reset!
|
235
|
-
@shuffle_head = rng.call(collection_size)
|
236
|
-
@collection_increment = full_cycle_increment(collection_size)
|
237
|
-
self
|
238
|
-
end
|
239
|
-
|
240
|
-
# Returns the current size of the collection.
|
241
|
-
# Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array.
|
242
|
-
def size
|
243
|
-
end_of_sequence - start_of_sequence
|
244
|
-
end
|
245
|
-
|
246
|
-
protected
|
247
|
-
|
248
|
-
def reset!
|
249
|
-
@start_of_sequence = 0
|
250
|
-
@end_of_sequence = nil
|
251
|
-
self
|
252
|
-
end
|
253
|
-
|
254
|
-
private
|
255
|
-
|
256
|
-
attr_reader :shuffle_head, :start_of_sequence, :end_of_sequence, :collection_increment
|
257
|
-
|
258
|
-
def collection_size
|
259
|
-
raise NotImplementedError, "not implemented for #{self.class.name}"
|
260
|
-
end
|
261
|
-
|
262
|
-
def end_of_sequence
|
263
|
-
@end_of_sequence ||= collection_size
|
264
|
-
end
|
265
|
-
|
266
|
-
def fetch(x)
|
267
|
-
raise NotImplementedError, "not implemented for #{self.class.name}"
|
268
|
-
end
|
269
|
-
|
270
|
-
def miller_rabin
|
271
|
-
@miller_rabin ||= Prime::MillerRabin.new
|
272
|
-
end
|
273
|
-
|
274
|
-
def next_prime(x)
|
275
|
-
if x < 2
|
276
|
-
2
|
277
|
-
elsif x < 3
|
278
|
-
3
|
279
|
-
elsif x < 5
|
280
|
-
5
|
281
|
-
else
|
282
|
-
x += (x.even? ? 1 : (x % 10 == 3 ? 4 : 2 ))
|
283
|
-
x += (x % 10 == 3 ? 4 : 2 ) until Prime.prime?(x, miller_rabin)
|
284
|
-
x
|
285
|
-
end
|
286
|
-
end
|
287
|
-
|
288
|
-
def pop1
|
289
|
-
result = _fetch(end_of_sequence - start_of_sequence - 1)
|
290
|
-
@end_of_sequence -= 1
|
291
|
-
result
|
292
|
-
end
|
293
|
-
|
294
|
-
def remaining_or(x)
|
295
|
-
[x, size].min
|
296
|
-
end
|
297
|
-
|
298
|
-
def shuffle_index(index)
|
299
|
-
index ? (shuffle_head + collection_increment * index) % collection_size : nil
|
300
|
-
end
|
301
|
-
|
302
|
-
def relative_index(index)
|
303
|
-
index = end_of_sequence + index if index < 0
|
304
|
-
index += start_of_sequence
|
305
|
-
index >= 0 && index < end_of_sequence ? index : nil
|
306
|
-
end
|
307
|
-
|
308
|
-
def shift1
|
309
|
-
result = _fetch(0)
|
310
|
-
@start_of_sequence += 1
|
311
|
-
result
|
312
|
-
end
|
313
|
-
|
314
|
-
def _fetch(index)
|
315
|
-
index = shuffle_index(relative_index(index))
|
316
|
-
index ? fetch(index) : nil
|
317
|
-
end
|
318
|
-
|
319
|
-
def sample1(rng)
|
320
|
-
if @sample_position.nil? || @sample_position >= size
|
321
|
-
@sample_position = rng.call(size)
|
322
|
-
else
|
323
|
-
if @last_sample_size != size
|
324
|
-
@last_sample_size = size
|
325
|
-
@sample_increment = full_cycle_increment(size)
|
326
|
-
end
|
327
|
-
@sample_position = (@sample_position + @sample_increment) % size
|
328
|
-
end
|
329
|
-
_fetch(@sample_position)
|
330
|
-
end
|
331
|
-
|
332
|
-
def full_cycle_increment(domain_size)
|
333
|
-
increment = next_prime(( 2 * domain_size / (1 + Math.sqrt(5)) ).to_i)
|
334
|
-
increment == domain_size ? next_prime(increment + 1) : increment
|
335
|
-
end
|
336
|
-
|
337
|
-
def
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
end
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
1
|
+
require "huge_enumerable/version"
|
2
|
+
|
3
|
+
require 'backports' if RUBY_VERSION < '1.9'
|
4
|
+
require 'prime'
|
5
|
+
require 'prime_miller_rabin'
|
6
|
+
|
7
|
+
Prime::MillerRabin.speed_intercept
|
8
|
+
|
9
|
+
# HugeEnumerable is a base class that allows for enumerations over very large (potentially infinite)
|
10
|
+
# data sets without requiring them to be in memory.
|
11
|
+
# In addition to enumerable abilities, it also allows for shuffling, sampling, shifting, and popping as if it were
|
12
|
+
# an array. These actions also do not require the entire data set to be in memory, nor do they alter the original
|
13
|
+
# data set in any fashion.
|
14
|
+
#
|
15
|
+
# To use HugeEnumerable, inherit it via a subclass and provide the methods collection_size and fetch.
|
16
|
+
# collection_size should return the size of the full data set.
|
17
|
+
# fetch should return the value at the given index.
|
18
|
+
# It is guaranteed that fetch will always be called with values in the range of (0...collection_size)
|
19
|
+
# It will never be called with a negative index or with an index >= collection_size
|
20
|
+
class HugeEnumerable
|
21
|
+
|
22
|
+
include Enumerable
|
23
|
+
|
24
|
+
# Currently 100,000 elements
|
25
|
+
DEFAULT_MAX_ARRAY_SIZE=100000
|
26
|
+
|
27
|
+
# The maximum number of elements to be returned when to_a is called.
|
28
|
+
# If this is not set it will default to the collection_size or DEFAULT_MAX_ARRAY_SIZE depending on which is smaller.
|
29
|
+
attr_accessor :max_array_size
|
30
|
+
|
31
|
+
# The random number generator to use for shuffles and samples. Defaults to self#rand.
|
32
|
+
attr_accessor :rng
|
33
|
+
|
34
|
+
# Create a new HugeEnumerable
|
35
|
+
#
|
36
|
+
# ==== Options
|
37
|
+
#
|
38
|
+
# * +:max_array_size+ - The default size of arrays when #to_a is called.
|
39
|
+
# * +:rng+ - The random number generator to use.
|
40
|
+
def initialize(max_array_size = nil, rng = nil)
|
41
|
+
@max_array_size = max_array_size ? max_array_size.to_i : nil
|
42
|
+
@rng = rng || self.method(:rand)
|
43
|
+
@collection_increment = 1
|
44
|
+
@start_of_sequence = 0
|
45
|
+
@shuffle_head = 0
|
46
|
+
end
|
47
|
+
|
48
|
+
# Element Reference — Returns the element at index, or returns a subarray starting at the start index and continuing for length elements, or returns a subarray specified by range of indices.
|
49
|
+
# Negative indices count backward from the end of the collection (-1 is the last element).
|
50
|
+
# For start and range cases the starting index is just before an element.
|
51
|
+
# Additionally, an empty array is returned when the starting index for an element range is at the end of the collection.
|
52
|
+
# Returns nil if the index (or starting index) are out of range.
|
53
|
+
# ==== Attributes
|
54
|
+
#
|
55
|
+
# * +index_or_range+ - Either an integer for single element selection or length selection, or a range.
|
56
|
+
#
|
57
|
+
# ==== Options
|
58
|
+
#
|
59
|
+
# * +:length+ - The number of elements to return if index_or_range is not a range.
|
60
|
+
def [](index_or_range, length=nil)
|
61
|
+
# TODO: Consider changing this to return HugeCollection
|
62
|
+
if index_or_range.is_a?(Range)
|
63
|
+
range = index_or_range
|
64
|
+
index = nil
|
65
|
+
else
|
66
|
+
index = index_or_range.to_i
|
67
|
+
range = nil
|
68
|
+
end
|
69
|
+
|
70
|
+
if range
|
71
|
+
index = range.first
|
72
|
+
index += size if index < 0
|
73
|
+
|
74
|
+
length = range.last - index + 1
|
75
|
+
length += size if range.last < 0
|
76
|
+
length = size - index if index + length > size
|
77
|
+
|
78
|
+
if index < 0 || index > size
|
79
|
+
nil
|
80
|
+
elsif length < 0
|
81
|
+
[]
|
82
|
+
else
|
83
|
+
element_or_array(length) { |i| _fetch(i + index) }
|
84
|
+
end
|
85
|
+
elsif length
|
86
|
+
index += size if index < 0
|
87
|
+
length = size - index if index + length > size
|
88
|
+
if index < 0 || length < 0
|
89
|
+
nil
|
90
|
+
else
|
91
|
+
element_or_array(length) { |i| _fetch(i + index) }
|
92
|
+
end
|
93
|
+
else
|
94
|
+
_fetch(index)
|
95
|
+
end
|
96
|
+
|
97
|
+
end
|
98
|
+
|
99
|
+
# Calls the given block once for each element remaining in the collection, passing that element as a parameter.
|
100
|
+
def collection_each(&block) # :yields: element
|
101
|
+
# TODO: Return an Enumerator if no block is given
|
102
|
+
size.times { |i| yield _fetch(i) }
|
103
|
+
end
|
104
|
+
|
105
|
+
# When invoked with a block, yields all combinations of length n of elements from the collection and then returns the collection itself.
|
106
|
+
# If no block is given, a HugeCombination is returned instead.
|
107
|
+
# === Caveat
|
108
|
+
# max_array_size is currently inherited by the generated HugeCombination. This may change in the future.
|
109
|
+
def combination(n) # :yields: element
|
110
|
+
random_number_generator = rng != self.method(:rand) ? rng : nil
|
111
|
+
combo = HugeCombination.new(self.dup.reset!, n, max_array_size, random_number_generator)
|
112
|
+
if block_given?
|
113
|
+
combo.each { |x| yield x }
|
114
|
+
self
|
115
|
+
else
|
116
|
+
combo
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
# Calls the given block once for each element in the next array of the collection, passing that element as a parameter.
|
121
|
+
def each # :yields: element
|
122
|
+
# TODO: Return an Enumerator if no block is given
|
123
|
+
remaining_or(max_array_size).times { |i| yield _fetch(i) }
|
124
|
+
end
|
125
|
+
|
126
|
+
def max_array_size #:nodoc:
|
127
|
+
@max_array_size ||= [collection_size, DEFAULT_MAX_ARRAY_SIZE].min
|
128
|
+
end
|
129
|
+
|
130
|
+
# Shifts max_array_size elements and returns the following array from to_a.
|
131
|
+
def next_array
|
132
|
+
shift(max_array_size)
|
133
|
+
to_a
|
134
|
+
end
|
135
|
+
|
136
|
+
# Returns true if the collection contains no more elements.
|
137
|
+
def empty?
|
138
|
+
@start_of_sequence == @end_of_sequence
|
139
|
+
end
|
140
|
+
|
141
|
+
# When invoked with a block, yields all permutations of length n of elements from the collection and then returns the collection itself.
|
142
|
+
# If no block is given, a HugePermutation is returned instead.
|
143
|
+
# === Caveat
|
144
|
+
# max_array_size is currently inherited by the generated HugePermutation. This may change in the future.
|
145
|
+
def permutation(n) # :yields: element
|
146
|
+
random_number_generator = rng != self.method(:rand) ? rng : nil
|
147
|
+
perm = HugePermutation.new(self.dup.reset!, n, max_array_size, random_number_generator)
|
148
|
+
if block_given?
|
149
|
+
perm.each { |x| yield x }
|
150
|
+
self
|
151
|
+
else
|
152
|
+
perm
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
# Removes the last element from the collection and returns it, or nil if the collection is empty.
|
157
|
+
# If a number n is given, returns an array of the last n elements (or less).
|
158
|
+
def pop(n = nil)
|
159
|
+
result = element_or_array(n) { pop1 }
|
160
|
+
n ? result.reverse : result
|
161
|
+
end
|
162
|
+
|
163
|
+
# When invoked with a block, yields all combinations of elements from the collection and the other enumerable and then returns the collection itself.
|
164
|
+
# If no block is given, a HugeProduct is returned instead.
|
165
|
+
# === Caveat
|
166
|
+
# max_array_size is currently inherited by the generated HugeProduct. This may change in the future.
|
167
|
+
# other_enumerable is duped and reset if it is a HugeEnumerable. This may change in the future.
|
168
|
+
def product(other_enumerable) # :yields: element
  # A HugeEnumerable argument is replaced by a rewound copy (dup + reset!).
  if other_enumerable.is_a?(HugeEnumerable)
    other_enumerable = other_enumerable.dup.reset!
  end

  # Pass a generator along only when rng differs from this object's own
  # rand method; otherwise nil is handed to HugeProduct.
  custom_rng = (rng if rng != self.method(:rand))

  huge_product = HugeProduct.new(self.dup.reset!, other_enumerable, max_array_size, custom_rng)
  return huge_product unless block_given?

  huge_product.each { |element| yield element }
  self
end
|
179
|
+
|
180
|
+
# Choose a random element or n random elements from the collection.
|
181
|
+
# The elements are chosen by using random and unique indices into the array in order to ensure
|
182
|
+
# that an element does not repeat itself unless the collection already contained duplicate elements.
|
183
|
+
# If the collection is empty the first form returns nil and the second form returns an empty array.
|
184
|
+
# The optional rng argument will be used as the random number generator.
|
185
|
+
def sample(*args)
  # Accepted call shapes: sample, sample(n), sample(rng), sample(n, rng).
  # A lone Proc or Method argument is taken to be the generator.
  case args.size
  when 0
    count = nil
    generator = method(:rand)
  when 1
    only = args.first
    if only.is_a?(Proc) || only.is_a?(Method)
      # NOTE(review): a count of 1 makes this form return a one-element array
      # rather than a bare element -- confirm that this is intended.
      count = 1
      generator = only
    else
      count = only
      generator = method(:rand)
    end
  when 2
    count, generator = args
  else
    raise ArgumentError, "wrong number of arguments (#{args.size} for 2)"
  end

  element_or_array(count) { sample1(generator) }
end
|
207
|
+
|
208
|
+
# Removes the first element of the collection and returns it (shifting all other elements down by one).
|
209
|
+
# Returns nil if the collection is empty.
|
210
|
+
# If a number n is given, returns an array of the first n elements (or less).
|
211
|
+
# Afterwards the collection contains only the remaining elements; the shifted elements are no longer part of it.
|
212
|
+
# ==== Options
|
213
|
+
# * +n+ - The number of elements to shift. When omitted, a single element is returned instead of an array.
|
214
|
+
def shift(n = nil)
  # element_or_array decides between a single element (n omitted) and an
  # array; each yielded step removes one element from the front via shift1.
  element_or_array(n) do
    shift1
  end
end
|
217
|
+
|
218
|
+
# Returns a new HugeEnumerable with the order of the elements of the new collection randomized.
|
219
|
+
# ==== Options
|
220
|
+
# * +rng+ - The random number generator to use. Defaults to self#rng.
|
221
|
+
# ==== Side Effects
|
222
|
+
# The new collection is reset to the current collection's original size and elements before shuffling.
|
223
|
+
def shuffle(rng=nil)
  # Shuffle a copy so the receiver itself is not reordered.
  copy = dup
  copy.shuffle!(rng)
end
|
226
|
+
|
227
|
+
# Randomly reorders the elements of the collection.
|
228
|
+
# ==== Options
|
229
|
+
# * +rng+ - The random number generator to use. Defaults to self#rng.
|
230
|
+
# ==== Side Effects
|
231
|
+
# The collection is reset to its original size and elements before shuffling
|
232
|
+
def shuffle!(rng=nil)
  # Fall back to the collection's own generator when none is supplied.
  generator = rng || self.rng
  reset!
  # A new ordering is described by two numbers: a starting offset drawn from
  # the generator and a step produced by full_cycle_increment.
  @shuffle_head = generator.call(collection_size)
  @collection_increment = full_cycle_increment(collection_size)
  self
end
|
239
|
+
|
240
|
+
# Returns the current size of the collection.
|
241
|
+
# Unlike collection_size, this tracks size changes caused by push, pop, shift, and next_array.
|
242
|
+
def size
  # The live window runs from start_of_sequence up to (not including)
  # end_of_sequence; its width is the current size.
  upper_bound = end_of_sequence
  lower_bound = start_of_sequence
  upper_bound - lower_bound
end
|
245
|
+
|
246
|
+
protected
|
247
|
+
|
248
|
+
def reset!
  # Rewind the start cursor and clear the cached end so that end_of_sequence
  # is recomputed the next time it is read. Returns self for chaining.
  @start_of_sequence, @end_of_sequence = 0, nil
  self
end
|
253
|
+
|
254
|
+
private
|
255
|
+
|
256
|
+
# Private readers for the internal cursor and shuffle state. The generated
# end_of_sequence reader is replaced by the explicit, lazily initialising
# end_of_sequence method defined further down.
attr_reader :shuffle_head, :start_of_sequence, :end_of_sequence, :collection_increment
|
257
|
+
|
258
|
+
def collection_size
  # This version always raises; presumably concrete collections provide
  # their own implementation -- confirm against the including classes.
  message = "not implemented for #{self.class.name}"
  raise NotImplementedError, message
end
|
261
|
+
|
262
|
+
def end_of_sequence
  # Lazily initialised: the first read (or the first read after reset!
  # cleared it) takes the value from collection_size.
  return @end_of_sequence if @end_of_sequence

  @end_of_sequence = collection_size
end
|
265
|
+
|
266
|
+
def fetch(x)
  # This version always raises regardless of x; presumably concrete
  # collections provide their own implementation -- confirm against the
  # including classes.
  message = "not implemented for #{self.class.name}"
  raise NotImplementedError, message
end
|
269
|
+
|
270
|
+
def miller_rabin
  # Build the Prime::MillerRabin instance once and reuse it afterwards.
  return @miller_rabin if @miller_rabin

  @miller_rabin = Prime::MillerRabin.new
end
|
273
|
+
|
274
|
+
def next_prime(x)
  # Inputs below 5 are answered directly.
  return 2 if x < 2
  return 3 if x < 3
  return 5 if x < 5

  # Step between odd candidates: from a value ending in 3 jump by 4 (which
  # avoids landing on a value ending in 5), otherwise by 2.
  stride = ->(value) { value % 10 == 3 ? 4 : 2 }

  # Move off x itself: an even x advances to the next odd number, an odd x
  # takes one stride.
  candidate = x + (x.even? ? 1 : stride.call(x))
  candidate += stride.call(candidate) until Prime.prime?(candidate, miller_rabin)
  candidate
end
|
287
|
+
|
288
|
+
def pop1
  # Fetch the element at the last position of the current window, then pull
  # the end cursor in by one.
  last_position = end_of_sequence - start_of_sequence - 1
  _fetch(last_position).tap { @end_of_sequence -= 1 }
end
|
293
|
+
|
294
|
+
def remaining_or(x)
  # Clamp a requested count to the number of elements still available.
  candidates = [x, size]
  candidates.min
end
|
297
|
+
|
298
|
+
def shuffle_index(index)
  # nil (no valid position) passes straight through.
  return nil unless index

  # Step from the shuffle head by index increments, wrapping around at
  # collection_size.
  stepped = shuffle_head + collection_increment * index
  stepped % collection_size
end
|
301
|
+
|
302
|
+
def relative_index(index)
  # Translates an index relative to the current window (0 is the first
  # remaining element, -1 the last) into a position between
  # start_of_sequence and end_of_sequence. Returns nil when the index falls
  # outside the window.
  first = start_of_sequence
  remaining = end_of_sequence - first

  # Negative indices count back from the end of the window. Previously they
  # were offset from end_of_sequence and then had start_of_sequence added on
  # top, so after any shift they resolved to the wrong position.
  index += remaining if index < 0

  # Bounds are checked against the window itself, so positions that were
  # already shifted off cannot be reached.
  return nil if index < 0 || index >= remaining

  first + index
end
|
307
|
+
|
308
|
+
def shift1
  # Fetch the element at the front of the current window, then advance the
  # start cursor past it.
  _fetch(0).tap { @start_of_sequence += 1 }
end
|
313
|
+
|
314
|
+
def _fetch(index)
  # Map the window-relative index to a shuffled position; a nil position
  # (index outside the window) yields nil instead of calling fetch.
  position = shuffle_index(relative_index(index))
  return nil unless position

  fetch(position)
end
|
318
|
+
|
319
|
+
def sample1(rng)
  # Start from a fresh random position when there is no previous position or
  # the previous one no longer fits inside the current size.
  needs_restart = @sample_position.nil? || @sample_position >= size

  if needs_restart
    @sample_position = rng.call(size)
  else
    # The step is recomputed only when the size differs from the one it was
    # last computed for.
    unless @last_sample_size == size
      @last_sample_size = size
      @sample_increment = full_cycle_increment(size)
    end
    @sample_position = (@sample_position + @sample_increment) % size
  end

  _fetch(@sample_position)
end
|
331
|
+
|
332
|
+
def full_cycle_increment(domain_size)
  # 2 / (1 + sqrt(5)) is the reciprocal of the golden ratio (about 0.618), so
  # the search for a prime starts at roughly 61.8% of the domain size.
  golden_cut = (2 * domain_size / (1 + Math.sqrt(5))).to_i
  candidate = next_prime(golden_cut)
  return candidate unless candidate == domain_size

  # A step equal to the domain size is replaced by the next prime after it.
  next_prime(candidate + 1)
end
|
336
|
+
|
337
|
+
def factorial(x)
  # 0! is defined as 1; otherwise multiply 1 through x together.
  return 1 if x == 0

  (1..x).inject { |product, factor| product * factor }
end
|
340
|
+
|
341
|
+
|
342
|
+
def element_or_array(n = nil)
  # Shared driver for pop / shift / sample. With n omitted the block is
  # yielded once and its value returned (nil when the collection is empty).
  # With n given, the block is yielded up to n times (limited by
  # remaining_or) and the values are returned as an array.
  # Raises ArgumentError when n converts to a negative integer.
  wants_array = !n.nil?
  if wants_array
    n = n.to_i
    raise ArgumentError, 'negative array size' if n < 0
  end

  return (wants_array ? [] : nil) if empty?
  return yield unless n

  (0...remaining_or(n)).map { |x| yield(x) }
end
|
353
|
+
|
354
|
+
end
|
355
|
+
|
356
|
+
require 'huge_enumerable/huge_collection'
|
357
|
+
require 'huge_enumerable/huge_combination'
|
358
|
+
require 'huge_enumerable/huge_permutation'
|
359
|
+
require 'huge_enumerable/huge_product'
|
360
|
+
|
361
|
+
|
362
|
+
|