huge_enumerable 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +20 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +17 -0
- data/huge_enumerable.gemspec +28 -0
- data/lib/huge_enumerable/huge_collection.rb +83 -0
- data/lib/huge_enumerable/huge_combination.rb +118 -0
- data/lib/huge_enumerable/huge_permutation.rb +67 -0
- data/lib/huge_enumerable/huge_product.rb +77 -0
- data/lib/huge_enumerable/version.rb +4 -0
- data/lib/huge_enumerable.rb +357 -0
- data/spec/lib/huge_enumerable/huge_collection_spec.rb +33 -0
- data/spec/lib/huge_enumerable/huge_combination_spec.rb +38 -0
- data/spec/lib/huge_enumerable/huge_permutation_spec.rb +38 -0
- data/spec/lib/huge_enumerable/huge_product_spec.rb +35 -0
- data/spec/lib/huge_enumerable_spec.rb +642 -0
- data/spec/spec_helper.rb +1 -0
- metadata +175 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Frank Hall
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# HugeEnumerable
|
2
|
+
|
3
|
+
Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'huge_enumerable'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install huge_enumerable
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
See HugeCollection, HugeCombination, HugePermutation, and HugeProduct for ways of utilizing HugeEnumerable.
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
require 'rdoc/task'
|
4
|
+
|
5
|
+
RSpec::Core::RakeTask.new
|
6
|
+
|
7
|
+
task :default => :spec
|
8
|
+
task :test => :spec
|
9
|
+
|
10
|
+
RDoc::Task.new do |rdoc|
|
11
|
+
rdoc.rdoc_dir = 'doc'
|
12
|
+
rdoc.main = 'README.md'
|
13
|
+
rdoc.rdoc_files.include 'README.md', "lib/**/*\.rb"
|
14
|
+
|
15
|
+
rdoc.options << '--line-numbers'
|
16
|
+
end
|
17
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'huge_enumerable/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "huge_enumerable"
|
8
|
+
spec.version = HugeEnumerable::VERSION
|
9
|
+
spec.authors = ["Frank Hall"]
|
10
|
+
spec.email = ["ChapterHouse.Dune@gmail.com"]
|
11
|
+
spec.description = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
|
12
|
+
spec.summary = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
|
13
|
+
spec.homepage = "https://github.com/ChapterHouse/huge_enumerable.git"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_development_dependency "bundler", "~> 1.3"
|
22
|
+
spec.add_development_dependency "rake"
|
23
|
+
spec.add_development_dependency 'rspec', '~> 2.13'
|
24
|
+
spec.add_development_dependency 'rdoc'
|
25
|
+
spec.add_runtime_dependency "backports" # Wish this could be conditional. It is only used for ruby 1.8 for as long as I support it.
|
26
|
+
spec.add_runtime_dependency "prime_miller_rabin", ">= 0.0.2"
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'huge_enumerable'
|
2
|
+
# The simplest form of a HugeEnumerable.
|
3
|
+
# This class can be used for large arrays or anything else that responds to [].
|
4
|
+
# It is not necessary for the enumerable to be completely mapped into memory.
|
5
|
+
# It only has to be able to return the element mapped to the index given to [].
|
6
|
+
# ==== Examples
|
7
|
+
#
|
8
|
+
# Using HugeCollection directly:
|
9
|
+
#
|
10
|
+
# original_array = ('a'..'z').to_a
|
11
|
+
# collection = HugeCollection.new(original_array)
|
12
|
+
# collection.shuffle!
|
13
|
+
# original_array[0..4] # => ["a", "b", "c", "d", "e"]
|
14
|
+
# collection[0..4] # => ["j", "a", "r", "i", "z"]
|
15
|
+
#
|
16
|
+
#
|
17
|
+
# Subclassing HugeCollection
|
18
|
+
#
|
19
|
+
# class StringNext < HugeCollection
|
20
|
+
#
|
21
|
+
# def initialize(size)
|
22
|
+
# @collection_size = size
|
23
|
+
# super ('a'..'z').to_a
|
24
|
+
# end
|
25
|
+
#
|
26
|
+
# private
|
27
|
+
#
|
28
|
+
# def fetch(index)
|
29
|
+
# result = ""
|
30
|
+
# index += 1
|
31
|
+
# while index > 0
|
32
|
+
# index -= 1
|
33
|
+
# result.prepend super(index % 26)
|
34
|
+
# index /= 26
|
35
|
+
# end
|
36
|
+
# result
|
37
|
+
# end
|
38
|
+
#
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
# googol = 10**100
|
42
|
+
# collection = StringNext.new(googol)
|
43
|
+
# collection.size # => 10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
|
44
|
+
# collection[0] # => "a"
|
45
|
+
# collection[-1] # => "zhxrtplbmwaiwcqlzpmglpziaegsdivmbvlnssusbjtbcgywaycqnhxztqwwikxvrsptazpp"
|
46
|
+
# collection[googol / 2] # => "dlijhfafxmqxnusmhfpshmdmopvodxfnkfgivwvnejaapyxmynutdlmjhxxqrykiiuizzhi"
|
47
|
+
# collection.shuffle!
|
48
|
+
# collection[0] # => "bipzqqzayczkgsmaseflwktpsotzclcjsqlnnjaciaawufpojywxflknuddhqkilhoedacn"
|
49
|
+
# collection[-1] # => "etneuebyurxgrvrfsreesxuvjaiyoqwplofsptacjdbhuhafdiwbwujvniokltgkjbfkiuy"
|
50
|
+
class HugeCollection < HugeEnumerable
|
51
|
+
|
52
|
+
# Create a new HugeCollection
|
53
|
+
#
|
54
|
+
# ==== Attributes
|
55
|
+
#
|
56
|
+
# * +enumerable+ - Any enumerable that responds to []
|
57
|
+
#
|
58
|
+
# ==== Options
|
59
|
+
#
|
60
|
+
# * +:max_array_size+ - The default size of arrays when #to_a is called.
|
61
|
+
# * +:rng+ - The random number generator to use.
|
62
|
+
def initialize(enumerable, max_array_size = nil, rng = nil)
|
63
|
+
@enum = enumerable
|
64
|
+
super(max_array_size, rng)
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
attr_accessor :enum
|
70
|
+
|
71
|
+
def collection_size
|
72
|
+
@collection_size ||= enum_size
|
73
|
+
end
|
74
|
+
|
75
|
+
def enum_size
|
76
|
+
@enum_size ||= enum.size
|
77
|
+
end
|
78
|
+
|
79
|
+
def fetch(index)
|
80
|
+
enum[index]
|
81
|
+
end
|
82
|
+
|
83
|
+
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'huge_enumerable'
|
2
|
+
# HugeCombination is a HugeEnumerable style combination. Comparable to Array#combination.
|
3
|
+
# This class can be used to generate combinations of large arrays or anything else that responds to [].
|
4
|
+
# It is not necessary for the enumerable to be completely mapped into memory.
|
5
|
+
# It only has to be able to return the element mapped to the index given to [].
|
6
|
+
# ==== Examples
|
7
|
+
#
|
8
|
+
# Using HugeCombination directly:
|
9
|
+
#
|
10
|
+
# combination = HugeCombination.new(('a'..'z').to_a, 2)
|
11
|
+
# combination[0..4] # => [["a", "b"], ["a", "c"], ["a", "d"], ["a", "e"], ["a", "f"]]
|
12
|
+
# combination[23..27] # => [["a", "y"], ["a", "z"], ["b", "c"], ["b", "d"], ["b", "e"]]
|
13
|
+
#
|
14
|
+
#
|
15
|
+
# Subclassing HugeCombination
|
16
|
+
#
|
17
|
+
# class NumberArray < HugeCollection
|
18
|
+
#
|
19
|
+
# def initialize(size)
|
20
|
+
# @collection_size = size
|
21
|
+
# super(nil)
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# private
|
25
|
+
#
|
26
|
+
# def fetch(index)
|
27
|
+
# index
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# end
|
31
|
+
#
|
32
|
+
# class NumberCombination < HugeCombination
|
33
|
+
#
|
34
|
+
# def initialize(size)
|
35
|
+
# enumerable = size < 10 ? (0...size).to_a : NumberArray.new(size)
|
36
|
+
# super enumerable, 2, nil, nil
|
37
|
+
# end
|
38
|
+
#
|
39
|
+
# private
|
40
|
+
#
|
41
|
+
# def fetch(index)
|
42
|
+
# array = super
|
43
|
+
# sum = array.inject(0) { |sum, i| sum += i }
|
44
|
+
# "#{array.first} + #{array.last} = #{sum}"
|
45
|
+
# end
|
46
|
+
#
|
47
|
+
# end
|
48
|
+
#
|
49
|
+
# combination = NumberCombination.new(10**30)
|
50
|
+
# size = combination.size # => 499999999999999999999999999999500000000000000000000000000000
|
51
|
+
# combination[0] # => "0 + 1 = 1"
|
52
|
+
# combination[-1] # => "999999999999999999999999999998 + 999999999999999999999999999999 = 1999999999999999999999999999997"
|
53
|
+
# combination[size / 2] # => "292893218813452475599155637895 + 296085173605458049080913472356 = 588978392418910524680069110251"
|
54
|
+
class HugeCombination < HugeCollection
|
55
|
+
|
56
|
+
# Create a new HugeCombination
|
57
|
+
#
|
58
|
+
# ==== Attributes
|
59
|
+
#
|
60
|
+
# * +enumerable+ - Any enumerable that responds to []
|
61
|
+
# * +size+ - The number of elements per combination to use from enumerable. (Currently only size 2 is supported)
|
62
|
+
#
|
63
|
+
# ==== Options
|
64
|
+
#
|
65
|
+
# * +:max_array_size+ - The default size of arrays when #to_a is called.
|
66
|
+
# * +:rng+ - The random number generator to use.
|
67
|
+
def initialize(enumerable, size, max_array_size = nil, rng = nil)
|
68
|
+
raise NotImplementedError, "Not yet implemented for any size != 2" if size != 2 # TODO: Extend this class to handle length N
|
69
|
+
@combination_size = size
|
70
|
+
super(enumerable, max_array_size, rng)
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def collection_size
|
76
|
+
sum(enum_size - 1)
|
77
|
+
end
|
78
|
+
|
79
|
+
def fetch(index)
|
80
|
+
cycle = locate_cycle(index)
|
81
|
+
first_index = cycle - 1
|
82
|
+
max_cycles = enum_size - 1
|
83
|
+
used = (cycle - 1) == 0 ? 0 : sum_from(max_cycles, max_cycles - (cycle - 2))
|
84
|
+
second_index = index - used + cycle
|
85
|
+
[enum[first_index], enum[second_index]]
|
86
|
+
end
|
87
|
+
|
88
|
+
def locate_cycle(index, min=0, max=enum_size-1)
|
89
|
+
cycle = min + (max - min) / 2
|
90
|
+
|
91
|
+
check_high = sum_at_cycle(cycle)
|
92
|
+
check_low = sum_at_cycle(cycle - 1)
|
93
|
+
|
94
|
+
if check_high > index && check_low <= index
|
95
|
+
cycle
|
96
|
+
elsif check_low > index
|
97
|
+
locate_cycle(index, min, cycle-1)
|
98
|
+
else
|
99
|
+
locate_cycle(index, cycle+1, max)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
def sum(x)
|
104
|
+
x * (x + 1) / 2
|
105
|
+
end
|
106
|
+
|
107
|
+
def sum_from(m, n)
|
108
|
+
m, n = [n, m] if m > n
|
109
|
+
(n + 1 - m)*(n + m)/2
|
110
|
+
end
|
111
|
+
|
112
|
+
def sum_at_cycle(c)
|
113
|
+
ec = enum_size * c
|
114
|
+
(-c + 2*ec - c**2)/2
|
115
|
+
end
|
116
|
+
|
117
|
+
end
|
118
|
+
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'huge_enumerable'
|
2
|
+
# HugePermutation is a HugeEnumerable style permutation. Comparable to Array#permutation.
|
3
|
+
# This class can be used to generate permutations of large arrays or anything else that responds to [].
|
4
|
+
# It is not necessary for the enumerable to be completely mapped into memory.
|
5
|
+
# It only has to be able to return the element mapped to the index given to [].
|
6
|
+
# ==== Examples
|
7
|
+
#
|
8
|
+
# Using HugePermutation directly:
|
9
|
+
#
|
10
|
+
# permutation = HugePermutation.new(('a'..'z').to_a, 2)
|
11
|
+
# permutation[0..4] # => [["a", "b"], ["a", "c"], ["a", "d"], ["a", "e"], ["a", "f"]]
|
12
|
+
# permutation[23..27] # => [["a", "y"], ["a", "z"], ["b", "a"], ["b", "c"], ["b", "d"]]
|
13
|
+
#
|
14
|
+
#
|
15
|
+
# Subclassing HugePermutation
|
16
|
+
#
|
17
|
+
# class SouthernNames < HugePermutation
|
18
|
+
#
|
19
|
+
# def initialize
|
20
|
+
# base_names = %w{Bill Joe Jo Bob Mary Lou Betty Sue Jimmy Ann Lee Ruby Jack Belle Daisy Dixie Lynn}
|
21
|
+
# super base_names, 2, nil, nil
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# private
|
25
|
+
#
|
26
|
+
# def fetch(index)
|
27
|
+
# "Your southern name is: #{super(index).join(' ')}"
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# end
|
31
|
+
#
|
32
|
+
# southern_name = SouthernNames.new
|
33
|
+
# southern_name[0] # => "Your southern name is: Bill Joe"
|
34
|
+
# southern_name[-1] # => "Your southern name is: Lynn Dixie"
|
35
|
+
# size = southern_name.size # => 272
|
36
|
+
# southern_name[size / 2] # => "Your southern name is: Jimmy Ann"
|
37
|
+
class HugePermutation < HugeCollection
|
38
|
+
|
39
|
+
# Create a new HugePermutation
|
40
|
+
#
|
41
|
+
# ==== Attributes
|
42
|
+
#
|
43
|
+
# * +enumerable+ - Any enumerable that responds to []
|
44
|
+
# * +length+ - The number of elements per permutation to use from enumerable. (Currently only length 2 is supported)
|
45
|
+
#
|
46
|
+
# ==== Options
|
47
|
+
#
|
48
|
+
# * +:max_array_size+ - The default size of arrays when #to_a is called.
|
49
|
+
# * +:rng+ - The random number generator to use.
|
50
|
+
def initialize(enumerable, length, max_array_size = nil, rng = nil)
|
51
|
+
raise NotImplementedError, "Not yet implemented for any length != 2" if length != 2 # TODO: Extend this class to handle length N
|
52
|
+
super(enumerable, max_array_size, rng)
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
def fetch(x)
|
58
|
+
first_index = x / (enum_size - 1)
|
59
|
+
second_index = ((x % enum_size) + (x / enum_size + 1)) % enum_size
|
60
|
+
[enum[first_index], enum[second_index]]
|
61
|
+
end
|
62
|
+
|
63
|
+
def collection_size
|
64
|
+
enum_size * (enum_size - 1)
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'huge_enumerable'
|
2
|
+
# HugeProduct is a HugeEnumerable style product. Comparable to Array#product.
|
3
|
+
# This class can be used to generate products of large arrays or anything else that responds to [].
|
4
|
+
# It is not necessary for the enumerables to be completely mapped into memory.
|
5
|
+
# They only have to be able to return the element mapped to the index given to [].
|
6
|
+
# ==== Examples
|
7
|
+
#
|
8
|
+
# Using HugeProduct directly:
|
9
|
+
#
|
10
|
+
# product = HugeProduct.new(('a'..'z').to_a, ('A'..'Z').to_a)
|
11
|
+
# product[0..4] # => [["a", "A"], ["a", "B"], ["a", "C"], ["a", "D"], ["a", "E"]]
|
12
|
+
# product[23..27] # => [["a", "X"], ["a", "Y"], ["a", "Z"], ["b", "A"], ["b", "B"]]
|
13
|
+
#
|
14
|
+
#
|
15
|
+
# Subclassing HugeProduct
|
16
|
+
#
|
17
|
+
# class BabyGirlNames < HugeProduct
|
18
|
+
#
|
19
|
+
# def initialize
|
20
|
+
# first_names = %w{Emma Olivia Sophia Isabella Ava Mia Emily Charlotte Ella Amelia Abigail Madison Lily Chloe}
|
21
|
+
# middle_names = %w{Zoe Sophie Evelyn Aubrey Elizabeth Layla Anna Natalie Brooklyn Aria Audrey Ellie Lucy}
|
22
|
+
# super(first_names, middle_names)
|
23
|
+
# end
|
24
|
+
#
|
25
|
+
# private
|
26
|
+
#
|
27
|
+
# def fetch(index)
|
28
|
+
# super(index).join(' ')
|
29
|
+
# end
|
30
|
+
#
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# name = BabyGirlNames.new
|
34
|
+
# name[0] # => "Emma Zoe"
|
35
|
+
# name[-1] # => "Chloe Lucy"
|
36
|
+
# size = name.size # => 182
|
37
|
+
# name[size / 2] # => "Charlotte Zoe"
|
38
|
+
class HugeProduct < HugeEnumerable
|
39
|
+
|
40
|
+
# Create a new HugeProduct
|
41
|
+
#
|
42
|
+
# ==== Attributes
|
43
|
+
#
|
44
|
+
# * +enumerable_a+ - Any enumerable that responds to []
|
45
|
+
# * +enumerable_b+ - Any enumerable that responds to [] (This can be the same object as enumerable_a)
|
46
|
+
#
|
47
|
+
# ==== Options
|
48
|
+
#
|
49
|
+
# * +:max_array_size+ - The default size of arrays when #to_a is called.
|
50
|
+
# * +:rng+ - The random number generator to use.
|
51
|
+
def initialize(enumerable_a, enumerable_b, max_array_size = nil, rng = nil)
|
52
|
+
@enum_a = enumerable_a
|
53
|
+
@enum_b = enumerable_b
|
54
|
+
super(max_array_size, rng)
|
55
|
+
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
attr_accessor :enum_a, :enum_b
|
60
|
+
|
61
|
+
def collection_size
|
62
|
+
enum_a_size * enum_b_size
|
63
|
+
end
|
64
|
+
|
65
|
+
def fetch(x)
|
66
|
+
[enum_a[x / enum_b_size], enum_b[x % enum_b_size]]
|
67
|
+
end
|
68
|
+
|
69
|
+
def enum_a_size
|
70
|
+
@enum_a_size ||= enum_a.size
|
71
|
+
end
|
72
|
+
|
73
|
+
def enum_b_size
|
74
|
+
@enum_b_size ||= enum_b.size
|
75
|
+
end
|
76
|
+
|
77
|
+
end
|