huge_enumerable 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +20 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +17 -0
- data/huge_enumerable.gemspec +28 -0
- data/lib/huge_enumerable/huge_collection.rb +83 -0
- data/lib/huge_enumerable/huge_combination.rb +118 -0
- data/lib/huge_enumerable/huge_permutation.rb +67 -0
- data/lib/huge_enumerable/huge_product.rb +77 -0
- data/lib/huge_enumerable/version.rb +4 -0
- data/lib/huge_enumerable.rb +357 -0
- data/spec/lib/huge_enumerable/huge_collection_spec.rb +33 -0
- data/spec/lib/huge_enumerable/huge_combination_spec.rb +38 -0
- data/spec/lib/huge_enumerable/huge_permutation_spec.rb +38 -0
- data/spec/lib/huge_enumerable/huge_product_spec.rb +35 -0
- data/spec/lib/huge_enumerable_spec.rb +642 -0
- data/spec/spec_helper.rb +1 -0
- metadata +175 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Frank Hall
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# HugeEnumerable
|
2
|
+
|
3
|
+
Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'huge_enumerable'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install huge_enumerable
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
See HugeCollection, HugeCombination, HugePermutation, and HugeProduct for ways of utilizing HugeEnumerable.
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
require 'rdoc/task'
|
4
|
+
|
5
|
+
# Define the RSpec rake task with its default settings.
RSpec::Core::RakeTask.new

# Both the bare `rake` invocation and `rake test` depend on the :spec task.
[:default, :test].each do |task_name|
  task task_name => :spec
end

# RDoc task: output goes to doc/, README.md is the main page, and the README
# plus every Ruby file under lib/ are included.
RDoc::Task.new do |rdoc|
  rdoc.rdoc_dir = 'doc'
  rdoc.main = 'README.md'
  rdoc.rdoc_files.include('README.md', "lib/**/*\.rb")
  rdoc.options << '--line-numbers'
end
|
17
|
+
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'huge_enumerable/version'
|
5
|
+
|
6
|
+
# Gem specification for huge_enumerable.
Gem::Specification.new do |spec|
  # One sentence is used for both the summary and the description.
  blurb = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}

  spec.name        = "huge_enumerable"
  spec.version     = HugeEnumerable::VERSION
  spec.authors     = ["Frank Hall"]
  spec.email       = ["ChapterHouse.Dune@gmail.com"]
  spec.description = blurb
  spec.summary     = blurb
  spec.homepage    = "https://github.com/ChapterHouse/huge_enumerable.git"
  spec.license     = "MIT"

  # The file list comes from `git ls-files`, split on the input record separator.
  # Executables are the basenames of entries under bin/; test files are the
  # entries under test/, spec/ or features/.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency 'rspec', '~> 2.13'
  spec.add_development_dependency 'rdoc'

  # Runtime dependencies.
  spec.add_runtime_dependency "backports" # Wish this could be conditional. It is only used for ruby 1.8 for as long as I support it.
  spec.add_runtime_dependency "prime_miller_rabin", ">= 0.0.2"
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'huge_enumerable'
|
2
|
+
# The simplest form of a HugeEnumerable.
|
3
|
+
# This class can be used for large arrays or anything else that responds to [].
|
4
|
+
# It is not necessary for the enumerable to be completely mapped into memory.
|
5
|
+
# It only has to be able to return the element mapped to the index given to [].
|
6
|
+
# ==== Examples
|
7
|
+
#
|
8
|
+
# Using HugeCollection directly:
|
9
|
+
#
|
10
|
+
# original_array = ('a'..'z').to_a
|
11
|
+
# collection = HugeCollection.new(original_array)
|
12
|
+
# collection.shuffle!
|
13
|
+
# original_array[0..4] # => ["a", "b", "c", "d", "e"]
|
14
|
+
# collection[0..4] # => ["j", "a", "r", "i", "z"]
|
15
|
+
#
|
16
|
+
#
|
17
|
+
# Subclassing HugeCollection
|
18
|
+
#
|
19
|
+
# class StringNext < HugeCollection
|
20
|
+
#
|
21
|
+
# def initialize(size)
|
22
|
+
# @collection_size = size
|
23
|
+
# super ('a'..'z').to_a
|
24
|
+
# end
|
25
|
+
#
|
26
|
+
# private
|
27
|
+
#
|
28
|
+
# def fetch(index)
|
29
|
+
# result = ""
|
30
|
+
# index += 1
|
31
|
+
# while index > 0
|
32
|
+
# index -= 1
|
33
|
+
# result.prepend super(index % 26)
|
34
|
+
# index /= 26
|
35
|
+
# end
|
36
|
+
# result
|
37
|
+
# end
|
38
|
+
#
|
39
|
+
# end
|
40
|
+
#
|
41
|
+
# googol = 10**100
|
42
|
+
# collection = StringNext.new(googol)
|
43
|
+
# collection.size # => 10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
|
44
|
+
# collection[0] # => "a"
|
45
|
+
# collection[-1] # => "zhxrtplbmwaiwcqlzpmglpziaegsdivmbvlnssusbjtbcgywaycqnhxztqwwikxvrsptazpp"
|
46
|
+
# collection[googol / 2] # => "dlijhfafxmqxnusmhfpshmdmopvodxfnkfgivwvnejaapyxmynutdlmjhxxqrykiiuizzhi"
|
47
|
+
# collection.shuffle!
|
48
|
+
# collection[0] # => "bipzqqzayczkgsmaseflwktpsotzclcjsqlnnjaciaawufpojywxflknuddhqkilhoedacn"
|
49
|
+
# collection[-1] # => "etneuebyurxgrvrfsreesxuvjaiyoqwplofsptacjdbhuhafdiwbwujvniokltgkjbfkiuy"
|
50
|
+
class HugeCollection < HugeEnumerable

  # Create a new HugeCollection
  #
  # ==== Attributes
  #
  # * +enumerable+ - The backing object. It is indexed with [] by #fetch and asked for
  #   its size by #enum_size.
  # * +max_array_size+ - Optional positional argument (default nil), forwarded unchanged
  #   to HugeEnumerable#initialize.
  # * +rng+ - Optional positional argument (default nil), forwarded unchanged to
  #   HugeEnumerable#initialize.
  def initialize(enumerable, max_array_size = nil, rng = nil)
    # The backing object is stored before the parent initializer runs.
    @enum = enumerable
    super(max_array_size, rng)
  end

  private

  # Private reader and writer for the backing object.
  attr_reader :enum
  attr_writer :enum

  # Number of elements in this collection. Memoized in @collection_size, so a
  # subclass that assigns @collection_size itself bypasses #enum_size here.
  def collection_size
    @collection_size ||= enum_size
  end

  # Size reported by the backing object, memoized in @enum_size.
  def enum_size
    @enum_size ||= enum.size
  end

  # Return the element of the backing object stored at +index+.
  def fetch(index)
    enum[index]
  end

end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
require 'huge_enumerable'
|
2
|
+
# HugeCombination is a HugeEnumerable style combination. Comparable to Array#combination.
|
3
|
+
# This class can be used to generate combinations of large arrays or anything else that responds to [].
|
4
|
+
# It is not necessary for the enumerable to be completely mapped into memory.
|
5
|
+
# It only has to be able to return the element mapped to the index given to [].
|
6
|
+
# ==== Examples
|
7
|
+
#
|
8
|
+
# Using HugeCombination directly:
|
9
|
+
#
|
10
|
+
# combination = HugeCombination.new(('a'..'z').to_a, 2)
|
11
|
+
# combination[0..4] # => [["a", "b"], ["a", "c"], ["a", "d"], ["a", "e"], ["a", "f"]]
|
12
|
+
# combination[23..27] # => [["a", "y"], ["a", "z"], ["b", "c"], ["b", "d"], ["b", "e"]]
|
13
|
+
#
|
14
|
+
#
|
15
|
+
# Subclassing HugeCombination
|
16
|
+
#
|
17
|
+
# class NumberArray < HugeCollection
|
18
|
+
#
|
19
|
+
# def initialize(size)
|
20
|
+
# @collection_size = size
|
21
|
+
# super(nil)
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# private
|
25
|
+
#
|
26
|
+
# def fetch(index)
|
27
|
+
# index
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# end
|
31
|
+
#
|
32
|
+
# class NumberCombination < HugeCombination
|
33
|
+
#
|
34
|
+
# def initialize(size)
|
35
|
+
# enumerable = size < 10 ? (0...size).to_a : NumberArray.new(size)
|
36
|
+
# super enumerable, 2, nil, nil
|
37
|
+
# end
|
38
|
+
#
|
39
|
+
# private
|
40
|
+
#
|
41
|
+
# def fetch(index)
|
42
|
+
# array = super
|
43
|
+
# sum = array.inject(0) { |sum, i| sum += i }
|
44
|
+
# "#{array.first} + #{array.last} = #{sum}"
|
45
|
+
# end
|
46
|
+
#
|
47
|
+
# end
|
48
|
+
#
|
49
|
+
# combination = NumberCombination.new(10**30)
|
50
|
+
# size = combination.size # => 499999999999999999999999999999500000000000000000000000000000
|
51
|
+
# combination[0] # => "0 + 1 = 1"
|
52
|
+
# combination[-1] # => "999999999999999999999999999998 + 999999999999999999999999999999 = 1999999999999999999999999999997"
|
53
|
+
# combination[size / 2] # => "292893218813452475599155637895 + 296085173605458049080913472356 = 588978392418910524680069110251"
|
54
|
+
class HugeCombination < HugeCollection

  # Create a new HugeCombination
  #
  # ==== Attributes
  #
  # * +enumerable+ - The backing object, forwarded to HugeCollection#initialize.
  # * +size+ - Number of elements per combination. Any value other than 2 raises
  #   NotImplementedError.
  # * +max_array_size+ - Optional positional argument (default nil), forwarded to the parent.
  # * +rng+ - Optional positional argument (default nil), forwarded to the parent.
  def initialize(enumerable, size, max_array_size = nil, rng = nil)
    # TODO: Extend this class to handle length N
    unless size == 2
      raise NotImplementedError, "Not yet implemented for any size != 2"
    end
    # Stored for future use; nothing in this class reads it back.
    @combination_size = size
    super(enumerable, max_array_size, rng)
  end

  private

  # Number of 2-element combinations: 1 + 2 + ... + (enum_size - 1).
  def collection_size
    sum(enum_size - 1)
  end

  # Build the pair at position +index+.
  #
  # A "cycle" is the 1-based run of consecutive pairs that share the same first
  # element; cycle c uses first element c - 1 and holds enum_size - c pairs.
  def fetch(index)
    row = locate_cycle(index)
    left = row - 1
    last_row = enum_size - 1
    # Pairs consumed by the earlier cycles: the (row - 1) largest cycle lengths.
    consumed = if left.zero?
      0
    else
      sum_from(last_row, last_row - (row - 2))
    end
    right = index - consumed + row
    [enum[left], enum[right]]
  end

  # Binary search for the cycle c with sum_at_cycle(c - 1) <= index < sum_at_cycle(c).
  # +min+ and +max+ bound the candidate cycles for the current recursion step.
  def locate_cycle(index, min=0, max=enum_size-1)
    cycle = min + (max - min) / 2

    upper = sum_at_cycle(cycle)
    lower = sum_at_cycle(cycle - 1)

    # Everything up to the previous cycle already overshoots: look lower.
    return locate_cycle(index, min, cycle - 1) if lower > index
    # This cycle still ends at or before index: look higher.
    return locate_cycle(index, cycle + 1, max) unless upper > index

    cycle
  end

  # Triangular number 1 + 2 + ... + x.
  # NOTE(review): this private helper has the same name as Enumerable#sum; if
  # HugeEnumerable mixes in Enumerable, confirm that shadowing it is intended.
  def sum(x)
    (x * (x + 1)) / 2
  end

  # Sum of the consecutive integers between +m+ and +n+ inclusive, in either order.
  def sum_from(m, n)
    low, high = m > n ? [n, m] : [m, n]
    count = high + 1 - low
    count * (high + low) / 2
  end

  # Closed form of (enum_size - 1) + (enum_size - 2) + ... + (enum_size - c),
  # i.e. the number of pairs contained in the first +c+ cycles.
  def sum_at_cycle(c)
    scaled = enum_size * c
    (2 * scaled - c - c * c) / 2
  end

end
|
118
|
+
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'huge_enumerable'
|
2
|
+
# HugePermutation is a HugeEnumerable style permutation. Comparable to Array#permutation.
|
3
|
+
# This class can be used to generate permutations of large arrays or anything else that responds to [].
|
4
|
+
# It is not necessary for the enumerable to be completely mapped into memory.
|
5
|
+
# It only has to be able to return the element mapped to the index given to [].
|
6
|
+
# ==== Examples
|
7
|
+
#
|
8
|
+
# Using HugePermutation directly:
|
9
|
+
#
|
10
|
+
# permutation = HugePermutation.new(('a'..'z').to_a, 2)
|
11
|
+
# permutation[0..4] # => [["a", "b"], ["a", "c"], ["a", "d"], ["a", "e"], ["a", "f"]]
|
12
|
+
# permutation[23..27] # => [["a", "y"], ["a", "z"], ["b", "a"], ["b", "c"], ["b", "d"]]
|
13
|
+
#
|
14
|
+
#
|
15
|
+
# Subclassing HugePermutation
|
16
|
+
#
|
17
|
+
# class SouthernNames < HugePermutation
|
18
|
+
#
|
19
|
+
# def initialize
|
20
|
+
# base_names = %w{Bill Joe Jo Bob Mary Lou Betty Sue Jimmy Ann Lee Ruby Jack Belle Daisy Dixie Lynn}
|
21
|
+
# super base_names, 2, nil, nil
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# private
|
25
|
+
#
|
26
|
+
# def fetch(index)
|
27
|
+
# "Your southern name is: #{super(index).join(' ')}"
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# end
|
31
|
+
#
|
32
|
+
# southern_name = SouthernNames.new
|
33
|
+
# southern_name[0] # => "Your southern name is: Bill Joe"
|
34
|
+
# southern_name[-1] # => "Your southern name is: Lynn Dixie"
|
35
|
+
# size = southern_name.size # => 272
|
36
|
+
# southern_name[size / 2] # => "Your southern name is: Jimmy Ann"
|
37
|
+
class HugePermutation < HugeCollection

  # Create a new HugePermutation
  #
  # ==== Attributes
  #
  # * +enumerable+ - The backing object, forwarded to HugeCollection#initialize.
  # * +length+ - Number of elements per permutation. Any value other than 2 raises
  #   NotImplementedError.
  # * +max_array_size+ - Optional positional argument (default nil), forwarded to the parent.
  # * +rng+ - Optional positional argument (default nil), forwarded to the parent.
  def initialize(enumerable, length, max_array_size = nil, rng = nil)
    # TODO: Extend this class to handle length N
    unless length == 2
      raise NotImplementedError, "Not yet implemented for any length != 2"
    end
    super(enumerable, max_array_size, rng)
  end

  private

  # Build the ordered pair at position +x+.
  #
  # Each first element owns a run of enum_size - 1 positions. The second index
  # is computed modulo enum_size with an offset that grows by one per full
  # enum_size positions, which makes it step over the first element's own index.
  def fetch(x)
    total = enum_size
    lead = x / (total - 1)
    remainder = x % total
    wraps = x / total
    follow = (remainder + wraps + 1) % total
    [enum[lead], enum[follow]]
  end

  # Number of ordered pairs of distinct positions: n * (n - 1).
  def collection_size
    total = enum_size
    total * (total - 1)
  end

end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
require 'huge_enumerable'
|
2
|
+
# HugeProduct is a HugeEnumerable style product. Comparable to Array#product.
|
3
|
+
# This class can be used to generate products of large arrays or anything else that responds to [].
|
4
|
+
# It is not necessary for the enumerables to be completely mapped into memory.
|
5
|
+
# They only have to be able to return the element mapped to the index given to [].
|
6
|
+
# ==== Examples
|
7
|
+
#
|
8
|
+
# Using HugeProduct directly:
|
9
|
+
#
|
10
|
+
# product = HugeProduct.new(('a'..'z').to_a, ('A'..'Z').to_a)
|
11
|
+
# product[0..4] # => [["a", "A"], ["a", "B"], ["a", "C"], ["a", "D"], ["a", "E"]]
|
12
|
+
# product[23..27] # => [["a", "X"], ["a", "Y"], ["a", "Z"], ["b", "A"], ["b", "B"]]
|
13
|
+
#
|
14
|
+
#
|
15
|
+
# Subclassing HugeProduct
|
16
|
+
#
|
17
|
+
# class BabyGirlNames < HugeProduct
|
18
|
+
#
|
19
|
+
# def initialize
|
20
|
+
# first_names = %w{Emma Olivia Sophia Isabella Ava Mia Emily Charlotte Ella Amelia Abigail Madison Lily Chloe}
|
21
|
+
# middle_names = %w{Zoe Sophie Evelyn Aubrey Elizabeth Layla Anna Natalie Brooklyn Aria Audrey Ellie Lucy}
|
22
|
+
# super(first_names, middle_names)
|
23
|
+
# end
|
24
|
+
#
|
25
|
+
# private
|
26
|
+
#
|
27
|
+
# def fetch(index)
|
28
|
+
# super(index).join(' ')
|
29
|
+
# end
|
30
|
+
#
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# name = BabyGirlNames.new
|
34
|
+
# name[0] # => "Emma Zoe"
|
35
|
+
# name[-1] # => "Chloe Lucy"
|
36
|
+
# size = name.size # => 182
|
37
|
+
# name[size / 2] # => "Charlotte Zoe"
|
38
|
+
class HugeProduct < HugeEnumerable

  # Create a new HugeProduct
  #
  # ==== Attributes
  #
  # * +enumerable_a+ - Source of the first element of each pair. Indexed with [] and asked for its size.
  # * +enumerable_b+ - Source of the second element of each pair. Indexed with [] and asked for its size.
  # * +max_array_size+ - Optional positional argument (default nil), forwarded unchanged
  #   to HugeEnumerable#initialize.
  # * +rng+ - Optional positional argument (default nil), forwarded unchanged to
  #   HugeEnumerable#initialize.
  def initialize(enumerable_a, enumerable_b, max_array_size = nil, rng = nil)
    # Both sources are stored before the parent initializer runs.
    @enum_a = enumerable_a
    @enum_b = enumerable_b
    super(max_array_size, rng)
  end

  private

  # Private readers and writers for the two sources.
  attr_reader :enum_a, :enum_b
  attr_writer :enum_a, :enum_b

  # Total number of pairs: size of a times size of b.
  def collection_size
    enum_a_size * enum_b_size
  end

  # Build the pair at position +x+. The quotient by b's size selects the
  # element of a; the remainder selects the element of b.
  def fetch(x)
    left = enum_a[x / enum_b_size]
    right = enum_b[x % enum_b_size]
    [left, right]
  end

  # Size reported by enum_a, memoized in @enum_a_size.
  def enum_a_size
    @enum_a_size ||= enum_a.size
  end

  # Size reported by enum_b, memoized in @enum_b_size.
  def enum_b_size
    @enum_b_size ||= enum_b.size
  end

end
|