huge_enumerable 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,20 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ Gemfile.lock
6
+ InstalledFiles
7
+ coverage
8
+ InstalledFiles
9
+ lib/bundler/man
10
+ pkg
11
+ rdoc
12
+ spec/reports
13
+ test/tmp
14
+ test/version_tmp
15
+ tmp
16
+
17
+ # YARD artifacts
18
+ .yardoc
19
+ _yardoc
20
+ doc/
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Frank Hall
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # HugeEnumerable
2
+
3
+ Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'huge_enumerable'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install huge_enumerable
18
+
19
+ ## Usage
20
+
21
+ See HugeCollection, HugeCombination, HugePermutation, and HugeProduct for ways of utilizing HugeEnumerable.
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+ require 'rdoc/task'
4
+
5
# Registers RSpec's rake task using its default settings.
RSpec::Core::RakeTask.new

# Running plain `rake` or `rake test` both delegate to the spec task.
[:default, :test].each do |alias_name|
  task alias_name => :spec
end

# RDoc task: documents README.md plus every Ruby file below lib/ into doc/.
RDoc::Task.new do |doc_task|
  doc_task.rdoc_dir = 'doc'
  doc_task.main = 'README.md'
  doc_task.rdoc_files.include('README.md', 'lib/**/*.rb')

  doc_task.options << '--line-numbers'
end
17
+
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'huge_enumerable/version'
5
+
6
# Gem metadata for huge_enumerable. The version comes from HugeEnumerable::VERSION,
# and the packaged file list is whatever git currently tracks.
Gem::Specification.new do |spec|
  # Identity
  spec.name     = "huge_enumerable"
  spec.version  = HugeEnumerable::VERSION
  spec.authors  = ["Frank Hall"]
  spec.email    = ["ChapterHouse.Dune@gmail.com"]
  spec.homepage = "https://github.com/ChapterHouse/huge_enumerable.git"
  spec.license  = "MIT"

  # Description and summary intentionally carry the same text.
  spec.description = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}
  spec.summary     = %q{Enumerate, sample, shuffle, combine, permutate, and create products of massive data sets using minimal memory}

  # Packaged files: one entry per line printed by `git ls-files`.
  tracked_files = `git ls-files`.split($/)
  spec.files         = tracked_files
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Development-only dependencies
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency 'rspec', '~> 2.13'
  spec.add_development_dependency 'rdoc'

  # Runtime dependencies
  spec.add_runtime_dependency "backports" # Wish this could be conditional. It is only used for ruby 1.8 for as long as I support it.
  spec.add_runtime_dependency "prime_miller_rabin", ">= 0.0.2"
end
@@ -0,0 +1,83 @@
1
+ require 'huge_enumerable'
2
+ # The simplest form of a HugeEnumerable.
3
+ # This class can be used for large arrays or anything else that responds to [].
4
+ # It is not necessary for the enumerable to be completely mapped into memory.
5
+ # It only has to be able to return the element mapped to the index given to [].
6
+ # ==== Examples
7
+ #
8
+ # Using HugeCollection directly:
9
+ #
10
+ # original_array = ('a'..'z').to_a
11
+ # collection = HugeCollection.new(original_array)
12
+ # collection.shuffle!
13
+ # original_array[0..4] # => ["a", "b", "c", "d", "e"]
14
+ # collection[0..4] # => ["j", "a", "r", "i", "z"]
15
+ #
16
+ #
17
+ # Subclassing HugeCollection
18
+ #
19
+ # class StringNext < HugeCollection
20
+ #
21
+ # def initialize(size)
22
+ # @collection_size = size
23
+ # super ('a'..'z').to_a
24
+ # end
25
+ #
26
+ # private
27
+ #
28
+ # def fetch(index)
29
+ # result = ""
30
+ # index += 1
31
+ # while index > 0
32
+ # index -= 1
33
+ # result.prepend super(index % 26)
34
+ # index /= 26
35
+ # end
36
+ # result
37
+ # end
38
+ #
39
+ # end
40
+ #
41
+ # googol = 10**100
42
+ # collection = StringNext.new(googol)
43
+ # collection.size # => 10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
44
+ # collection[0] # => "a"
45
+ # collection[-1] # => "zhxrtplbmwaiwcqlzpmglpziaegsdivmbvlnssusbjtbcgywaycqnhxztqwwikxvrsptazpp"
46
+ # collection[googol / 2] # => "dlijhfafxmqxnusmhfpshmdmopvodxfnkfgivwvnejaapyxmynutdlmjhxxqrykiiuizzhi"
47
+ # collection.shuffle!
48
+ # collection[0] # => "bipzqqzayczkgsmaseflwktpsotzclcjsqlnnjaciaawufpojywxflknuddhqkilhoedacn"
49
+ # collection[-1] # => "etneuebyurxgrvrfsreesxuvjaiyoqwplofsptacjdbhuhafdiwbwujvniokltgkjbfkiuy"
50
class HugeCollection < HugeEnumerable

  # Build a HugeCollection around an existing enumerable.
  #
  # ==== Attributes
  #
  # * +enumerable+ - Any enumerable that responds to []
  # * +max_array_size+ - The default size of arrays when #to_a is called. Optional positional argument, handed to HugeEnumerable.
  # * +rng+ - The random number generator to use. Optional positional argument, handed to HugeEnumerable.
  def initialize(enumerable, max_array_size = nil, rng = nil)
    # The wrapped enumerable is stored before the parent initializer runs.
    @enum = enumerable
    super(max_array_size, rng)
  end

  private

  # The wrapped enumerable; reader and writer are both private.
  attr_accessor :enum

  # Returns whatever the wrapped enumerable holds at +index+.
  def fetch(index)
    enum[index]
  end

  # Size reported by the wrapped enumerable, asked for once and then cached.
  def enum_size
    @enum_size ||= enum.size
  end

  # Number of elements in this collection. A value already present in
  # @collection_size (for example one assigned by a subclass before calling
  # super) wins; otherwise the wrapped enumerable's size is used and cached.
  def collection_size
    @collection_size ||= enum_size
  end

end
@@ -0,0 +1,118 @@
1
+ require 'huge_enumerable'
2
+ # HugeCombination is a HugeEnumerable style combination. Comparable to Array#combination.
3
+ # This class can be used to generate combinations of large arrays or anything else that responds to [].
4
+ # It is not necessary for the enumerable to be completely mapped into memory.
5
+ # It only has to be able to return the element mapped to the index given to [].
6
+ # ==== Examples
7
+ #
8
+ # Using HugeCombination directly:
9
+ #
10
+ # combination = HugeCombination.new(('a'..'z').to_a, 2)
11
+ # combination[0..4] # => [["a", "b"], ["a", "c"], ["a", "d"], ["a", "e"], ["a", "f"]]
12
+ # combination[23..27] # => [["a", "y"], ["a", "z"], ["b", "c"], ["b", "d"], ["b", "e"]]
13
+ #
14
+ #
15
+ # Subclassing HugeCombination
16
+ #
17
+ # class NumberArray < HugeCollection
18
+ #
19
+ # def initialize(size)
20
+ # @collection_size = size
21
+ # super(nil)
22
+ # end
23
+ #
24
+ # private
25
+ #
26
+ # def fetch(index)
27
+ # index
28
+ # end
29
+ #
30
+ # end
31
+ #
32
+ # class NumberCombination < HugeCombination
33
+ #
34
+ # def initialize(size)
35
+ # enumerable = size < 10 ? (0...size).to_a : NumberArray.new(size)
36
+ # super enumerable, 2, nil, nil
37
+ # end
38
+ #
39
+ # private
40
+ #
41
+ # def fetch(index)
42
+ # array = super
43
+ # sum = array.inject(0) { |total, i| total + i }
44
+ # "#{array.first} + #{array.last} = #{sum}"
45
+ # end
46
+ #
47
+ # end
48
+ #
49
+ # combination = NumberCombination.new(10**30)
50
+ # size = combination.size # => 499999999999999999999999999999500000000000000000000000000000
51
+ # combination[0] # => "0 + 1 = 1"
52
+ # combination[-1] # => "999999999999999999999999999998 + 999999999999999999999999999999 = 1999999999999999999999999999997"
53
+ # combination[size / 2] # => "292893218813452475599155637895 + 296085173605458049080913472356 = 588978392418910524680069110251"
54
class HugeCombination < HugeCollection

  # Create a new HugeCombination
  #
  # ==== Attributes
  #
  # * +enumerable+ - Any enumerable that responds to []
  # * +size+ - The number of elements per combination to use from enumerable. (Currently only size 2 is supported)
  # * +max_array_size+ - The default size of arrays when #to_a is called. Optional positional argument.
  # * +rng+ - The random number generator to use. Optional positional argument.
  #
  # Raises NotImplementedError when +size+ is anything other than 2.
  def initialize(enumerable, size, max_array_size = nil, rng = nil)
    raise NotImplementedError, "Not yet implemented for any size != 2" if size != 2 # TODO: Extend this class to handle length N
    @combination_size = size
    super(enumerable, max_array_size, rng)
  end

  private

  # Number of distinct pairs: (n - 1) + (n - 2) + ... + 1 for an enumerable of n elements.
  def collection_size
    sum(enum_size - 1)
  end

  # Returns the pair found at +index+.
  #
  # Pairs are grouped into "cycles": cycle c holds every pair whose first
  # element sits at position c - 1 of the enumerable, and each cycle is one
  # pair shorter than the one before it.
  def fetch(index)
    cycle = locate_cycle(index)
    first_index = cycle - 1
    # Pairs consumed by all earlier cycles (zero for the first cycle).
    used = sum_at_cycle(first_index)
    # The offset inside the cycle, shifted past the first element's own position.
    second_index = index - used + cycle
    [enum[first_index], enum[second_index]]
  end

  # Binary search for the cycle containing +index+, looking only at cycles
  # between +min+ and +max+ (inclusive).
  #
  # Raises IndexError when no cycle in that window contains +index+. The
  # previous recursive version never terminated in that situation and ended
  # in a SystemStackError.
  def locate_cycle(index, min=0, max=enum_size-1)
    low = min
    high = max

    while low <= high
      cycle = low + (high - low) / 2

      if sum_at_cycle(cycle - 1) > index
        # Everything up to the previous cycle already passes index: look lower.
        high = cycle - 1
      elsif sum_at_cycle(cycle) > index
        return cycle
      else
        low = cycle + 1
      end
    end

    raise IndexError, "index #{index} is outside of the combination"
  end

  # 1 + 2 + ... + x
  def sum(x)
    x * (x + 1) / 2
  end

  # Sum of every integer between m and n inclusive; the bounds may be given in either order.
  def sum_from(m, n)
    m, n = n, m if m > n
    (n + 1 - m)*(n + m)/2
  end

  # Total number of pairs contained in the first +c+ cycles.
  def sum_at_cycle(c)
    ec = enum_size * c
    (-c + 2*ec - c**2)/2
  end

end
118
+
@@ -0,0 +1,67 @@
1
+ require 'huge_enumerable'
2
+ # HugePermutation is a HugeEnumerable style permutation. Comparable to Array#permutation.
3
+ # This class can be used to generate permutations of large arrays or anything else that responds to [].
4
+ # It is not necessary for the enumerable to be completely mapped into memory.
5
+ # It only has to be able to return the element mapped to the index given to [].
6
+ # ==== Examples
7
+ #
8
+ # Using HugePermutation directly:
9
+ #
10
+ # permutation = HugePermutation.new(('a'..'z').to_a, 2)
11
+ # permutation[0..4] # => [["a", "b"], ["a", "c"], ["a", "d"], ["a", "e"], ["a", "f"]]
12
+ # permutation[23..27] # => [["a", "y"], ["a", "z"], ["b", "a"], ["b", "c"], ["b", "d"]]
13
+ #
14
+ #
15
+ # Subclassing HugePermutation
16
+ #
17
+ # class SouthernNames < HugePermutation
18
+ #
19
+ # def initialize
20
+ # base_names = %w{Bill Joe Jo Bob Mary Lou Betty Sue Jimmy Ann Lee Ruby Jack Belle Daisy Dixie Lynn}
21
+ # super base_names, 2, nil, nil
22
+ # end
23
+ #
24
+ # private
25
+ #
26
+ # def fetch(index)
27
+ # "Your southern name is: #{super(index).join(' ')}"
28
+ # end
29
+ #
30
+ # end
31
+ #
32
+ # southern_name = SouthernNames.new
33
+ # southern_name[0] # => "Your southern name is: Bill Joe"
34
+ # southern_name[-1] # => "Your southern name is: Lynn Dixie"
35
+ # size = southern_name.size # => 272
36
+ # southern_name[size / 2] # => "Your southern name is: Jimmy Ann"
37
class HugePermutation < HugeCollection

  # Build a HugePermutation over +enumerable+.
  #
  # ==== Attributes
  #
  # * +enumerable+ - Any enumerable that responds to []
  # * +length+ - The number of elements per permutation to use from enumerable. (Currently only length 2 is supported)
  # * +max_array_size+ - The default size of arrays when #to_a is called. Optional positional argument.
  # * +rng+ - The random number generator to use. Optional positional argument.
  #
  # Raises NotImplementedError for any +length+ other than 2.
  def initialize(enumerable, length, max_array_size = nil, rng = nil)
    unless length == 2 # TODO: Extend this class to handle length N
      raise NotImplementedError, "Not yet implemented for any length != 2"
    end
    super(enumerable, max_array_size, rng)
  end

  private

  # Every element can be paired with each of the other (n - 1) elements.
  def collection_size
    element_count = enum_size
    element_count * (element_count - 1)
  end

  # Returns the ordered pair found at position +x+.
  #
  # Each first element owns a run of (n - 1) consecutive positions. Within a
  # run the second element walks the enumerable in order while skipping the
  # first element's own position; the modular arithmetic below performs that skip.
  def fetch(x)
    first_index = x / (enum_size - 1)
    wraps, offset = x.divmod(enum_size)
    second_index = (offset + (wraps + 1)) % enum_size
    [enum[first_index], enum[second_index]]
  end

end
@@ -0,0 +1,77 @@
1
+ require 'huge_enumerable'
2
+ # HugeProduct is a HugeEnumerable style product. Comparable to Array#product.
3
+ # This class can be used to generate products of large arrays or anything else that responds to [].
4
+ # It is not necessary for the enumerables to be completely mapped into memory.
5
+ # They only have to be able to return the element mapped to the index given to [].
6
+ # ==== Examples
7
+ #
8
+ # Using HugeProduct directly:
9
+ #
10
+ # product = HugeProduct.new(('a'..'z').to_a, ('A'..'Z').to_a)
11
+ # product[0..4] # => [["a", "A"], ["a", "B"], ["a", "C"], ["a", "D"], ["a", "E"]]
12
+ # product[23..27] # => [["a", "X"], ["a", "Y"], ["a", "Z"], ["b", "A"], ["b", "B"]]
13
+ #
14
+ #
15
+ # Subclassing HugeProduct
16
+ #
17
+ # class BabyGirlNames < HugeProduct
18
+ #
19
+ # def initialize
20
+ # first_names = %w{Emma Olivia Sophia Isabella Ava Mia Emily Charlotte Ella Amelia Abigail Madison Lily Chloe}
21
+ # middle_names = %w{Zoe Sophie Evelyn Aubrey Elizabeth Layla Anna Natalie Brooklyn Aria Audrey Ellie Lucy}
22
+ # super(first_names, middle_names)
23
+ # end
24
+ #
25
+ # private
26
+ #
27
+ # def fetch(index)
28
+ # super(index).join(' ')
29
+ # end
30
+ #
31
+ # end
32
+ #
33
+ # name = BabyGirlNames.new
34
+ # name[0] # => "Emma Zoe"
35
+ # name[-1] # => "Chloe Lucy"
36
+ # size = name.size # => 182
37
+ # name[size / 2] # => "Charlotte Zoe"
38
class HugeProduct < HugeEnumerable

  # Build a HugeProduct pairing every element of one enumerable with every element of another.
  #
  # ==== Attributes
  #
  # * +enumerable_a+ - Any enumerable that responds to []
  # * +enumerable_b+ - Any enumerable that responds to [] (This can be the same object as enumerable_a)
  # * +max_array_size+ - The default size of arrays when #to_a is called. Optional positional argument.
  # * +rng+ - The random number generator to use. Optional positional argument.
  def initialize(enumerable_a, enumerable_b, max_array_size = nil, rng = nil)
    # Both sources are stored before the parent initializer runs.
    @enum_a = enumerable_a
    @enum_b = enumerable_b
    super(max_array_size, rng)
  end

  private

  # The two source enumerables; readers and writers are private.
  attr_accessor :enum_a, :enum_b

  # Size of the first enumerable, asked for once and then cached.
  def enum_a_size
    @enum_a_size ||= enum_a.size
  end

  # Size of the second enumerable, asked for once and then cached.
  def enum_b_size
    @enum_b_size ||= enum_b.size
  end

  # One pair exists for every (a, b) combination.
  def collection_size
    enum_a_size * enum_b_size
  end

  # Returns the pair at position +x+. The second enumerable varies fastest:
  # the quotient by its size selects from the first enumerable and the
  # remainder selects from the second.
  def fetch(x)
    a_index, b_index = x.divmod(enum_b_size)
    [enum_a[a_index], enum_b[b_index]]
  end

end
@@ -0,0 +1,4 @@
1
class HugeEnumerable
  # Release version of the huge_enumerable gem; the gemspec reads it as spec.version.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end