map-reduce-ruby 2.1.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -1
- data/CHANGELOG.md +19 -0
- data/Gemfile.lock +1 -1
- data/README.md +11 -5
- data/lib/map_reduce/mapper.rb +60 -33
- data/lib/map_reduce/mergeable.rb +54 -12
- data/lib/map_reduce/reducer.rb +15 -19
- data/lib/map_reduce/version.rb +1 -1
- data/lib/map_reduce.rb +4 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a794ca527320099e74e49e3f3abe1a19a6fc6df186eb04a822c926af5ae4985c
|
4
|
+
data.tar.gz: f8e1b77d7e3fbcf7171aa1fce94273a21fef45d9d76f8768b07ba450c3d51bdb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 79c9b21cd9fd8c586fc61b0b9cc9dd3ddee039fdce2c355965e06309a17436aff28932b68647b0b4ebe90e59208dee0dd4ecf7f83dc06a5216c6da4dea3eb00f
|
7
|
+
data.tar.gz: 22bae4b877451ee66fdcdcd0ff0861caf65a41cab5e3b77c03370a4bb172c5d381f7623ab2f91516875e060c54eec3a8ac9063edeffe9e167a9a3390e25f8e62
|
data/.rubocop.yml
CHANGED
@@ -45,7 +45,7 @@ Style/StringLiteralsInInterpolation:
|
|
45
45
|
EnforcedStyle: double_quotes
|
46
46
|
|
47
47
|
Layout/LineLength:
|
48
|
-
Max:
|
48
|
+
Max: 250
|
49
49
|
|
50
50
|
Style/FrozenStringLiteralComment:
|
51
51
|
EnforcedStyle: never
|
@@ -55,3 +55,6 @@ Style/ObjectThen:
|
|
55
55
|
|
56
56
|
Gemspec/RequireMFA:
|
57
57
|
Enabled: false
|
58
|
+
|
59
|
+
Style/HashTransformValues:
|
60
|
+
Enabled: false
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,24 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## v3.0.0
|
4
|
+
|
5
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` now yields a hash of (partition, path)
|
6
|
+
pairs, which e.g. allows uploading the files in parallel
|
7
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` now requires a `chunk_limit`. This
|
8
|
+
allows further limiting the maximum number of open file descriptors
|
9
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` no longer returns an `Enumerator` when
|
10
|
+
no block is given
|
11
|
+
* [BREAKING] `MapReduce::Reducer::InvalidChunkLimit` is now
|
12
|
+
`MapReduce::InvalidChunkLimit` and inherits from `MapReduce::Error` being the
|
13
|
+
base class for all errors
|
14
|
+
* `MapReduce::Mapper#shuffle` no longer keeps all partition files open. Instead,
|
15
|
+
it writes them one after another to further strictly reduce the number of
|
16
|
+
open file descriptors.
|
17
|
+
|
18
|
+
## v2.1.1
|
19
|
+
|
20
|
+
* Fix in `MapReduce::Mapper` when no `reduce` implementation is given
|
21
|
+
|
3
22
|
## v2.1.0
|
4
23
|
|
5
24
|
* Do not reduce in `MapReduce::Mapper` when no `reduce` implementation is given
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -57,9 +57,11 @@ class WordCountMapper
|
|
57
57
|
mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
|
58
58
|
mapper.map(url)
|
59
59
|
|
60
|
-
mapper.shuffle do |
|
61
|
-
|
62
|
-
|
60
|
+
mapper.shuffle(chunk_limit: 64) do |partitions|
|
61
|
+
partitions.each do |partition, path|
|
62
|
+
# store content of the tempfile located at path e.g. on s3:
|
63
|
+
bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: File.open(path))
|
64
|
+
end
|
63
65
|
end
|
64
66
|
end
|
65
67
|
end
|
@@ -205,6 +207,10 @@ interface of callables, could even be expressed as a simple one-liner:
|
|
205
207
|
MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
|
206
208
|
```
|
207
209
|
|
210
|
+
## Semantic Versioning
|
211
|
+
|
212
|
+
MapReduce uses Semantic Versioning: [SemVer](http://semver.org/)
|
213
|
+
|
208
214
|
## Development
|
209
215
|
|
210
216
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
@@ -224,5 +230,5 @@ https://github.com/mrkamel/map-reduce-ruby
|
|
224
230
|
|
225
231
|
## License
|
226
232
|
|
227
|
-
The gem is available as open source under the terms of the
|
228
|
-
License](https://opensource.org/licenses/MIT).
|
233
|
+
The gem is available as open source under the terms of the
|
234
|
+
[MIT License](https://opensource.org/licenses/MIT).
|
data/lib/map_reduce/mapper.rb
CHANGED
@@ -6,8 +6,6 @@ module MapReduce
|
|
6
6
|
include Reduceable
|
7
7
|
include MonitorMixin
|
8
8
|
|
9
|
-
attr_reader :partitions
|
10
|
-
|
11
9
|
# Initializes a new mapper.
|
12
10
|
#
|
13
11
|
# @param implementation Your map-reduce implementation, i.e. an object
|
@@ -45,9 +43,11 @@ module MapReduce
|
|
45
43
|
def map(*args, **kwargs)
|
46
44
|
@implementation.map(*args, **kwargs) do |new_key, new_value|
|
47
45
|
synchronize do
|
48
|
-
@
|
46
|
+
partition = @partitioner.call(new_key)
|
47
|
+
item = [[partition, new_key], new_value]
|
49
48
|
|
50
|
-
@
|
49
|
+
@buffer.push(item)
|
50
|
+
@buffer_size += JSON.generate(item).bytesize
|
51
51
|
|
52
52
|
write_chunk if @buffer_size >= @memory_limit
|
53
53
|
end
|
@@ -55,59 +55,86 @@ module MapReduce
|
|
55
55
|
end
|
56
56
|
|
57
57
|
# Performs a k-way-merge of the sorted chunks written to tempfiles while
|
58
|
-
# already reducing the result using your map-reduce implementation
|
59
|
-
# splitting the dataset into partitions. Finally yields
|
60
|
-
#
|
58
|
+
# already reducing the result using your map-reduce implementation (if
|
59
|
+
# available) and splitting the dataset into partitions. Finally yields a
|
60
|
+
# hash of (partition, path) pairs containing the data for the partitions
|
61
|
+
# in tempfiles.
|
62
|
+
#
|
63
|
+
# @param chunk_limit [Integer] The maximum number of files to process
|
64
|
+
# at the same time. Most useful when you run on a system where the
|
65
|
+
# number of open file descriptors is limited. If your number of file
|
66
|
+
# descriptors is unlimited, you want to set it to a higher number to
|
67
|
+
# avoid the overhead of multiple runs.
|
61
68
|
#
|
62
69
|
# @example
|
63
|
-
# mapper.shuffle do |
|
64
|
-
#
|
70
|
+
# mapper.shuffle do |partitions|
|
71
|
+
# partitions.each do |partition, path|
|
72
|
+
# # store data e.g. on s3
|
73
|
+
# end
|
65
74
|
# end
|
66
75
|
|
67
|
-
def shuffle(
|
68
|
-
|
76
|
+
def shuffle(chunk_limit:)
|
77
|
+
raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
|
69
78
|
|
70
|
-
|
79
|
+
begin
|
80
|
+
write_chunk if @buffer_size > 0
|
71
81
|
|
72
|
-
|
82
|
+
chunk = k_way_merge(@chunks, chunk_limit: chunk_limit)
|
83
|
+
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
73
84
|
|
74
|
-
|
75
|
-
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
85
|
+
partitions = split_chunk(chunk)
|
76
86
|
|
77
|
-
|
78
|
-
|
87
|
+
yield(partitions.transform_values(&:path))
|
88
|
+
ensure
|
89
|
+
partitions.each_value(&:delete)
|
79
90
|
|
80
|
-
|
91
|
+
@chunks.each(&:delete)
|
92
|
+
@chunks = []
|
81
93
|
end
|
82
94
|
|
83
|
-
|
84
|
-
|
95
|
+
nil
|
96
|
+
end
|
97
|
+
|
98
|
+
private
|
99
|
+
|
100
|
+
def split_chunk(chunk)
|
101
|
+
res = {}
|
102
|
+
current_partition = nil
|
103
|
+
file = nil
|
85
104
|
|
86
|
-
|
105
|
+
chunk.each do |((new_partition, key), value)|
|
106
|
+
if new_partition != current_partition
|
107
|
+
file&.close
|
87
108
|
|
88
|
-
|
89
|
-
|
109
|
+
current_partition = new_partition
|
110
|
+
temp_path = TempPath.new
|
111
|
+
res[new_partition] = temp_path
|
112
|
+
file = File.open(temp_path.path, "w+")
|
113
|
+
end
|
114
|
+
|
115
|
+
file.puts(JSON.generate([key, value]))
|
90
116
|
end
|
91
117
|
|
92
|
-
|
118
|
+
file&.close
|
93
119
|
|
94
|
-
|
120
|
+
res
|
95
121
|
end
|
96
122
|
|
97
|
-
private
|
98
|
-
|
99
123
|
def write_chunk
|
100
|
-
|
124
|
+
temp_path = TempPath.new
|
101
125
|
|
102
126
|
@buffer.sort_by!(&:first)
|
103
127
|
|
104
|
-
|
105
|
-
|
106
|
-
end
|
128
|
+
chunk = @buffer
|
129
|
+
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
107
130
|
|
108
|
-
|
131
|
+
File.open(temp_path.path, "w+") do |file|
|
132
|
+
chunk.each do |pair|
|
133
|
+
file.puts JSON.generate(pair)
|
134
|
+
end
|
135
|
+
end
|
109
136
|
|
110
|
-
@chunks.push(
|
137
|
+
@chunks.push(temp_path)
|
111
138
|
|
112
139
|
@buffer_size = 0
|
113
140
|
@buffer = []
|
data/lib/map_reduce/mergeable.rb
CHANGED
@@ -5,20 +5,62 @@ module MapReduce
|
|
5
5
|
module Mergeable
|
6
6
|
private
|
7
7
|
|
8
|
-
# Performs the k-way-merge of the passed files
|
9
|
-
# a binomial heap. The content of the passed
|
10
|
-
# starts by reading one item of each file and
|
11
|
-
# queue. Afterwards, it continously pops an item
|
12
|
-
# and reads a new item from the file the popped
|
13
|
-
# read item to the queue. This continues up
|
14
|
-
# have been read. This guarantees that the
|
15
|
-
# sorted without having all items in-memory.
|
8
|
+
# Performs the k-way-merge of the passed files referenced by the temp paths
|
9
|
+
# using a priority queue using a binomial heap. The content of the passed
|
10
|
+
# files needs to be sorted. It starts by reading one item of each file and
|
11
|
+
# adding it to the priority queue. Afterwards, it continuously pops an item
|
12
|
+
# from the queue, yields it and reads a new item from the file the popped
|
13
|
+
# item belongs to, adding the read item to the queue. This continues up
|
14
|
+
# until all items from the files have been read. This guarantees that the
|
15
|
+
# yielded key-value pairs are sorted without having all items in-memory.
|
16
16
|
#
|
17
|
-
# @param
|
18
|
-
# content of the files must be sorted.
|
17
|
+
# @param temp_paths [TempPath] The files referenced by the temp paths to
|
18
|
+
# run the k-way-merge for. The content of the files must be sorted.
|
19
|
+
# @param chunk_limit [Integer] The maximum number of files to process
|
20
|
+
# at the same time. Most useful when you run on a system where the
|
21
|
+
# number of open file descriptors is limited. If your number of file
|
22
|
+
# descriptors is unlimited, you want to set it to a higher number to
|
23
|
+
# avoid the overhead of multiple runs.
|
19
24
|
|
20
|
-
def k_way_merge(
|
21
|
-
return enum_for(
|
25
|
+
def k_way_merge(temp_paths, chunk_limit:, &block)
|
26
|
+
return enum_for(__method__, temp_paths, chunk_limit: chunk_limit) unless block_given?
|
27
|
+
|
28
|
+
dupped_temp_paths = temp_paths.dup
|
29
|
+
additional_temp_paths = []
|
30
|
+
|
31
|
+
while dupped_temp_paths.size > chunk_limit
|
32
|
+
temp_path_out = TempPath.new
|
33
|
+
|
34
|
+
File.open(temp_path_out.path, "w+") do |file|
|
35
|
+
files = dupped_temp_paths.shift(chunk_limit).map { |temp_path| File.open(temp_path.path, "r") }
|
36
|
+
|
37
|
+
k_way_merge!(files) do |pair|
|
38
|
+
file.puts(JSON.generate(pair))
|
39
|
+
end
|
40
|
+
|
41
|
+
files.each(&:close)
|
42
|
+
end
|
43
|
+
|
44
|
+
dupped_temp_paths.push(temp_path_out)
|
45
|
+
additional_temp_paths.push(temp_path_out)
|
46
|
+
end
|
47
|
+
|
48
|
+
files = dupped_temp_paths.map { |temp_path| File.open(temp_path.path, "r") }
|
49
|
+
k_way_merge!(files, &block)
|
50
|
+
files.each(&:close)
|
51
|
+
|
52
|
+
nil
|
53
|
+
ensure
|
54
|
+
additional_temp_paths&.each(&:delete)
|
55
|
+
end
|
56
|
+
|
57
|
+
# Performs the actual k-way-merge of the specified files.
|
58
|
+
#
|
59
|
+
# @param files [IO, Tempfile] The files to run the k-way-merge for.
|
60
|
+
# The content of the files must be sorted.
|
61
|
+
|
62
|
+
def k_way_merge!(files)
|
63
|
+
return enum_for(__method__, files) unless block_given?
|
22
64
|
|
23
65
|
if files.size == 1
|
24
66
|
files.first.each_line do |line|
|
data/lib/map_reduce/reducer.rb
CHANGED
@@ -6,8 +6,6 @@ module MapReduce
|
|
6
6
|
include Reduceable
|
7
7
|
include MonitorMixin
|
8
8
|
|
9
|
-
class InvalidChunkLimit < StandardError; end
|
10
|
-
|
11
9
|
# Initializes a new reducer.
|
12
10
|
#
|
13
11
|
# @param implementation Your map-reduce implementation, i.e. an object
|
@@ -70,38 +68,36 @@ module MapReduce
|
|
70
68
|
# end
|
71
69
|
|
72
70
|
def reduce(chunk_limit:, &block)
|
73
|
-
return enum_for(
|
71
|
+
return enum_for(__method__, chunk_limit: chunk_limit) unless block_given?
|
74
72
|
|
75
73
|
raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
|
76
74
|
|
77
75
|
begin
|
78
76
|
loop do
|
79
77
|
slice = @temp_paths.shift(chunk_limit)
|
80
|
-
files = slice.select { |temp_path| File.exist?(temp_path.path) }
|
81
|
-
.map { |temp_path| File.open(temp_path.path, "r") }
|
82
|
-
|
83
|
-
begin
|
84
|
-
if @temp_paths.empty?
|
85
|
-
reduce_chunk(k_way_merge(files), @implementation).each do |pair|
|
86
|
-
block.call(pair)
|
87
|
-
end
|
88
78
|
|
89
|
-
|
79
|
+
if @temp_paths.empty?
|
80
|
+
reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
|
81
|
+
block.call(pair)
|
90
82
|
end
|
91
83
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
84
|
+
return
|
85
|
+
end
|
86
|
+
|
87
|
+
File.open(add_chunk, "w+") do |file|
|
88
|
+
reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
|
89
|
+
file.puts JSON.generate(pair)
|
96
90
|
end
|
97
|
-
ensure
|
98
|
-
files.each(&:close)
|
99
|
-
slice.each(&:delete)
|
100
91
|
end
|
92
|
+
ensure
|
93
|
+
slice&.each(&:delete)
|
101
94
|
end
|
102
95
|
ensure
|
103
96
|
@temp_paths.each(&:delete)
|
97
|
+
@temp_paths = []
|
104
98
|
end
|
99
|
+
|
100
|
+
nil
|
105
101
|
end
|
106
102
|
end
|
107
103
|
end
|
data/lib/map_reduce/version.rb
CHANGED
data/lib/map_reduce.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: map-reduce-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|