map-reduce-ruby 2.1.1 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +4 -1
- data/CHANGELOG.md +15 -0
- data/Gemfile.lock +1 -1
- data/README.md +11 -5
- data/lib/map_reduce/mapper.rb +57 -33
- data/lib/map_reduce/mergeable.rb +54 -12
- data/lib/map_reduce/reducer.rb +15 -19
- data/lib/map_reduce/version.rb +1 -1
- data/lib/map_reduce.rb +4 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a794ca527320099e74e49e3f3abe1a19a6fc6df186eb04a822c926af5ae4985c
|
4
|
+
data.tar.gz: f8e1b77d7e3fbcf7171aa1fce94273a21fef45d9d76f8768b07ba450c3d51bdb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 79c9b21cd9fd8c586fc61b0b9cc9dd3ddee039fdce2c355965e06309a17436aff28932b68647b0b4ebe90e59208dee0dd4ecf7f83dc06a5216c6da4dea3eb00f
|
7
|
+
data.tar.gz: 22bae4b877451ee66fdcdcd0ff0861caf65a41cab5e3b77c03370a4bb172c5d381f7623ab2f91516875e060c54eec3a8ac9063edeffe9e167a9a3390e25f8e62
|
data/.rubocop.yml
CHANGED
@@ -45,7 +45,7 @@ Style/StringLiteralsInInterpolation:
|
|
45
45
|
EnforcedStyle: double_quotes
|
46
46
|
|
47
47
|
Layout/LineLength:
|
48
|
-
Max:
|
48
|
+
Max: 250
|
49
49
|
|
50
50
|
Style/FrozenStringLiteralComment:
|
51
51
|
EnforcedStyle: never
|
@@ -55,3 +55,6 @@ Style/ObjectThen:
|
|
55
55
|
|
56
56
|
Gemspec/RequireMFA:
|
57
57
|
Enabled: false
|
58
|
+
|
59
|
+
Style/HashTransformValues:
|
60
|
+
Enabled: false
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,20 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## v3.0.0
|
4
|
+
|
5
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` now yields a hash of (partition, path)
|
6
|
+
pairs, which e.g. allows uploading the files in parallel
|
7
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` now requires a `chunk_limit`. This
|
8
|
+
allows further limiting the maximum number of open file descriptors
|
9
|
+
* [BREAKING] `MapReduce::Mapper#shuffle` no longer returns an `Enumerator` when
|
10
|
+
no block is given
|
11
|
+
* [BREAKING] `MapReduce::Reducer::InvalidChunkLimit` is now
|
12
|
+
`MapReduce::InvalidChunkLimit` and inherits from `MapReduce::Error`, which is the
|
13
|
+
base class for all errors
|
14
|
+
* `MapReduce::Mapper#shuffle` no longer keeps all partition files open. Instead,
|
15
|
+
it writes them one after another to further reduce the number of
|
16
|
+
open file descriptors.
|
17
|
+
|
3
18
|
## v2.1.1
|
4
19
|
|
5
20
|
* Fix in `MapReduce::Mapper` when no `reduce` implementation is given
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -57,9 +57,11 @@ class WordCountMapper
|
|
57
57
|
mapper = MapReduce::Mapper.new(WordCounter.new, partitioner: MapReduce::HashPartitioner.new(16), memory_limit: 100.megabytes)
|
58
58
|
mapper.map(url)
|
59
59
|
|
60
|
-
mapper.shuffle do |
|
61
|
-
|
62
|
-
|
60
|
+
mapper.shuffle(chunk_limit: 64) do |partitions|
|
61
|
+
partitions.each do |partition, path|
|
62
|
+
# store content of the tempfile located at path e.g. on s3:
|
63
|
+
bucket.object("map_reduce/jobs/#{job_id}/partitions/#{partition}/chunk.#{mapper_id}.json").put(body: File.open(path))
|
64
|
+
end
|
63
65
|
end
|
64
66
|
end
|
65
67
|
end
|
@@ -205,6 +207,10 @@ interface of callables, could even be expressed as a simple one-liner:
|
|
205
207
|
MyPartitioner = proc { |key| Digest::SHA1.hexdigest(JSON.generate(key))[0..4].to_i(16) % 8 }
|
206
208
|
```
|
207
209
|
|
210
|
+
## Semantic Versioning
|
211
|
+
|
212
|
+
MapReduce uses Semantic Versioning: [SemVer](https://semver.org/)
|
213
|
+
|
208
214
|
## Development
|
209
215
|
|
210
216
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
@@ -224,5 +230,5 @@ https://github.com/mrkamel/map-reduce-ruby
|
|
224
230
|
|
225
231
|
## License
|
226
232
|
|
227
|
-
The gem is available as open source under the terms of the
|
228
|
-
License](https://opensource.org/licenses/MIT).
|
233
|
+
The gem is available as open source under the terms of the
|
234
|
+
[MIT License](https://opensource.org/licenses/MIT).
|
data/lib/map_reduce/mapper.rb
CHANGED
@@ -6,8 +6,6 @@ module MapReduce
|
|
6
6
|
include Reduceable
|
7
7
|
include MonitorMixin
|
8
8
|
|
9
|
-
attr_reader :partitions
|
10
|
-
|
11
9
|
# Initializes a new mapper.
|
12
10
|
#
|
13
11
|
# @param implementation Your map-reduce implementation, i.e. an object
|
@@ -45,9 +43,11 @@ module MapReduce
|
|
45
43
|
def map(*args, **kwargs)
|
46
44
|
@implementation.map(*args, **kwargs) do |new_key, new_value|
|
47
45
|
synchronize do
|
48
|
-
@
|
46
|
+
partition = @partitioner.call(new_key)
|
47
|
+
item = [[partition, new_key], new_value]
|
49
48
|
|
50
|
-
@
|
49
|
+
@buffer.push(item)
|
50
|
+
@buffer_size += JSON.generate(item).bytesize
|
51
51
|
|
52
52
|
write_chunk if @buffer_size >= @memory_limit
|
53
53
|
end
|
@@ -55,62 +55,86 @@ module MapReduce
|
|
55
55
|
end
|
56
56
|
|
57
57
|
# Performs a k-way-merge of the sorted chunks written to tempfiles while
|
58
|
-
# already reducing the result using your map-reduce implementation
|
59
|
-
# splitting the dataset into partitions. Finally yields
|
60
|
-
#
|
58
|
+
# already reducing the result using your map-reduce implementation (if
|
59
|
+
# available) and splitting the dataset into partitions. Finally yields a
|
60
|
+
# hash of (partition, path) pairs containing the data for the partitions
|
61
|
+
# in tempfiles.
|
62
|
+
#
|
63
|
+
# @param chunk_limit [Integer] The maximum number of files to process
|
64
|
+
# at the same time. Most useful when you run on a system where the
|
65
|
+
# number of open file descriptors is limited. If your number of file
|
66
|
+
# descriptors is unlimited, you want to set it to a higher number to
|
67
|
+
# avoid the overhead of multiple runs.
|
61
68
|
#
|
62
69
|
# @example
|
63
|
-
# mapper.shuffle do |
|
64
|
-
#
|
70
|
+
# mapper.shuffle(chunk_limit: 64) do |partitions|
|
71
|
+
# partitions.each do |partition, path|
|
72
|
+
# # store data e.g. on s3
|
73
|
+
# end
|
65
74
|
# end
|
66
75
|
|
67
|
-
def shuffle(
|
68
|
-
|
76
|
+
def shuffle(chunk_limit:)
|
77
|
+
raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
|
69
78
|
|
70
|
-
|
79
|
+
begin
|
80
|
+
write_chunk if @buffer_size > 0
|
71
81
|
|
72
|
-
|
82
|
+
chunk = k_way_merge(@chunks, chunk_limit: chunk_limit)
|
83
|
+
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
73
84
|
|
74
|
-
|
75
|
-
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
85
|
+
partitions = split_chunk(chunk)
|
76
86
|
|
77
|
-
|
78
|
-
|
87
|
+
yield(partitions.transform_values(&:path))
|
88
|
+
ensure
|
89
|
+
partitions.each_value(&:delete)
|
79
90
|
|
80
|
-
|
91
|
+
@chunks.each(&:delete)
|
92
|
+
@chunks = []
|
81
93
|
end
|
82
94
|
|
83
|
-
|
84
|
-
|
95
|
+
nil
|
96
|
+
end
|
97
|
+
|
98
|
+
private
|
85
99
|
|
86
|
-
|
100
|
+
def split_chunk(chunk)
|
101
|
+
res = {}
|
102
|
+
current_partition = nil
|
103
|
+
file = nil
|
87
104
|
|
88
|
-
|
89
|
-
|
105
|
+
chunk.each do |((new_partition, key), value)|
|
106
|
+
if new_partition != current_partition
|
107
|
+
file&.close
|
108
|
+
|
109
|
+
current_partition = new_partition
|
110
|
+
temp_path = TempPath.new
|
111
|
+
res[new_partition] = temp_path
|
112
|
+
file = File.open(temp_path.path, "w+")
|
113
|
+
end
|
114
|
+
|
115
|
+
file.puts(JSON.generate([key, value]))
|
90
116
|
end
|
91
117
|
|
92
|
-
|
118
|
+
file&.close
|
93
119
|
|
94
|
-
|
120
|
+
res
|
95
121
|
end
|
96
122
|
|
97
|
-
private
|
98
|
-
|
99
123
|
def write_chunk
|
100
|
-
|
124
|
+
temp_path = TempPath.new
|
101
125
|
|
102
126
|
@buffer.sort_by!(&:first)
|
103
127
|
|
104
128
|
chunk = @buffer
|
105
129
|
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
106
130
|
|
107
|
-
|
108
|
-
|
131
|
+
File.open(temp_path.path, "w+") do |file|
|
132
|
+
chunk.each do |pair|
|
133
|
+
file.puts JSON.generate(pair)
|
134
|
+
end
|
109
135
|
end
|
110
136
|
|
111
|
-
|
112
|
-
|
113
|
-
@chunks.push(tempfile)
|
137
|
+
@chunks.push(temp_path)
|
114
138
|
|
115
139
|
@buffer_size = 0
|
116
140
|
@buffer = []
|
data/lib/map_reduce/mergeable.rb
CHANGED
@@ -5,20 +5,62 @@ module MapReduce
|
|
5
5
|
module Mergeable
|
6
6
|
private
|
7
7
|
|
8
|
-
# Performs the k-way-merge of the passed files
|
9
|
-
# a binomial heap. The content of the passed
|
10
|
-
# starts by reading one item of each file and
|
11
|
-
# queue. Afterwards, it continously pops an item
|
12
|
-
# and reads a new item from the file the popped
|
13
|
-
# read item to the queue. This continues up
|
14
|
-
# have been read. This guarantees that the
|
15
|
-
# sorted without having all items in-memory.
|
8
|
+
# Performs the k-way-merge of the passed files referenced by the temp paths
|
9
|
+
# using a priority queue using a binomial heap. The content of the passed
|
10
|
+
# files needs to be sorted. It starts by reading one item of each file and
|
11
|
+
# adding it to the priority queue. Afterwards, it continuously pops an item
|
12
|
+
# from the queue, yields it and reads a new item from the file the popped
|
13
|
+
# item belongs to, adding the read item to the queue. This continues up
|
14
|
+
# until all items from the files have been read. This guarantees that the
|
15
|
+
# yielded key-value pairs are sorted without having all items in-memory.
|
16
16
|
#
|
17
|
-
# @param
|
18
|
-
# content of the files must be sorted.
|
17
|
+
# @param temp_paths [Array<TempPath>] The files referenced by the temp paths to
|
18
|
+
# run the k-way-merge for. The content of the files must be sorted.
|
19
|
+
# @param chunk_limit [Integer] The maximum number of files to process
|
20
|
+
# at the same time. Most useful when you run on a system where the
|
21
|
+
# number of open file descriptors is limited. If your number of file
|
22
|
+
# descriptors is unlimited, you want to set it to a higher number to
|
23
|
+
# avoid the overhead of multiple runs.
|
19
24
|
|
20
|
-
def k_way_merge(
|
21
|
-
return enum_for(
|
25
|
+
def k_way_merge(temp_paths, chunk_limit:, &block)
|
26
|
+
return enum_for(__method__, temp_paths, chunk_limit: chunk_limit) unless block_given?
|
27
|
+
|
28
|
+
dupped_temp_paths = temp_paths.dup
|
29
|
+
additional_temp_paths = []
|
30
|
+
|
31
|
+
while dupped_temp_paths.size > chunk_limit
|
32
|
+
temp_path_out = TempPath.new
|
33
|
+
|
34
|
+
File.open(temp_path_out.path, "w+") do |file|
|
35
|
+
files = dupped_temp_paths.shift(chunk_limit).map { |temp_path| File.open(temp_path.path, "r") }
|
36
|
+
|
37
|
+
k_way_merge!(files) do |pair|
|
38
|
+
file.puts(JSON.generate(pair))
|
39
|
+
end
|
40
|
+
|
41
|
+
files.each(&:close)
|
42
|
+
end
|
43
|
+
|
44
|
+
dupped_temp_paths.push(temp_path_out)
|
45
|
+
additional_temp_paths.push(temp_path_out)
|
46
|
+
end
|
47
|
+
|
48
|
+
files = dupped_temp_paths.map { |temp_path| File.open(temp_path.path, "r") }
|
49
|
+
k_way_merge!(files, &block)
|
50
|
+
files.each(&:close)
|
51
|
+
|
52
|
+
nil
|
53
|
+
ensure
|
54
|
+
additional_temp_paths&.each(&:delete)
|
55
|
+
end
|
56
|
+
|
57
|
+
# Performs the actual k-way-merge of the specified files.
|
58
|
+
#
|
59
|
+
# @param files [Array<IO, Tempfile>] The files to run the k-way-merge for.
|
60
|
+
# The content of the files must be sorted.
|
61
|
+
|
62
|
+
def k_way_merge!(files)
|
63
|
+
return enum_for(__method__, files) unless block_given?
|
22
64
|
|
23
65
|
if files.size == 1
|
24
66
|
files.first.each_line do |line|
|
data/lib/map_reduce/reducer.rb
CHANGED
@@ -6,8 +6,6 @@ module MapReduce
|
|
6
6
|
include Reduceable
|
7
7
|
include MonitorMixin
|
8
8
|
|
9
|
-
class InvalidChunkLimit < StandardError; end
|
10
|
-
|
11
9
|
# Initializes a new reducer.
|
12
10
|
#
|
13
11
|
# @param implementation Your map-reduce implementation, i.e. an object
|
@@ -70,38 +68,36 @@ module MapReduce
|
|
70
68
|
# end
|
71
69
|
|
72
70
|
def reduce(chunk_limit:, &block)
|
73
|
-
return enum_for(
|
71
|
+
return enum_for(__method__, chunk_limit: chunk_limit) unless block_given?
|
74
72
|
|
75
73
|
raise(InvalidChunkLimit, "Chunk limit must be >= 2") unless chunk_limit >= 2
|
76
74
|
|
77
75
|
begin
|
78
76
|
loop do
|
79
77
|
slice = @temp_paths.shift(chunk_limit)
|
80
|
-
files = slice.select { |temp_path| File.exist?(temp_path.path) }
|
81
|
-
.map { |temp_path| File.open(temp_path.path, "r") }
|
82
|
-
|
83
|
-
begin
|
84
|
-
if @temp_paths.empty?
|
85
|
-
reduce_chunk(k_way_merge(files), @implementation).each do |pair|
|
86
|
-
block.call(pair)
|
87
|
-
end
|
88
78
|
|
89
|
-
|
79
|
+
if @temp_paths.empty?
|
80
|
+
reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
|
81
|
+
block.call(pair)
|
90
82
|
end
|
91
83
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
84
|
+
return
|
85
|
+
end
|
86
|
+
|
87
|
+
File.open(add_chunk, "w+") do |file|
|
88
|
+
reduce_chunk(k_way_merge(slice, chunk_limit: chunk_limit), @implementation).each do |pair|
|
89
|
+
file.puts JSON.generate(pair)
|
96
90
|
end
|
97
|
-
ensure
|
98
|
-
files.each(&:close)
|
99
|
-
slice.each(&:delete)
|
100
91
|
end
|
92
|
+
ensure
|
93
|
+
slice&.each(&:delete)
|
101
94
|
end
|
102
95
|
ensure
|
103
96
|
@temp_paths.each(&:delete)
|
97
|
+
@temp_paths = []
|
104
98
|
end
|
99
|
+
|
100
|
+
nil
|
105
101
|
end
|
106
102
|
end
|
107
103
|
end
|
data/lib/map_reduce/version.rb
CHANGED
data/lib/map_reduce.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: map-reduce-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|