map-reduce-ruby 1.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +6 -0
- data/CHANGELOG.md +31 -0
- data/Gemfile.lock +29 -28
- data/README.md +24 -7
- data/lib/map_reduce/mapper.rb +5 -2
- data/lib/map_reduce/mergeable.rb +12 -2
- data/lib/map_reduce/priority_queue.rb +23 -1
- data/lib/map_reduce/version.rb +1 -1
- data/map-reduce-ruby.gemspec +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4f3eeb7739b733f1abdf325ddfcf5a4c42fb257edd837952781246fcda9f48fb
|
4
|
+
data.tar.gz: f40eb08a341fc522c8f7043b74ac1c47fb00af491a107f51a6a76ca67f389629
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6e91d7a1f55f0b89d333b317ecedc7978ca29a709dfad403906f6f2835b121a0a49afb29d03f55d8d7c8aa5c1b70628400404b7bf4d590046ae2e66ed48d7abc
|
7
|
+
data.tar.gz: cc4524aec895d935b70548163e5818f8725c5a05985e55f9b8d9330c34491968bb3ec54f02466ac0748eeb130f47a40d07baefa2627b04ed10dbac1c9638b1af
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1 +1,32 @@
|
|
1
1
|
# CHANGELOG
|
2
|
+
|
3
|
+
## v2.1.0
|
4
|
+
|
5
|
+
* Do not reduce in `MapReduce::Mapper` when no `reduce` implementation is given
|
6
|
+
|
7
|
+
## v2.0.0
|
8
|
+
|
9
|
+
* [BREAKING] Keys are no longer automatically converted to json before using
|
10
|
+
them for sorting
|
11
|
+
* This allows for a proper semantic sort order for numeric keys in addition
|
12
|
+
to just the clustering of keys
|
13
|
+
* Examples of valid keys: `"key"`, `["foo", 1.0]`, `["foo", ["bar"]]`
|
14
|
+
* Examples of problematic keys: `nil`, `true`, `["foo", nil]`, `{ "foo" => "bar" }`
|
15
|
+
* For migration purposes it is recommended to convert your keys to and from
|
16
|
+
json manually if you have complex keys using `JSON.generate`/`JSON.parse`:
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
class WordCounter
|
20
|
+
def map(url)
|
21
|
+
HTTP.get(url).to_s.split.each do |word|
|
22
|
+
yield(JSON.generate("key" => word), 1) # if you use a hash for the key
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def reduce(json_key, count1, count2)
|
27
|
+
key = JSON.parse(json_key) # if you want to access the original key
|
28
|
+
|
29
|
+
count1 + count2
|
30
|
+
end
|
31
|
+
end
|
32
|
+
```
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
map-reduce-ruby (1.0
|
4
|
+
map-reduce-ruby (2.1.0)
|
5
5
|
json
|
6
6
|
lazy_priority_queue
|
7
7
|
|
@@ -9,41 +9,42 @@ GEM
|
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
11
|
ast (2.4.2)
|
12
|
-
diff-lcs (1.
|
13
|
-
json (2.
|
12
|
+
diff-lcs (1.5.0)
|
13
|
+
json (2.6.2)
|
14
14
|
lazy_priority_queue (0.1.1)
|
15
|
-
parallel (1.
|
16
|
-
parser (3.
|
15
|
+
parallel (1.22.1)
|
16
|
+
parser (3.1.2.1)
|
17
17
|
ast (~> 2.4.1)
|
18
|
-
rainbow (3.
|
19
|
-
regexp_parser (2.0
|
20
|
-
rexml (3.2.
|
21
|
-
rspec (3.
|
22
|
-
rspec-core (~> 3.
|
23
|
-
rspec-expectations (~> 3.
|
24
|
-
rspec-mocks (~> 3.
|
25
|
-
rspec-core (3.
|
26
|
-
rspec-support (~> 3.
|
27
|
-
rspec-expectations (3.
|
18
|
+
rainbow (3.1.1)
|
19
|
+
regexp_parser (2.5.0)
|
20
|
+
rexml (3.2.5)
|
21
|
+
rspec (3.11.0)
|
22
|
+
rspec-core (~> 3.11.0)
|
23
|
+
rspec-expectations (~> 3.11.0)
|
24
|
+
rspec-mocks (~> 3.11.0)
|
25
|
+
rspec-core (3.11.0)
|
26
|
+
rspec-support (~> 3.11.0)
|
27
|
+
rspec-expectations (3.11.1)
|
28
28
|
diff-lcs (>= 1.2.0, < 2.0)
|
29
|
-
rspec-support (~> 3.
|
30
|
-
rspec-mocks (3.
|
29
|
+
rspec-support (~> 3.11.0)
|
30
|
+
rspec-mocks (3.11.1)
|
31
31
|
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
-
rspec-support (~> 3.
|
33
|
-
rspec-support (3.
|
34
|
-
rubocop (
|
32
|
+
rspec-support (~> 3.11.0)
|
33
|
+
rspec-support (3.11.1)
|
34
|
+
rubocop (1.36.0)
|
35
|
+
json (~> 2.3)
|
35
36
|
parallel (~> 1.10)
|
36
|
-
parser (>= 2.
|
37
|
+
parser (>= 3.1.2.1)
|
37
38
|
rainbow (>= 2.2.2, < 4.0)
|
38
|
-
regexp_parser (>= 1.8)
|
39
|
-
rexml
|
40
|
-
rubocop-ast (>=
|
39
|
+
regexp_parser (>= 1.8, < 3.0)
|
40
|
+
rexml (>= 3.2.5, < 4.0)
|
41
|
+
rubocop-ast (>= 1.20.1, < 2.0)
|
41
42
|
ruby-progressbar (~> 1.7)
|
42
|
-
unicode-display_width (>= 1.4.0, <
|
43
|
-
rubocop-ast (1.
|
44
|
-
parser (>=
|
43
|
+
unicode-display_width (>= 1.4.0, < 3.0)
|
44
|
+
rubocop-ast (1.21.0)
|
45
|
+
parser (>= 3.1.1.0)
|
45
46
|
ruby-progressbar (1.11.0)
|
46
|
-
unicode-display_width (
|
47
|
+
unicode-display_width (2.3.0)
|
47
48
|
|
48
49
|
PLATFORMS
|
49
50
|
ruby
|
data/README.md
CHANGED
@@ -7,8 +7,7 @@ than memory map-reduce jobs by using your local disk and some arbitrary storage
|
|
7
7
|
layer like s3. You can specify how much memory you are willing to offer and
|
8
8
|
MapReduce will use its buffers accordingly. Finally, you can use your already
|
9
9
|
existing background job system like `sidekiq` or one of its various
|
10
|
-
alternatives.
|
11
|
-
serialized as json.
|
10
|
+
alternatives.
|
12
11
|
|
13
12
|
## Installation
|
14
13
|
|
@@ -30,9 +29,7 @@ Or install it yourself as:
|
|
30
29
|
|
31
30
|
Any map-reduce job consists of an implementation of your `map` function, your
|
32
31
|
`reduce` function and worker code. So let's start with an implementation for a
|
33
|
-
word count map-reduce task which fetches txt documents from the web.
|
34
|
-
note that your keys and values can be everything that can be serialized as
|
35
|
-
json, but nothing else.
|
32
|
+
word count map-reduce task which fetches txt documents from the web.
|
36
33
|
|
37
34
|
```ruby
|
38
35
|
class WordCounter
|
@@ -68,8 +65,8 @@ class WordCountMapper
|
|
68
65
|
end
|
69
66
|
```
|
70
67
|
|
71
|
-
Please note that `MapReduce::HashPartitioner.new(16)` states that we want
|
72
|
-
the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
|
68
|
+
Please note that `MapReduce::HashPartitioner.new(16)` states that we want to
|
69
|
+
split the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
|
73
70
|
worker code to run the reduce part:
|
74
71
|
|
75
72
|
```ruby
|
@@ -120,6 +117,26 @@ mappers are finished.
|
|
120
117
|
|
121
118
|
That's it.
|
122
119
|
|
120
|
+
## Limitations for Keys
|
121
|
+
|
122
|
+
You have to make sure that your keys are properly sortable in ruby. Please
|
123
|
+
note:
|
124
|
+
|
125
|
+
```ruby
|
126
|
+
"key" < nil # comparison of String with nil failed (ArgumentError)
|
127
|
+
|
128
|
+
false < true # undefined method `<' for false:FalseClass (NoMethodError)
|
129
|
+
|
130
|
+
1 > "key" # comparison of Integer with String failed (ArgumentError)
|
131
|
+
|
132
|
+
{ "key" => "value1" } < { "key" => "value2" } #=> false
|
133
|
+
{ "key" => "value1" } > { "key" => "value2" } #=> false
|
134
|
+
{ "key" => "value1" } <=> { "key" => "value2" } #=> nil
|
135
|
+
```
|
136
|
+
|
137
|
+
For those reasons, it is recommended to only use strings, numbers and arrays or
|
138
|
+
a combination of those.
|
139
|
+
|
123
140
|
## Internals
|
124
141
|
|
125
142
|
To fully understand the performance details, the following outlines the inner
|
data/lib/map_reduce/mapper.rb
CHANGED
@@ -71,7 +71,10 @@ module MapReduce
|
|
71
71
|
|
72
72
|
partitions = {}
|
73
73
|
|
74
|
-
|
74
|
+
chunk = k_way_merge(@chunks)
|
75
|
+
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
76
|
+
|
77
|
+
chunk.each do |pair|
|
75
78
|
partition = @partitioner.call(pair[0])
|
76
79
|
|
77
80
|
(partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
|
@@ -96,7 +99,7 @@ module MapReduce
|
|
96
99
|
def write_chunk
|
97
100
|
tempfile = Tempfile.new
|
98
101
|
|
99
|
-
@buffer.sort_by!
|
102
|
+
@buffer.sort_by!(&:first)
|
100
103
|
|
101
104
|
reduce_chunk(@buffer, @implementation).each do |pair|
|
102
105
|
tempfile.puts JSON.generate(pair)
|
data/lib/map_reduce/mergeable.rb
CHANGED
@@ -20,6 +20,16 @@ module MapReduce
|
|
20
20
|
def k_way_merge(files)
|
21
21
|
return enum_for(:k_way_merge, files) unless block_given?
|
22
22
|
|
23
|
+
if files.size == 1
|
24
|
+
files.first.each_line do |line|
|
25
|
+
yield(JSON.parse(line))
|
26
|
+
end
|
27
|
+
|
28
|
+
files.each(&:rewind)
|
29
|
+
|
30
|
+
return
|
31
|
+
end
|
32
|
+
|
23
33
|
queue = PriorityQueue.new
|
24
34
|
|
25
35
|
files.each_with_index do |file, index|
|
@@ -29,7 +39,7 @@ module MapReduce
|
|
29
39
|
|
30
40
|
key, value = JSON.parse(line)
|
31
41
|
|
32
|
-
queue.push([key, value, index],
|
42
|
+
queue.push([key, value, index], key)
|
33
43
|
end
|
34
44
|
|
35
45
|
loop do
|
@@ -45,7 +55,7 @@ module MapReduce
|
|
45
55
|
|
46
56
|
key, value = JSON.parse(line)
|
47
57
|
|
48
|
-
queue.push([key, value, index],
|
58
|
+
queue.push([key, value, index], key)
|
49
59
|
end
|
50
60
|
|
51
61
|
files.each(&:rewind)
|
@@ -1,4 +1,26 @@
|
|
1
1
|
module MapReduce
|
2
|
+
# Since LazyPriorityQueue is using <= and >=, but not <=>, it does not
|
3
|
+
# support sorting array keys. Therefore we wrap the keys in SortKey, which
|
4
|
+
# provides those operators. See https://bugs.ruby-lang.org/issues/5574
|
5
|
+
|
6
|
+
class SortKey
|
7
|
+
include Comparable
|
8
|
+
|
9
|
+
attr_reader :object
|
10
|
+
|
11
|
+
def initialize(object)
|
12
|
+
@object = object
|
13
|
+
end
|
14
|
+
|
15
|
+
def <=>(other)
|
16
|
+
res = object <=> other.object
|
17
|
+
|
18
|
+
raise(ArgumentError, "Unable to compare #{@object.inspect} with #{other.object.inspect}") if res.nil?
|
19
|
+
|
20
|
+
res
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
2
24
|
# The MapReduce::PriorityQueue implements a min priority queue using a
|
3
25
|
# binomial heap.
|
4
26
|
|
@@ -25,7 +47,7 @@ module MapReduce
|
|
25
47
|
# priority_queue.push("some object", "some key")
|
26
48
|
|
27
49
|
def push(object, key)
|
28
|
-
@queue.push([@sequence_number, object], key)
|
50
|
+
@queue.push([@sequence_number, object], SortKey.new(key))
|
29
51
|
|
30
52
|
@sequence_number += 1
|
31
53
|
end
|
data/lib/map_reduce/version.rb
CHANGED
data/map-reduce-ruby.gemspec
CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |spec|
|
|
7
7
|
spec.email = ["vetter@flakks.com"]
|
8
8
|
|
9
9
|
spec.summary = "The easiest way to write distributed, larger than memory map-reduce jobs"
|
10
|
-
spec.description = "The MapReduce gem is the easiest way to write custom, distributed, larger "\
|
10
|
+
spec.description = "The MapReduce gem is the easiest way to write custom, distributed, larger " \
|
11
11
|
"than memory map-reduce jobs"
|
12
12
|
spec.homepage = "https://github.com/mrkamel/map-reduce-ruby"
|
13
13
|
spec.license = "MIT"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: map-reduce-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -104,7 +104,7 @@ metadata:
|
|
104
104
|
homepage_uri: https://github.com/mrkamel/map-reduce-ruby
|
105
105
|
source_code_uri: https://github.com/mrkamel/map-reduce-ruby
|
106
106
|
changelog_uri: https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md
|
107
|
-
post_install_message:
|
107
|
+
post_install_message:
|
108
108
|
rdoc_options: []
|
109
109
|
require_paths:
|
110
110
|
- lib
|
@@ -119,8 +119,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
119
|
- !ruby/object:Gem::Version
|
120
120
|
version: '0'
|
121
121
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
123
|
-
signing_key:
|
122
|
+
rubygems_version: 3.3.3
|
123
|
+
signing_key:
|
124
124
|
specification_version: 4
|
125
125
|
summary: The easiest way to write distributed, larger than memory map-reduce jobs
|
126
126
|
test_files: []
|