map-reduce-ruby 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +6 -0
- data/CHANGELOG.md +27 -0
- data/Gemfile.lock +29 -28
- data/README.md +20 -0
- data/lib/map_reduce/mapper.rb +1 -1
- data/lib/map_reduce/mergeable.rb +2 -2
- data/lib/map_reduce/priority_queue.rb +23 -1
- data/lib/map_reduce/version.rb +1 -1
- data/map-reduce-ruby.gemspec +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 68967e02da6776738d27e48228374396dc613f4ee4a48f08268b71c92764eaa0
|
4
|
+
data.tar.gz: 0f0633adb6f2c51617ea3234284b30433da4a9ceb25d167ce8eaef2527dc09db
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d27e08793121dd81d4f8cc887f350d30ec9ea4039dad3490a4c7969164413cf575bdba0aa4c767097fc4af124aa3bb17b55f6781ff03919ef7d36164b3b14fa6
|
7
|
+
data.tar.gz: 71165ebbab647370de51c709b34f8e5cc8348300a41809354472ac125ef542255ec767a4613046cb964443b698fa7ec5dc8334d40df2a64728822aab0b170b43
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1 +1,28 @@
|
|
1
1
|
# CHANGELOG
|
2
|
+
|
3
|
+
## v2.0.0
|
4
|
+
|
5
|
+
* [BREAKING] Keys are no longer automatically converted to json before using
|
6
|
+
them for sorting
|
7
|
+
* This allows for a proper semantic sort order for numeric keys in addition
|
8
|
+
to just the clustering of keys
|
9
|
+
* Examples of valid keys: `"key"`, `["foo", 1.0]`, `["foo", ["bar"]]`
|
10
|
+
* Examples of problematic keys: `nil`, `true`, `["foo", nil]`, `{ "foo" => "bar" }`
|
11
|
+
* For migration purposes it is recommended to convert your keys to and from
|
12
|
+
json manually if you have complex keys using `JSON.generate`/`JSON.parse`:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
class WordCounter
|
16
|
+
def map(url)
|
17
|
+
HTTP.get(url).to_s.split.each do |word|
|
18
|
+
yield(JSON.generate("key" => word), 1) # if you use a hash for the key
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def reduce(json_key, count1, count2)
|
23
|
+
key = JSON.parse(json_key) # if you want to access the original key
|
24
|
+
|
25
|
+
count1 + count2
|
26
|
+
end
|
27
|
+
end
|
28
|
+
```
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
map-reduce-ruby (
|
4
|
+
map-reduce-ruby (2.0.0)
|
5
5
|
json
|
6
6
|
lazy_priority_queue
|
7
7
|
|
@@ -9,41 +9,42 @@ GEM
|
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
11
|
ast (2.4.2)
|
12
|
-
diff-lcs (1.
|
13
|
-
json (2.
|
12
|
+
diff-lcs (1.5.0)
|
13
|
+
json (2.6.2)
|
14
14
|
lazy_priority_queue (0.1.1)
|
15
|
-
parallel (1.
|
16
|
-
parser (3.
|
15
|
+
parallel (1.22.1)
|
16
|
+
parser (3.1.2.1)
|
17
17
|
ast (~> 2.4.1)
|
18
|
-
rainbow (3.
|
19
|
-
regexp_parser (2.0
|
20
|
-
rexml (3.2.
|
21
|
-
rspec (3.
|
22
|
-
rspec-core (~> 3.
|
23
|
-
rspec-expectations (~> 3.
|
24
|
-
rspec-mocks (~> 3.
|
25
|
-
rspec-core (3.
|
26
|
-
rspec-support (~> 3.
|
27
|
-
rspec-expectations (3.
|
18
|
+
rainbow (3.1.1)
|
19
|
+
regexp_parser (2.5.0)
|
20
|
+
rexml (3.2.5)
|
21
|
+
rspec (3.11.0)
|
22
|
+
rspec-core (~> 3.11.0)
|
23
|
+
rspec-expectations (~> 3.11.0)
|
24
|
+
rspec-mocks (~> 3.11.0)
|
25
|
+
rspec-core (3.11.0)
|
26
|
+
rspec-support (~> 3.11.0)
|
27
|
+
rspec-expectations (3.11.1)
|
28
28
|
diff-lcs (>= 1.2.0, < 2.0)
|
29
|
-
rspec-support (~> 3.
|
30
|
-
rspec-mocks (3.
|
29
|
+
rspec-support (~> 3.11.0)
|
30
|
+
rspec-mocks (3.11.1)
|
31
31
|
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
-
rspec-support (~> 3.
|
33
|
-
rspec-support (3.
|
34
|
-
rubocop (
|
32
|
+
rspec-support (~> 3.11.0)
|
33
|
+
rspec-support (3.11.1)
|
34
|
+
rubocop (1.36.0)
|
35
|
+
json (~> 2.3)
|
35
36
|
parallel (~> 1.10)
|
36
|
-
parser (>= 2.
|
37
|
+
parser (>= 3.1.2.1)
|
37
38
|
rainbow (>= 2.2.2, < 4.0)
|
38
|
-
regexp_parser (>= 1.8)
|
39
|
-
rexml
|
40
|
-
rubocop-ast (>=
|
39
|
+
regexp_parser (>= 1.8, < 3.0)
|
40
|
+
rexml (>= 3.2.5, < 4.0)
|
41
|
+
rubocop-ast (>= 1.20.1, < 2.0)
|
41
42
|
ruby-progressbar (~> 1.7)
|
42
|
-
unicode-display_width (>= 1.4.0, <
|
43
|
-
rubocop-ast (1.
|
44
|
-
parser (>=
|
43
|
+
unicode-display_width (>= 1.4.0, < 3.0)
|
44
|
+
rubocop-ast (1.21.0)
|
45
|
+
parser (>= 3.1.1.0)
|
45
46
|
ruby-progressbar (1.11.0)
|
46
|
-
unicode-display_width (
|
47
|
+
unicode-display_width (2.3.0)
|
47
48
|
|
48
49
|
PLATFORMS
|
49
50
|
ruby
|
data/README.md
CHANGED
@@ -120,6 +120,26 @@ mappers are finished.
|
|
120
120
|
|
121
121
|
That's it.
|
122
122
|
|
123
|
+
## Limitations for Keys
|
124
|
+
|
125
|
+
You have to make sure that your keys are properly sortable in ruby. Please
|
126
|
+
note:
|
127
|
+
|
128
|
+
```ruby
|
129
|
+
"key" < nil # comparison of String with nil failed (ArgumentError)
|
130
|
+
|
131
|
+
false < true # undefined method `<' for false:FalseClass (NoMethodError)
|
132
|
+
|
133
|
+
1 > "key" # comparison of Integer with String failed (ArgumentError)
|
134
|
+
|
135
|
+
{ "key" => "value1" } < { "key" => "value2" } #=> false
|
136
|
+
{ "key" => "value1" } > { "key" => "value2" } #=> false
|
137
|
+
{ "key" => "value1" } <=> { "key" => "value2" } #=> nil
|
138
|
+
```
|
139
|
+
|
140
|
+
For those reasons, it is recommended to only use strings, numbers and arrays or
|
141
|
+
a combination of those.
|
142
|
+
|
123
143
|
## Internals
|
124
144
|
|
125
145
|
To fully understand the performance details, the following outlines the inner
|
data/lib/map_reduce/mapper.rb
CHANGED
data/lib/map_reduce/mergeable.rb
CHANGED
@@ -29,7 +29,7 @@ module MapReduce
|
|
29
29
|
|
30
30
|
key, value = JSON.parse(line)
|
31
31
|
|
32
|
-
queue.push([key, value, index],
|
32
|
+
queue.push([key, value, index], key)
|
33
33
|
end
|
34
34
|
|
35
35
|
loop do
|
@@ -45,7 +45,7 @@ module MapReduce
|
|
45
45
|
|
46
46
|
key, value = JSON.parse(line)
|
47
47
|
|
48
|
-
queue.push([key, value, index],
|
48
|
+
queue.push([key, value, index], key)
|
49
49
|
end
|
50
50
|
|
51
51
|
files.each(&:rewind)
|
@@ -1,4 +1,26 @@
|
|
1
1
|
module MapReduce
|
2
|
+
# Since LazyPriorityQueue is using <= and >=, but not <=>, it does not
|
3
|
+
# support sorting array keys. Therefore we wrap the keys in SortKey, which
|
4
|
+
# provides those operators. See https://bugs.ruby-lang.org/issues/5574
|
5
|
+
|
6
|
+
class SortKey
|
7
|
+
include Comparable
|
8
|
+
|
9
|
+
attr_reader :object
|
10
|
+
|
11
|
+
def initialize(object)
|
12
|
+
@object = object
|
13
|
+
end
|
14
|
+
|
15
|
+
def <=>(other)
|
16
|
+
res = object <=> other.object
|
17
|
+
|
18
|
+
raise(ArgumentError, "Unable to compare #{@object.inspect} with #{other.object.inspect}") if res.nil?
|
19
|
+
|
20
|
+
res
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
2
24
|
# The MapReduce::PriorityQueue implements a min priority queue using a
|
3
25
|
# binomial heap.
|
4
26
|
|
@@ -25,7 +47,7 @@ module MapReduce
|
|
25
47
|
# priority_queue.push("some object", "some key")
|
26
48
|
|
27
49
|
def push(object, key)
|
28
|
-
@queue.push([@sequence_number, object], key)
|
50
|
+
@queue.push([@sequence_number, object], SortKey.new(key))
|
29
51
|
|
30
52
|
@sequence_number += 1
|
31
53
|
end
|
data/lib/map_reduce/version.rb
CHANGED
data/map-reduce-ruby.gemspec
CHANGED
@@ -7,7 +7,7 @@ Gem::Specification.new do |spec|
|
|
7
7
|
spec.email = ["vetter@flakks.com"]
|
8
8
|
|
9
9
|
spec.summary = "The easiest way to write distributed, larger than memory map-reduce jobs"
|
10
|
-
spec.description = "The MapReduce gem is the easiest way to write custom, distributed, larger "\
|
10
|
+
spec.description = "The MapReduce gem is the easiest way to write custom, distributed, larger " \
|
11
11
|
"than memory map-reduce jobs"
|
12
12
|
spec.homepage = "https://github.com/mrkamel/map-reduce-ruby"
|
13
13
|
spec.license = "MIT"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: map-reduce-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-09-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -104,7 +104,7 @@ metadata:
|
|
104
104
|
homepage_uri: https://github.com/mrkamel/map-reduce-ruby
|
105
105
|
source_code_uri: https://github.com/mrkamel/map-reduce-ruby
|
106
106
|
changelog_uri: https://github.com/mrkamel/map-reduce/blob/master/CHANGELOG.md
|
107
|
-
post_install_message:
|
107
|
+
post_install_message:
|
108
108
|
rdoc_options: []
|
109
109
|
require_paths:
|
110
110
|
- lib
|
@@ -119,8 +119,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
119
119
|
- !ruby/object:Gem::Version
|
120
120
|
version: '0'
|
121
121
|
requirements: []
|
122
|
-
rubygems_version: 3.
|
123
|
-
signing_key:
|
122
|
+
rubygems_version: 3.3.3
|
123
|
+
signing_key:
|
124
124
|
specification_version: 4
|
125
125
|
summary: The easiest way to write distributed, larger than memory map-reduce jobs
|
126
126
|
test_files: []
|