map-reduce-ruby 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Gemfile.lock +1 -1
- data/README.md +4 -7
- data/lib/map_reduce/mapper.rb +4 -1
- data/lib/map_reduce/mergeable.rb +10 -0
- data/lib/map_reduce/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4f3eeb7739b733f1abdf325ddfcf5a4c42fb257edd837952781246fcda9f48fb
|
4
|
+
data.tar.gz: f40eb08a341fc522c8f7043b74ac1c47fb00af491a107f51a6a76ca67f389629
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6e91d7a1f55f0b89d333b317ecedc7978ca29a709dfad403906f6f2835b121a0a49afb29d03f55d8d7c8aa5c1b70628400404b7bf4d590046ae2e66ed48d7abc
|
7
|
+
data.tar.gz: cc4524aec895d935b70548163e5818f8725c5a05985e55f9b8d9330c34491968bb3ec54f02466ac0748eeb130f47a40d07baefa2627b04ed10dbac1c9638b1af
|
data/CHANGELOG.md
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -7,8 +7,7 @@ than memory map-reduce jobs by using your local disk and some arbitrary storage
|
|
7
7
|
layer like s3. You can specify how much memory you are willing to offer and
|
8
8
|
MapReduce will use its buffers accordingly. Finally, you can use your already
|
9
9
|
existing background job system like `sidekiq` or one of its various
|
10
|
-
alternatives.
|
11
|
-
serialized as json.
|
10
|
+
alternatives.
|
12
11
|
|
13
12
|
## Installation
|
14
13
|
|
@@ -30,9 +29,7 @@ Or install it yourself as:
|
|
30
29
|
|
31
30
|
Any map-reduce job consists of an implementation of your `map` function, your
|
32
31
|
`reduce` function and worker code. So let's start with an implementation for a
|
33
|
-
word count map-reduce task which fetches txt documents from the web.
|
34
|
-
note that your keys and values can be everything that can be serialized as
|
35
|
-
json, but nothing else.
|
32
|
+
word count map-reduce task which fetches txt documents from the web.
|
36
33
|
|
37
34
|
```ruby
|
38
35
|
class WordCounter
|
@@ -68,8 +65,8 @@ class WordCountMapper
|
|
68
65
|
end
|
69
66
|
```
|
70
67
|
|
71
|
-
Please note that `MapReduce::HashPartitioner.new(16)` states that we want
|
72
|
-
the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
|
68
|
+
Please note that `MapReduce::HashPartitioner.new(16)` states that we want to
|
69
|
+
split the dataset into 16 partitions (i.e. 0, 1, ... 15). Finally, we need some
|
73
70
|
worker code to run the reduce part:
|
74
71
|
|
75
72
|
```ruby
|
data/lib/map_reduce/mapper.rb
CHANGED
@@ -71,7 +71,10 @@ module MapReduce
|
|
71
71
|
|
72
72
|
partitions = {}
|
73
73
|
|
74
|
-
|
74
|
+
chunk = k_way_merge(@chunks)
|
75
|
+
chunk = reduce_chunk(chunk, @implementation) if @implementation.respond_to?(:reduce)
|
76
|
+
|
77
|
+
chunk.each do |pair|
|
75
78
|
partition = @partitioner.call(pair[0])
|
76
79
|
|
77
80
|
(partitions[partition] ||= Tempfile.new).puts(JSON.generate(pair))
|
data/lib/map_reduce/mergeable.rb
CHANGED
@@ -20,6 +20,16 @@ module MapReduce
|
|
20
20
|
def k_way_merge(files)
|
21
21
|
return enum_for(:k_way_merge, files) unless block_given?
|
22
22
|
|
23
|
+
if files.size == 1
|
24
|
+
files.first.each_line do |line|
|
25
|
+
yield(JSON.parse(line))
|
26
|
+
end
|
27
|
+
|
28
|
+
files.each(&:rewind)
|
29
|
+
|
30
|
+
return
|
31
|
+
end
|
32
|
+
|
23
33
|
queue = PriorityQueue.new
|
24
34
|
|
25
35
|
files.each_with_index do |file, index|
|
data/lib/map_reduce/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: map-reduce-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Benjamin Vetter
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-10-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|