map_reduce 0.0.1.alpha4 → 0.0.1.alpha5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +176 -2
- data/lib/map_reduce.rb +7 -1
- data/lib/map_reduce/exceptions.rb +4 -0
- data/lib/map_reduce/map_log.rb +45 -0
- data/lib/map_reduce/mapper.rb +72 -0
- data/lib/map_reduce/master.rb +87 -145
- data/lib/map_reduce/master_old.rb +182 -0
- data/lib/map_reduce/reduce_log.rb +60 -0
- data/lib/map_reduce/reducer.rb +65 -0
- data/lib/map_reduce/socket/master.rb +2 -92
- data/lib/map_reduce/socket/{worker_sync.rb → req_fiber.rb} +3 -4
- data/lib/map_reduce/version.rb +1 -1
- data/spec/map_reduce/map_reduce_spec.rb +80 -67
- metadata +28 -23
- data/lib/map_reduce/socket/worker_em.rb +0 -4
@@ -0,0 +1,182 @@
|
|
1
|
+
require File.expand_path("../socket/master", __FILE__)
|
2
|
+
|
3
|
+
module MapReduce
  # Master node of the legacy (pre-MapLog/ReduceLog) implementation.
  #
  # Mapped "key{delimiter}message" lines are buffered in memory, flushed to a
  # log file on a timer, sorted with the system `sort` utility when the reduce
  # phase starts, and then read back and grouped by key so that every "reduce"
  # request can be answered with one key and all of its messages.
  class Master
    # How often (in seconds) buffered data is flushed to disk
    FLUSH_TIMEOUT = 1
    # How many lines are parsed by one iteration of grouping
    GROUP_LINES = 100
    # How many seconds to sleep if grouping is going faster than reducing
    GROUP_TIMEOUT = 1
    # How many grouped keys may be queued before grouping pauses
    GROUP_MAX = 10_000

    # Valid options:
    #  * socket     - socket address to bind,
    #                 default is ::MapReduce::DEFAULT_SOCKET
    #  * log_folder - folder to store received MAP data,
    #                 default is '/tmp/mapreduce/'
    #  * workers    - count of workers that will emit data, default is 1.
    #                 The value is handed to the socket object as is.
    #                 In small jobs it is better to define it explicitly,
    #                 because if one worker stops before the others start,
    #                 the master may decide that the map job is done and
    #                 start reducing.
    #  * delimiter  - master log stores data like "key{delimiter}message",
    #                 so to prevent collisions you can specify your own
    #                 unique delimiter; default is a pipe "|"
    #
    def initialize(opts = {})
      # Socket address to bind
      @socket_addr = opts[:socket] || ::MapReduce::DEFAULT_SOCKET
      # Folder to write logs
      @log_folder = opts[:log_folder] || "/tmp/mapreduce/"
      # How many MapReduce workers will emit data
      @workers = opts[:workers] || 1
      # Delimiter to store key/value pairs in log
      @delimiter = opts[:delimiter] || "|"

      @log = []
      @data = []
      @workers_envelopes = {}
      @log_filename = File.join(@log_folder, "master-#{Process.pid}.log")
      @sorted_log_filename = File.join(@log_folder, "master-#{Process.pid}_sorted.log")

      FileUtils.mkdir_p(@log_folder)
      FileUtils.touch(@log_filename)
    end

    # Start the event loop: bind the socket and start the flush timer.
    #
    def run
      EM.run do
        # Init socket
        master_socket

        # Init flushing timer
        flush
      end
    end

    # Stop the event loop.
    #
    def stop
      EM.stop
    end

    # Store data in the in-memory log array until the next flush.
    #
    def map(key, message)
      @log << "#{key}#{@delimiter}#{message}"
    end

    # Send one grouped record ([key, msg, msg, ...]) back to a worker.
    # The last item in @data may be an unfinished group, so it is only sent
    # once the sorted file has been read completely (@reduce_stop).
    # When nothing is left, nil is sent. If no finished group is available
    # yet, the request is retried in one second.
    #
    def reduce(envelope)
      if @data.size >= 2 || @reduce_stop
        data = @data.shift
        data = data.flatten if data
        master_socket.send_reply(data, envelope)
      else
        EM.add_timer(1) { reduce(envelope) }
      end
    end

    # Open (and truncate) the log file for read/write.
    #
    def log_file
      @log_file ||= File.open(@log_filename, "w+")
    end

    # Open the sorted log for reading.
    #
    def sorted_log_file
      @sorted_log_file ||= File.open(@sorted_log_filename, "r")
    end

    # Flush buffered data to disk and re-arm the timer, so data is written
    # once per FLUSH_TIMEOUT seconds.
    #
    def flush
      write_log
      EM.add_timer(FLUSH_TIMEOUT) { flush }
    end

    # Sort the log with the system `sort` utility into the sorted log file
    # and delete the original log afterwards.
    #
    # Raises RuntimeError if `sort` does not finish successfully; the unsorted
    # log is kept in that case.
    #
    def sort
      # Close the handle before the file is removed so it is not leaked.
      if @log_file
        @log_file.close unless @log_file.closed?
        @log_file = nil
      end

      # The log may be missing if nothing was mapped since the last cycle.
      FileUtils.touch(@log_filename)

      # Argument-list form: no shell is involved, so paths with spaces or
      # metacharacters are safe. LC_ALL=C forces byte-wise comparison;
      # locale-aware collation may ignore punctuation such as the delimiter
      # and interleave lines that belong to different keys.
      sorted = system({ "LC_ALL" => "C" }, "sort", @log_filename, "-o", @sorted_log_filename)
      raise "sort failed for #{@log_filename}" unless sorted

      FileUtils.rm(@log_filename)
    end

    # Start the reducing part:
    # write the rest of the log to disk, sort it, then start to read/group it.
    #
    def reduce!
      # A new cycle starts: grouping is not finished until the file is read.
      @reduce_stop = false
      # Only write here; calling #flush would start an additional timer chain
      # on every reduce cycle.
      write_log
      sort

      group sorted_log_file.each_line
    end

    # Read sorted data and group it by key, GROUP_LINES lines per call.
    # If the queue (@data) grows faster than workers grab data, reading is
    # paused for GROUP_TIMEOUT seconds. When the file is exhausted it is
    # closed and removed and @reduce_stop is set.
    #
    def group(iter)
      if @data.size >= GROUP_MAX
        EM.add_timer(GROUP_TIMEOUT) { group(iter) }
      else
        GROUP_LINES.times do
          # Limit 2: everything after the first delimiter is the message,
          # even if the message itself contains the delimiter.
          key, msg = iter.next.chomp.split(@delimiter, 2)

          last = @data.last
          if last && last[0] == key
            last[1] << msg
          else
            @data << [key, [msg]]
          end
        end

        EM.next_tick { group(iter) }
      end
    rescue StopIteration
      if @sorted_log_file
        @sorted_log_file.close unless @sorted_log_file.closed?
        @sorted_log_file = nil
      end
      FileUtils.rm(@sorted_log_filename)
      @reduce_stop = true
    end

    # Initialize and bind the socket (memoized).
    #
    def master_socket
      @master_socket ||= begin
        sock = MapReduce::Socket::Master.new self, @workers
        sock.bind @socket_addr
        sock
      end
    end

    private

    # Append buffered lines to the log file and clear the buffer.
    # Does nothing when the buffer is empty.
    def write_log
      return if @log.empty?

      log_file << @log.join("\n") << "\n"
      log_file.flush
      @log.clear
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module MapReduce
  # Reads a map log back as key-grouped records.
  #
  # The log file name is obtained from `map_log.reset`; the file is sorted in
  # place with the system `sort` utility and then consumed line by line, each
  # line being "key{delimiter}value".
  class ReduceLog
    # map_log   - object responding to #reset, which returns the name of the
    #             log file to reduce (or nil when there is none)
    # delimiter - string separating key and value inside a log line
    def initialize(map_log, delimiter)
      @map_log = map_log
      @delimiter = delimiter
    end

    # Returns the next group as [key, value, value, ...].
    #
    # Returns nil when no file is being read (before #force, or after the
    # file has been consumed). When the end of the file is reached the file
    # is closed and deleted; the group collected so far is still returned
    # (this is [nil] for an empty file).
    def get_data
      return unless @lines

      current_key = nil
      current_values = []
      begin
        while true
          # Limit 2: everything after the first delimiter is the value, even
          # if the value itself contains the delimiter.
          key, value = @lines.peek.chomp.split(@delimiter, 2)
          current_key ||= key

          # The peeked line belongs to the next group; leave it unread.
          break if current_key != key

          current_values << value
          @lines.next
        end
      rescue StopIteration
        @file.close
        # File.delete is core Ruby, so this file does not depend on
        # FileUtils having been required elsewhere.
        File.delete(@file.path)
        @lines = nil
      end
      [current_key, *current_values]
    end

    # Opens the sorted log for reading unless a file is already being read.
    # Does nothing when there is no log file.
    def force
      return if @lines

      fn = log_file
      return unless fn

      @file = File.open(fn)
      @lines = @file.each_line
    end

    # Name of the sorted log file, or nil when `map_log.reset` returned none.
    # The log is sorted the first time a name is obtained.
    #
    # NOTE(review): the name is memoized and not cleared when #get_data
    # deletes the file, so a later #force reuses it without calling
    # `map_log.reset` or sorting again — confirm this is intended.
    def log_file
      @log_file ||= begin
        fn = @map_log.reset
        if fn
          @more = true
          sort(fn)
          fn
        end
      end
    end

    # Sorts the file +fn+ in place with the system `sort` utility.
    #
    # The argument-list form of Kernel#system is used, so no shell is
    # involved and file names with spaces or metacharacters are safe.
    # LC_ALL=C forces byte-wise comparison; locale-aware collation may ignore
    # punctuation such as the delimiter and interleave lines that belong to
    # different keys. The exit status is returned but not checked here.
    def sort(fn)
      system({ "LC_ALL" => "C" }, "sort", fn, "-o", fn)
    end
  end
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module MapReduce
  # Client side of the reduce phase: asks every master for grouped data
  # until each of them answers with a nil key.
  class Reducer
    # Valid options:
    #  * masters - list of master socket addresses,
    #              default is [::MapReduce::DEFAULT_SOCKET]
    #  * type    - connection type; :em (default) uses callback sockets,
    #              any other value uses the blocking (fiber) sockets
    #  * task    - task name sent along with every "reduce" request
    def initialize(opts = {})
      @masters = opts[:masters] || [::MapReduce::DEFAULT_SOCKET]
      @connection_type = opts[:type] || :em
      @task = opts[:task]
    end

    # Requests data from the masters and yields (key, values) for every
    # group received. See #em_reduce and #sync_reduce for the differences.
    def reduce(&blk)
      if @connection_type == :em
        em_reduce(&blk)
      else
        sync_reduce(&blk)
      end
    end

    # Blocking variant: picks a random master that is not exhausted yet,
    # requests the next group and yields (key, values). A master answering
    # with a nil key is dropped; the method returns once none are left.
    def sync_reduce(&blk)
      pending = sockets.dup
      while (sock = pending.sample)
        key, *values = sock.send_request(["reduce", @task])
        if key.nil?
          pending.delete sock
        else
          blk.call(key, values)
        end
      end
    end

    # Callback variant: same protocol as #sync_reduce, but the next request
    # is issued from the reply callback. When every master is exhausted the
    # block is called one last time with [nil] to signal completion.
    def em_reduce(all = nil, &blk)
      all ||= sockets.dup
      sock = all.sample

      unless sock
        blk.call([nil])
        return
      end

      sock.send_request(["reduce", @task]) do |message|
        key, *values = message
        if key.nil?
          all.delete sock
        else
          blk.call(key, values)
        end

        em_reduce(all, &blk)
      end
    end

    private

    # One connected request socket per master address (memoized).
    def sockets
      @sockets ||= begin
        # Mirror the dispatch in #reduce: only :em runs on callback sockets;
        # every other type goes through #sync_reduce and therefore needs the
        # blocking fiber socket.
        klass = if @connection_type == :em
          EM::Protocols::Zmq2::ReqCb
        else
          EM::Protocols::Zmq2::ReqFiber
        end

        @masters.map do |addr|
          sock = klass.new
          sock.connect(addr)
          sock
        end
      end
    end
  end
end
|
@@ -1,102 +1,12 @@
|
|
1
|
-
# Reply socket.
|
2
|
-
# Master accepts "map", "map_finished", and "reduce" messages.
|
3
|
-
# For "map" messages it didn't actually replies,
|
4
|
-
# but for "reduce" requests it returns key with grouped values.
|
5
|
-
#
|
6
1
|
module MapReduce::Socket
|
7
2
|
class Master < EM::Protocols::Zmq2::Rep
|
8
|
-
|
9
|
-
# we will sleep for REDUCE_WAIT seconds till state is not REDUCE
|
10
|
-
REDUCE_WAIT = 1
|
11
|
-
|
12
|
-
def initialize(master, workers)
|
3
|
+
def initialize(master)
|
13
4
|
@master = master
|
14
|
-
@workers = workers
|
15
|
-
|
16
|
-
@connections = {}
|
17
|
-
@state = :map
|
18
|
-
|
19
5
|
super()
|
20
6
|
end
|
21
7
|
|
22
8
|
def receive_request(message, envelope)
|
23
|
-
@
|
24
|
-
|
25
|
-
type, key, msg = message
|
26
|
-
case type
|
27
|
-
when "map"
|
28
|
-
map(envelope, key, msg)
|
29
|
-
when "map_finished"
|
30
|
-
map_finished(envelope)
|
31
|
-
when "reduce"
|
32
|
-
reduce(envelope)
|
33
|
-
else
|
34
|
-
MapReduce.logger.error("Wrong message type: #{type}")
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
# Send data to log
|
39
|
-
# Someone should never MAP data when master already in REDUCE state
|
40
|
-
#
|
41
|
-
def map(envelope, key, msg)
|
42
|
-
if @state == :map
|
43
|
-
@master.map(key, msg)
|
44
|
-
ok(envelope)
|
45
|
-
else
|
46
|
-
MapReduce.logger.error("Someone tries to MAP data while state is REDUCE")
|
47
|
-
not_ok(envelope, "You can't MAP while we are reducing")
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
# When worker stops mapping data, it sends "map_finished" message.
|
52
|
-
# When all workers will send "map_finished" message reduce will begin.
|
53
|
-
#
|
54
|
-
def map_finished(envelope)
|
55
|
-
ok(envelope)
|
56
|
-
|
57
|
-
@connections[envelope.first] ||= true
|
58
|
-
@workers = @connections.size if @workers == :auto
|
59
|
-
|
60
|
-
return unless @connections.all?{ |k,v| v }
|
61
|
-
return unless @connections.size == @workers
|
62
|
-
|
63
|
-
@state = :reduce
|
64
|
-
@master.reduce!
|
65
|
-
end
|
66
|
-
|
67
|
-
# Wait till all workers stops sending MAP.
|
68
|
-
# After all workers stopped we start REDUCE part of job.
|
69
|
-
#
|
70
|
-
def reduce(envelope)
|
71
|
-
if @state == :reduce
|
72
|
-
@master.reduce(envelope)
|
73
|
-
else
|
74
|
-
EM.add_timer(REDUCE_WAIT) do
|
75
|
-
reduce(envelope)
|
76
|
-
end
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
# Simple OK reply
|
81
|
-
#
|
82
|
-
def ok(envelope)
|
83
|
-
send_reply(["ok"], envelope)
|
84
|
-
end
|
85
|
-
|
86
|
-
# Simple NOT OK reply
|
87
|
-
#
|
88
|
-
def not_ok(envelope, error)
|
89
|
-
send_reply(["error", error], envelope)
|
90
|
-
end
|
91
|
-
|
92
|
-
# Switch back to :map state if reduce finished
|
93
|
-
#
|
94
|
-
def send_reply(data, envelope)
|
95
|
-
unless data
|
96
|
-
@state = :map
|
97
|
-
@connections = {}
|
98
|
-
end
|
99
|
-
super
|
9
|
+
@master.recieve_msg(message, envelope)
|
100
10
|
end
|
101
11
|
end
|
102
12
|
end
|
@@ -1,9 +1,8 @@
|
|
1
|
-
module
|
2
|
-
class
|
3
|
-
alias_method :async_send_request, :send_request
|
1
|
+
module EM::Protocols::Zmq2
|
2
|
+
class ReqFiber < EM::Protocols::Zmq2::ReqCb
|
4
3
|
def send_request(data, &blk)
|
5
4
|
fib = Fiber.current
|
6
|
-
|
5
|
+
super(data) do |message|
|
7
6
|
fib.resume(message)
|
8
7
|
end
|
9
8
|
if block_given?
|
data/lib/map_reduce/version.rb
CHANGED
@@ -3,32 +3,48 @@ require 'spec_helper'
|
|
3
3
|
describe "MapReduce stack" do
|
4
4
|
describe "single master" do
|
5
5
|
before do
|
6
|
-
@
|
7
|
-
master = MapReduce::Master.new
|
6
|
+
@pid1 = fork do
|
7
|
+
master = MapReduce::Master.new socket: "tcp://127.0.0.1:15555"
|
8
|
+
trap("SIGINT") do
|
9
|
+
master.stop
|
10
|
+
exit
|
11
|
+
end
|
12
|
+
master.run
|
13
|
+
end
|
14
|
+
@pid2 = fork do
|
15
|
+
master = MapReduce::Master.new socket: "tcp://127.0.0.1:15556"
|
16
|
+
trap("SIGINT") do
|
17
|
+
master.stop
|
18
|
+
exit
|
19
|
+
end
|
8
20
|
master.run
|
9
21
|
end
|
10
22
|
end
|
11
23
|
|
12
24
|
after do
|
13
|
-
Process.kill "
|
25
|
+
Process.kill "INT", @pid1
|
26
|
+
Process.kill "INT", @pid2
|
14
27
|
end
|
15
28
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
29
|
+
describe ":em" do
|
30
|
+
it "should map/reduce with multiple masters" do
|
31
|
+
EM.run do
|
32
|
+
@mapper = MapReduce::Mapper.new task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
|
33
|
+
@reducer = MapReduce::Reducer.new task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
|
34
|
+
i = 0
|
35
|
+
[["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"]].each do |a|
|
36
|
+
@mapper.map(*a) do |res|
|
37
|
+
res.must_equal ["ok"]
|
38
|
+
if (i+=1) == 5
|
39
|
+
data = {}
|
40
|
+
@reducer.reduce do |key, values|
|
25
41
|
if key
|
26
42
|
data[key] = values
|
27
43
|
else
|
28
|
-
data.size.must_equal
|
29
|
-
data["
|
30
|
-
data["
|
31
|
-
|
44
|
+
data.size.must_equal 3
|
45
|
+
data["Peter"].sort.must_equal ["Apple", "Lemon"].sort
|
46
|
+
data["Andrew"].sort.must_equal ["Peach", "Orange"].sort
|
47
|
+
data["Mary"].must_equal ["Plum"]
|
32
48
|
EM.stop
|
33
49
|
end
|
34
50
|
end
|
@@ -39,64 +55,61 @@ describe "MapReduce stack" do
|
|
39
55
|
end
|
40
56
|
end
|
41
57
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
data
|
58
|
+
describe ":sync" do
|
59
|
+
it "should map/reduce with multiple masters" do
|
60
|
+
EM.synchrony do
|
61
|
+
@mapper = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
|
62
|
+
@reducer = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
|
63
|
+
[["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"]].each do |a|
|
64
|
+
res = @mapper.map(*a)
|
65
|
+
res.must_equal ["ok"]
|
66
|
+
end
|
67
|
+
data = {}
|
68
|
+
@reducer.reduce do |k, values|
|
69
|
+
data[k] = values
|
70
|
+
end
|
71
|
+
data.size.must_equal 3
|
72
|
+
data["Peter"].sort.must_equal ["Apple", "Lemon"].sort
|
73
|
+
data["Andrew"].sort.must_equal ["Peach", "Orange"].sort
|
74
|
+
data["Mary"].must_equal ["Plum"]
|
75
|
+
EM.stop
|
52
76
|
end
|
53
|
-
data.size.must_equal 2
|
54
|
-
data["Petr"].must_equal [["Radiohead", "Muse", "R.E.M."] * ',', ["Radiohead", "The Beatles", "Aquarium"] * ',']
|
55
|
-
data["Alex"].must_equal [["Madonna", "Lady Gaga"] * ',']
|
56
|
-
|
57
|
-
EM.stop
|
58
77
|
end
|
59
|
-
end
|
60
|
-
end
|
61
78
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
@pid2 = fork do
|
69
|
-
master = MapReduce::Master.new socket: "ipc:///dev/shm/sock2.sock"
|
70
|
-
master.run
|
71
|
-
end
|
72
|
-
end
|
79
|
+
it "should map/reduce-map/reduce with multiple masters" do
|
80
|
+
EM.synchrony do
|
81
|
+
@mapper1 = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
|
82
|
+
@reducer1 = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
|
83
|
+
@mapper2 = MapReduce::Mapper.new type: :sync, task: "Related", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
|
84
|
+
@reducer2 = MapReduce::Reducer.new type: :sync, task: "Related", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
|
73
85
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
86
|
+
[["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"], ["Peter", "Peach"], ["Yura", "Peach"], ["Yura", "Apricot"], ["Yura", "Apple"]].each do |a|
|
87
|
+
res = @mapper1.map(*a)
|
88
|
+
res.must_equal ["ok"]
|
89
|
+
end
|
78
90
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
worker.map("Michael", ["Blur"] * ',')
|
88
|
-
worker.map("Gosha", ["DDT", "Splin"] * ',')
|
89
|
-
worker.map("Obama", ["Adele", "Rolling Stones"] * ',')
|
90
|
-
worker.map_finished
|
91
|
-
worker.reduce do |key, values|
|
92
|
-
data[key] = values if key
|
91
|
+
@reducer1.reduce do |k, values|
|
92
|
+
values.each do |fruit|
|
93
|
+
related = values.dup
|
94
|
+
related.delete fruit
|
95
|
+
related.each do |r|
|
96
|
+
@mapper2.map(fruit, r)
|
97
|
+
end
|
98
|
+
end
|
93
99
|
end
|
94
|
-
data.size.must_equal 5
|
95
|
-
data["Petr"].must_equal [["Radiohead", "Muse", "R.E.M."] * ',', ["Radiohead", "The Beatles", "Aquarium"] * ',']
|
96
|
-
data["Alex"].must_equal [["Madonna", "Lady Gaga"] * ',']
|
97
|
-
end
|
98
100
|
|
99
|
-
|
101
|
+
fruits = {}
|
102
|
+
@reducer2.reduce do |fruit, related|
|
103
|
+
fruits[fruit] ||= []
|
104
|
+
fruits[fruit].push(*related)
|
105
|
+
end
|
106
|
+
|
107
|
+
fruits["Apple"].must_equal ["Apricot", "Lemon", "Peach", "Peach"]
|
108
|
+
fruits["Orange"].must_equal ["Peach"]
|
109
|
+
fruits["Plum"].must_equal nil
|
110
|
+
|
111
|
+
EM.stop
|
112
|
+
end
|
100
113
|
end
|
101
114
|
end
|
102
115
|
end
|