map_reduce 0.0.1.alpha4 → 0.0.1.alpha5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,182 @@
1
+ require File.expand_path("../socket/master", __FILE__)
2
+
3
module MapReduce
  # Master collects mapped "key/message" pairs into an on-disk log,
  # sorts that log with the system `sort` utility and then hands the
  # grouped values back to reducers one key at a time.
  class Master
    # How often (seconds) buffered data is flushed to disk
    FLUSH_TIMEOUT = 1
    # How many lines are parsed by one iteration of grouping
    GROUP_LINES = 100
    # How many seconds to sleep when grouping runs faster than reducing
    GROUP_TIMEOUT = 1
    # How many grouped keys may be queued before grouping is paused
    GROUP_MAX = 10_000

    # Valid options:
    # * socket     - socket address to bind,
    #                default is ::MapReduce::DEFAULT_SOCKET
    # * log_folder - folder to store received MAP data,
    #                default is '/tmp/mapreduce/'
    # * workers    - count of workers that will emit data, default is 1.
    #                In small jobs it is better to define it explicitly,
    #                because if one worker stops before the others start
    #                the master may decide that the map job is done.
    # * delimiter  - master log stores data like "key{delimiter}message",
    #                so to prevent collisions you can specify your own
    #                unique delimiter; default is a pipe "|".
    #                Keys must not contain the delimiter.
    #
    def initialize(opts = {})
      @socket_addr = opts[:socket] || ::MapReduce::DEFAULT_SOCKET
      @log_folder  = opts[:log_folder] || "/tmp/mapreduce/"
      @workers     = opts[:workers] || 1
      @delimiter   = opts[:delimiter] || "|"

      # Lines waiting to be written to the log file
      @log = []
      # Queue of grouped entries: [key, [message, ...]]
      @data = []
      @workers_envelopes = {}
      @log_filename = File.join(@log_folder, "master-#{Process.pid}.log")
      @sorted_log_filename = File.join(@log_folder, "master-#{Process.pid}_sorted.log")

      FileUtils.mkdir_p(@log_folder)
      FileUtils.touch(@log_filename)
    end

    # Start the event loop: bind the socket and start the flush timer.
    #
    def run
      EM.run do
        master_socket
        flush
      end
    end

    # Stop the event loop.
    #
    def stop
      EM.stop
    end

    # Buffer one key/message pair in memory until the next flush.
    #
    def map(key, message)
      @log << "#{key}#{@delimiter}#{message}"
    end

    # Send the next grouped entry back to a worker.
    # The last item in @data may still be receiving values, so it is only
    # sent once the whole sorted log has been read (@reduce_stop).
    # When nothing is ready yet the request is retried in one second.
    # A nil reply is sent when reading is finished and the queue is empty.
    #
    def reduce(envelope)
      if @data.size >= 2 || @reduce_stop
        data = @data.shift
        data = data.flatten if data
        master_socket.send_reply(data, envelope)
      else
        EM.add_timer(1) { reduce(envelope) }
      end
    end

    # Log file opened for read/write (truncates/creates it on first use).
    #
    def log_file
      @log_file ||= File.open(@log_filename, "w+")
    end

    # Sorted log opened for reading.
    #
    def sorted_log_file
      @sorted_log_file ||= File.open(@sorted_log_filename, "r")
    end

    # Write buffered data to disk and schedule the next flush
    # in FLUSH_TIMEOUT seconds.
    #
    def flush
      write_log
      EM.add_timer(FLUSH_TIMEOUT) { flush }
    end

    # Sort the log with the system `sort` utility (fast on big files)
    # and delete the original log afterwards.
    #
    # The command is run without a shell, so paths containing spaces or
    # shell metacharacters are safe, and with LC_ALL=C so that lines are
    # ordered byte-wise: locale-aware collation may ignore punctuation and
    # interleave lines of different keys, which would break grouping.
    #
    def sort
      close_log_file
      # The log may be missing if nothing was mapped since the last sort.
      FileUtils.touch(@log_filename)
      sorted = system({ "LC_ALL" => "C" }, "sort", @log_filename, "-o", @sorted_log_filename)
      raise "sort failed for #{@log_filename}" unless sorted

      FileUtils.rm(@log_filename)
    end

    # Start the reducing part:
    # write the rest of the buffered log to disk, sort it,
    # then start to read/group the sorted data.
    #
    def reduce!
      # A previous cycle leaves this flag set; clear it so the last,
      # still growing group is not handed out too early.
      @reduce_stop = false
      # Only write here: the periodic flush timer is already running and
      # must not be scheduled again on every reduce cycle.
      write_log
      sort

      group sorted_log_file.each_line
    end

    # Read sorted data and group consecutive lines by key.
    # If the queue (@data) grows faster than workers grab data,
    # reading is paused for GROUP_TIMEOUT seconds.
    # When the file is exhausted it is closed and removed and
    # @reduce_stop is set.
    #
    def group(iter)
      if @data.size >= GROUP_MAX
        EM.add_timer(GROUP_TIMEOUT) { group(iter) }
      else
        GROUP_LINES.times do
          # Limit of 2 keeps delimiters that occur inside the message.
          key, msg = iter.next.chomp.split(@delimiter, 2)

          last = @data.last
          if last && last[0] == key
            last[1] << msg
          else
            @data << [key, [msg]]
          end
        end

        EM.next_tick { group(iter) }
      end
    rescue StopIteration
      close_sorted_log_file
      FileUtils.rm(@sorted_log_filename)
      @reduce_stop = true
    end

    # Initialize and bind the socket (memoized).
    # NOTE(review): confirm MapReduce::Socket::Master#initialize still
    # accepts the workers count as a second argument.
    #
    def master_socket
      @master_socket ||= begin
        sock = MapReduce::Socket::Master.new self, @workers
        sock.bind @socket_addr
        sock
      end
    end

    private

    # Append buffered lines to the log file and empty the buffer.
    def write_log
      return if @log.empty?

      log_file << @log.join("\n") << "\n"
      log_file.flush
      @log.clear
    end

    # Close the log file handle (if open) and forget it.
    def close_log_file
      @log_file.close if @log_file && !@log_file.closed?
      @log_file = nil
    end

    # Close the sorted log file handle (if open) and forget it.
    def close_sorted_log_file
      @sorted_log_file.close if @sorted_log_file && !@sorted_log_file.closed?
      @sorted_log_file = nil
    end
  end
end
@@ -0,0 +1,60 @@
1
module MapReduce
  # Reads a sorted map log and returns its content grouped by key,
  # one key per #get_data call.
  class ReduceLog
    # map_log   - object responding to #reset; #reset is expected to
    #             return the path of a log file to reduce, or nil
    # delimiter - string separating key and value in every log line
    def initialize(map_log, delimiter)
      @map_log = map_log
      @delimiter = delimiter
    end

    # Returns the next group as [key, value1, value2, ...].
    # Returns nil when no file is currently being read (call #force first).
    # When the end of the file is reached the file is closed and deleted.
    def get_data
      return unless @lines

      current_key = nil
      current_values = []
      begin
        # peek raises StopIteration at end of file
        while (line = @lines.peek)
          # Limit of 2 keeps delimiters that occur inside the value.
          key, value = line.chomp.split(@delimiter, 2)
          current_key ||= key
          break if current_key != key

          current_values << value
          @lines.next
        end
      rescue StopIteration
        release_file
      end
      [current_key, *current_values]
    end

    # Starts reading the next log file unless one is already being read.
    def force
      return if @lines

      fn = log_file
      return unless fn

      @file = File.open(fn)
      @lines = @file.each_line
    end

    # Path of the (already sorted) log file to read, memoized until the
    # file has been fully consumed. Returns nil if there is nothing to read.
    def log_file
      @log_file ||= begin
        fn = @map_log.reset
        if fn
          @more = true # not read anywhere in this class
          sort(fn)
          fn
        end
      end
    end

    # Sorts the file in place with the system `sort` utility.
    # Runs without a shell (safe for any path) and with LC_ALL=C so that
    # lines with the same key are always adjacent; locale-aware collation
    # may ignore the delimiter and interleave different keys.
    def sort(fn)
      system({ "LC_ALL" => "C" }, "sort", fn, "-o", fn)
    end

    private

    # Closes and deletes the consumed file and forgets its memoized path,
    # so that the next #force asks the map log for a fresh file instead of
    # trying to reopen the deleted one.
    def release_file
      @file.close
      FileUtils.rm(File.path(@file))
      @file = nil
      @lines = nil
      @log_file = nil
    end
  end
end
@@ -0,0 +1,65 @@
1
module MapReduce
  # Pulls grouped data for a task from one or more masters and yields it
  # to the caller's block.
  class Reducer
    # Valid options:
    # * masters - list of master socket addresses,
    #             default is [::MapReduce::DEFAULT_SOCKET]
    # * type    - :sync for blocking (fiber based) requests;
    #             any other value (default :em) uses callbacks
    # * task    - task name sent along with every "reduce" request
    def initialize(opts = {})
      @masters = opts[:masters] || [::MapReduce::DEFAULT_SOCKET]
      @connection_type = opts[:type] || :em
      @task = opts[:task]
    end

    # Reduce using the strategy matching the connection type.
    # The strategy is chosen with the same predicate that picks the socket
    # class, so the request loop and the sockets can never disagree.
    def reduce(&blk)
      if sync?
        sync_reduce(&blk)
      else
        em_reduce(&blk)
      end
    end

    # Blocking reduce: repeatedly asks a randomly chosen master for the
    # next group and yields (key, values). A master replying with a nil key
    # is considered drained and is dropped; returns when all are drained.
    def sync_reduce(&blk)
      pending = sockets.dup
      while (sock = pending.sample)
        key, *values = sock.send_request(["reduce", @task])
        if key.nil?
          pending.delete(sock)
        else
          blk.call(key, values)
        end
      end
    end

    # Callback based reduce: same protocol as #sync_reduce, but every
    # request continues from the reply callback. When all masters are
    # drained the block is called one last time with [nil].
    def em_reduce(all = nil, &blk)
      all ||= sockets.dup
      sock = all.sample
      if sock
        sock.send_request(["reduce", @task]) do |message|
          key, *values = message
          if key.nil?
            all.delete(sock)
          else
            blk.call(key, values)
          end

          em_reduce(all, &blk)
        end
      else
        blk.call([nil])
      end
    end

    private

    # True when blocking (fiber based) requests were requested.
    def sync?
      @connection_type == :sync
    end

    # One connected request socket per master (memoized).
    def sockets
      @sockets ||= begin
        klass = sync? ? EM::Protocols::Zmq2::ReqFiber : EM::Protocols::Zmq2::ReqCb

        @masters.map do |addr|
          s = klass.new
          s.connect(addr)
          s
        end
      end
    end
  end
end
@@ -1,102 +1,12 @@
1
- # Reply socket.
2
- # Master accepts "map", "map_finished", and "reduce" messages.
3
- # For "map" messages it didn't actually replies,
4
- # but for "reduce" requests it returns key with grouped values.
5
- #
6
1
  module MapReduce::Socket
7
2
  class Master < EM::Protocols::Zmq2::Rep
8
- # If worker is ready to reduce data, but we are still in MAP state
9
- # we will sleep for REDUCE_WAIT seconds till state is not REDUCE
10
- REDUCE_WAIT = 1
11
-
12
- def initialize(master, workers)
3
+ def initialize(master)
13
4
  @master = master
14
- @workers = workers
15
-
16
- @connections = {}
17
- @state = :map
18
-
19
5
  super()
20
6
  end
21
7
 
22
8
  def receive_request(message, envelope)
23
- @connections[envelope.first] = false
24
-
25
- type, key, msg = message
26
- case type
27
- when "map"
28
- map(envelope, key, msg)
29
- when "map_finished"
30
- map_finished(envelope)
31
- when "reduce"
32
- reduce(envelope)
33
- else
34
- MapReduce.logger.error("Wrong message type: #{type}")
35
- end
36
- end
37
-
38
- # Send data to log
39
- # Someone should never MAP data when master already in REDUCE state
40
- #
41
- def map(envelope, key, msg)
42
- if @state == :map
43
- @master.map(key, msg)
44
- ok(envelope)
45
- else
46
- MapReduce.logger.error("Someone tries to MAP data while state is REDUCE")
47
- not_ok(envelope, "You can't MAP while we are reducing")
48
- end
49
- end
50
-
51
- # When worker stops mapping data, it sends "map_finished" message.
52
- # When all workers will send "map_finished" message reduce will begin.
53
- #
54
- def map_finished(envelope)
55
- ok(envelope)
56
-
57
- @connections[envelope.first] ||= true
58
- @workers = @connections.size if @workers == :auto
59
-
60
- return unless @connections.all?{ |k,v| v }
61
- return unless @connections.size == @workers
62
-
63
- @state = :reduce
64
- @master.reduce!
65
- end
66
-
67
- # Wait till all workers stops sending MAP.
68
- # After all workers stopped we start REDUCE part of job.
69
- #
70
- def reduce(envelope)
71
- if @state == :reduce
72
- @master.reduce(envelope)
73
- else
74
- EM.add_timer(REDUCE_WAIT) do
75
- reduce(envelope)
76
- end
77
- end
78
- end
79
-
80
- # Simple OK reply
81
- #
82
- def ok(envelope)
83
- send_reply(["ok"], envelope)
84
- end
85
-
86
- # Simple NOT OK reply
87
- #
88
- def not_ok(envelope, error)
89
- send_reply(["error", error], envelope)
90
- end
91
-
92
- # Switch back to :map state if reduce finished
93
- #
94
- def send_reply(data, envelope)
95
- unless data
96
- @state = :map
97
- @connections = {}
98
- end
99
- super
9
+ @master.recieve_msg(message, envelope)
100
10
  end
101
11
  end
102
12
  end
@@ -1,9 +1,8 @@
1
- module MapReduce::Socket
2
- class WorkerSync < EM::Protocols::Zmq2::ReqCb
3
- alias_method :async_send_request, :send_request
1
+ module EM::Protocols::Zmq2
2
+ class ReqFiber < EM::Protocols::Zmq2::ReqCb
4
3
  def send_request(data, &blk)
5
4
  fib = Fiber.current
6
- async_send_request(data) do |message|
5
+ super(data) do |message|
7
6
  fib.resume(message)
8
7
  end
9
8
  if block_given?
@@ -1,3 +1,3 @@
1
1
  module MapReduce
2
- VERSION = "0.0.1.alpha4"
2
+ VERSION = "0.0.1.alpha5"
3
3
  end
@@ -3,32 +3,48 @@ require 'spec_helper'
3
3
  describe "MapReduce stack" do
4
4
  describe "single master" do
5
5
  before do
6
- @pid = fork do
7
- master = MapReduce::Master.new
6
+ @pid1 = fork do
7
+ master = MapReduce::Master.new socket: "tcp://127.0.0.1:15555"
8
+ trap("SIGINT") do
9
+ master.stop
10
+ exit
11
+ end
12
+ master.run
13
+ end
14
+ @pid2 = fork do
15
+ master = MapReduce::Master.new socket: "tcp://127.0.0.1:15556"
16
+ trap("SIGINT") do
17
+ master.stop
18
+ exit
19
+ end
8
20
  master.run
9
21
  end
10
22
  end
11
23
 
12
24
  after do
13
- Process.kill "TERM", @pid
25
+ Process.kill "INT", @pid1
26
+ Process.kill "INT", @pid2
14
27
  end
15
28
 
16
- it "should map and reduce some data in CB mode" do
17
- EM.run do
18
- data = {}
19
- worker = MapReduce::Worker.new
20
- worker.map("Petr", ["Radiohead", "Muse", "R.E.M."] * ',') do
21
- worker.map("Alex", ["Madonna", "Lady Gaga"] * ',') do
22
- worker.map("Petr", ["Radiohead", "The Beatles", "Aquarium"] * ',') do
23
- worker.map_finished do
24
- worker.reduce do |key, values|
29
+ describe ":em" do
30
+ it "should map/reduce with multiple masters" do
31
+ EM.run do
32
+ @mapper = MapReduce::Mapper.new task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
33
+ @reducer = MapReduce::Reducer.new task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
34
+ i = 0
35
+ [["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"]].each do |a|
36
+ @mapper.map(*a) do |res|
37
+ res.must_equal ["ok"]
38
+ if (i+=1) == 5
39
+ data = {}
40
+ @reducer.reduce do |key, values|
25
41
  if key
26
42
  data[key] = values
27
43
  else
28
- data.size.must_equal 2
29
- data["Petr"].must_equal [["Radiohead", "Muse", "R.E.M."] * ',', ["Radiohead", "The Beatles", "Aquarium"] * ',']
30
- data["Alex"].must_equal [["Madonna", "Lady Gaga"] * ',']
31
-
44
+ data.size.must_equal 3
45
+ data["Peter"].sort.must_equal ["Apple", "Lemon"].sort
46
+ data["Andrew"].sort.must_equal ["Peach", "Orange"].sort
47
+ data["Mary"].must_equal ["Plum"]
32
48
  EM.stop
33
49
  end
34
50
  end
@@ -39,64 +55,61 @@ describe "MapReduce stack" do
39
55
  end
40
56
  end
41
57
 
42
- it "should map and reduce some data in SYNC mode" do
43
- EM.synchrony do
44
- data = {}
45
- worker = MapReduce::Worker.new type: :sync
46
- worker.map("Petr", ["Radiohead", "Muse", "R.E.M."] * ',')
47
- worker.map("Alex", ["Madonna", "Lady Gaga"] * ',')
48
- worker.map("Petr", ["Radiohead", "The Beatles", "Aquarium"] * ',')
49
- worker.map_finished
50
- worker.reduce do |key, values|
51
- data[key] = values if key
58
+ describe ":sync" do
59
+ it "should map/reduce with multiple masters" do
60
+ EM.synchrony do
61
+ @mapper = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
62
+ @reducer = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
63
+ [["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"]].each do |a|
64
+ res = @mapper.map(*a)
65
+ res.must_equal ["ok"]
66
+ end
67
+ data = {}
68
+ @reducer.reduce do |k, values|
69
+ data[k] = values
70
+ end
71
+ data.size.must_equal 3
72
+ data["Peter"].sort.must_equal ["Apple", "Lemon"].sort
73
+ data["Andrew"].sort.must_equal ["Peach", "Orange"].sort
74
+ data["Mary"].must_equal ["Plum"]
75
+ EM.stop
52
76
  end
53
- data.size.must_equal 2
54
- data["Petr"].must_equal [["Radiohead", "Muse", "R.E.M."] * ',', ["Radiohead", "The Beatles", "Aquarium"] * ',']
55
- data["Alex"].must_equal [["Madonna", "Lady Gaga"] * ',']
56
-
57
- EM.stop
58
77
  end
59
- end
60
- end
61
78
 
62
- describe "multiple master" do
63
- before do
64
- @pid1 = fork do
65
- master = MapReduce::Master.new socket: "ipc:///dev/shm/sock1.sock"
66
- master.run
67
- end
68
- @pid2 = fork do
69
- master = MapReduce::Master.new socket: "ipc:///dev/shm/sock2.sock"
70
- master.run
71
- end
72
- end
79
+ it "should map/reduce-map/reduce with multiple masters" do
80
+ EM.synchrony do
81
+ @mapper1 = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
82
+ @reducer1 = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
83
+ @mapper2 = MapReduce::Mapper.new type: :sync, task: "Related", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
84
+ @reducer2 = MapReduce::Reducer.new type: :sync, task: "Related", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
73
85
 
74
- after do
75
- Process.kill "TERM", @pid1
76
- Process.kill "TERM", @pid2
77
- end
86
+ [["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"], ["Peter", "Peach"], ["Yura", "Peach"], ["Yura", "Apricot"], ["Yura", "Apple"]].each do |a|
87
+ res = @mapper1.map(*a)
88
+ res.must_equal ["ok"]
89
+ end
78
90
 
79
- it "should map and reduce some data in SYNC mode twice" do
80
- EM.synchrony do
81
- worker = MapReduce::Worker.new type: :sync, masters: ["ipc:///dev/shm/sock1.sock", "ipc:///dev/shm/sock2.sock"]
82
- 2.times do
83
- data = {}
84
- worker.map("Petr", ["Radiohead", "Muse", "R.E.M."] * ',')
85
- worker.map("Alex", ["Madonna", "Lady Gaga"] * ',')
86
- worker.map("Petr", ["Radiohead", "The Beatles", "Aquarium"] * ',')
87
- worker.map("Michael", ["Blur"] * ',')
88
- worker.map("Gosha", ["DDT", "Splin"] * ',')
89
- worker.map("Obama", ["Adele", "Rolling Stones"] * ',')
90
- worker.map_finished
91
- worker.reduce do |key, values|
92
- data[key] = values if key
91
+ @reducer1.reduce do |k, values|
92
+ values.each do |fruit|
93
+ related = values.dup
94
+ related.delete fruit
95
+ related.each do |r|
96
+ @mapper2.map(fruit, r)
97
+ end
98
+ end
93
99
  end
94
- data.size.must_equal 5
95
- data["Petr"].must_equal [["Radiohead", "Muse", "R.E.M."] * ',', ["Radiohead", "The Beatles", "Aquarium"] * ',']
96
- data["Alex"].must_equal [["Madonna", "Lady Gaga"] * ',']
97
- end
98
100
 
99
- EM.stop
101
+ fruits = {}
102
+ @reducer2.reduce do |fruit, related|
103
+ fruits[fruit] ||= []
104
+ fruits[fruit].push(*related)
105
+ end
106
+
107
+ fruits["Apple"].must_equal ["Apricot", "Lemon", "Peach", "Peach"]
108
+ fruits["Orange"].must_equal ["Peach"]
109
+ fruits["Plum"].must_equal nil
110
+
111
+ EM.stop
112
+ end
100
113
  end
101
114
  end
102
115
  end