map_reduce 0.0.1.alpha4 → 0.0.1.alpha5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,182 @@
1
+ require File.expand_path("../socket/master", __FILE__)
2
+
3
module MapReduce
  # Collects mapped key/value pairs, keeps them in a log on disk, sorts that
  # log with the system `sort` utility and then serves the grouped values
  # to reducers, one key per request.
  class Master
    # How often (in seconds) buffered data is flushed to disk
    FLUSH_TIMEOUT = 1
    # How many lines are parsed by one iteration of grouping
    GROUP_LINES = 100
    # How many seconds to sleep when grouping is going faster than reducing
    GROUP_TIMEOUT = 1
    # How many keys may be queued before grouping pauses
    GROUP_MAX = 10_000

    # Valid options:
    # * socket     - socket address to bind,
    #                default is ::MapReduce::DEFAULT_SOCKET
    # * log_folder - folder to store received MAP data,
    #                default is '/tmp/mapreduce/'
    # * workers    - count of workers that will emit data, default is 1.
    #                The value is only handed to the socket object here.
    # * delimiter  - master log stores data like "key{delimiter}values",
    #                so to prevent collisions you can specify your own
    #                unique delimiter; default is a pipe "|".
    #                Only the first delimiter on a line separates key and
    #                value, so values may contain it but keys must not.
    #
    # Creates the log folder and an empty log file as a side effect.
    #
    def initialize(opts = {})
      # Socket addr to bind
      @socket_addr = opts[:socket] || ::MapReduce::DEFAULT_SOCKET
      # Folder to write logs
      @log_folder = opts[:log_folder] || "/tmp/mapreduce/"
      # How many MapReduce workers will emit data
      @workers = opts[:workers] || 1
      # Delimiter to store key/value pairs in log
      @delimiter = opts[:delimiter] || "|"

      # Lines waiting to be flushed to disk
      @log = []
      # Queue of grouped data: [[key, [value, ...]], ...]
      @data = []
      @workers_envelopes = {}
      @log_filename = File.join(@log_folder, "master-#{Process.pid}.log")
      @sorted_log_filename = File.join(@log_folder, "master-#{Process.pid}_sorted.log")

      FileUtils.mkdir_p(@log_folder)
      FileUtils.touch(@log_filename)
    end

    # Start Eventloop: binds the socket and starts the periodic flush
    #
    def run
      EM.run do
        # Init socket
        master_socket

        # Init flushing timer
        flush
      end
    end

    # Stop Eventloop
    #
    def stop
      EM.stop
    end

    # Store data in log array till flush
    #
    def map(key, message)
      @log << "#{key}#{@delimiter}#{message}"
    end

    # Send data back to worker as a flat [key, value, value, ...] array.
    # Last item in @data is the last, possibly unfinished, group,
    # so until the sorted file is fully read we don't send it.
    # Once reading is finished and the queue is empty, nil is sent.
    # If nothing can be sent yet, the request is retried in one second.
    #
    # NOTE(review): @reduce_stop is never reset in this class - confirm
    # whether a master is expected to serve more than one reduce cycle.
    #
    def reduce(envelope)
      if @data.size >= 2 || @reduce_stop
        data = @data.shift
        master_socket.send_reply(data && data.flatten, envelope)
      else
        EM.add_timer(1) { reduce(envelope) }
      end
    end

    # Opening log file for read/write (truncates existing content)
    #
    def log_file
      @log_file ||= File.open(@log_filename, "w+")
    end

    # Opening sorted log for reading
    #
    def sorted_log_file
      @sorted_log_file ||= File.open(@sorted_log_filename, "r")
    end

    # Flushing data to disk once per FLUSH_TIMEOUT seconds.
    # Every call re-arms the timer, so it must be started only once.
    #
    def flush
      write_log

      EM.add_timer(FLUSH_TIMEOUT) { flush }
    end

    # Sorting log.
    # System sort is the fastest way to sort a big file.
    # Deleting original log after sort.
    #
    # The command is run without a shell, so paths with spaces or shell
    # metacharacters are safe, and with LC_ALL=C, so lines are compared
    # byte by byte. Locale-aware collation may ignore punctuation such as
    # the delimiter and interleave lines of different keys, which would
    # split one key into several groups.
    #
    # Raises RuntimeError if the sort command fails.
    #
    def sort
      sorted = system({ "LC_ALL" => "C" }, "sort", "-o", @sorted_log_filename, @log_filename)
      raise "sort failed for #{@log_filename}" unless sorted

      # Release the write handle before forgetting it
      @log_file.close if @log_file && !@log_file.closed?
      FileUtils.rm(@log_filename)
      @log_file = nil
    end

    # Start reducing part.
    # First, writing rest of log to disk (without arming another flush timer).
    # Then sort data.
    # Then start to read/group data
    #
    def reduce!
      write_log
      sort

      group sorted_log_file.each_line
    end

    # Reading sorted data and grouping by key.
    # If queue (@data) is growing faster than workers grab data we pause
    # reading the file. When the file is exhausted, it is closed and removed
    # and @reduce_stop is set.
    #
    # iter - external enumerator over the lines of the sorted log
    #
    def group(iter)
      if @data.size >= GROUP_MAX
        EM.add_timer(GROUP_TIMEOUT) { group(iter) }
        return
      end

      GROUP_LINES.times do
        # Limit of 2 keeps delimiters that belong to the value
        key, msg = iter.next.chomp.split(@delimiter, 2)

        last = @data.last
        if last && last[0] == key
          last[1] << msg
        else
          @data << [key, [msg]]
        end
      end

      EM.next_tick { group(iter) }
    rescue StopIteration
      @sorted_log_file.close if @sorted_log_file && !@sorted_log_file.closed?
      FileUtils.rm(@sorted_log_filename)
      @sorted_log_file = nil
      @reduce_stop = true
    end

    # Initializing and binding socket
    #
    # NOTE(review): confirm that MapReduce::Socket::Master#initialize
    # accepts the workers count as a second argument.
    #
    def master_socket
      @master_socket ||= begin
        sock = MapReduce::Socket::Master.new self, @workers
        sock.bind @socket_addr
        sock
      end
    end

    private

    # Append buffered lines to the log file and empty the buffer
    #
    def write_log
      return unless @log.any?

      log_file << @log.join("\n") << "\n"
      log_file.flush
      @log.clear
    end
  end
end
@@ -0,0 +1,60 @@
1
module MapReduce
  # Reads a map log back, sorted, one key group at a time.
  #
  # The map log object only needs to respond to `reset`, which must return
  # the name of a file with "key{delimiter}value" lines, or nil.
  class ReduceLog
    # map_log   - object responding to `reset`
    # delimiter - string separating key and value on each line. Only the
    #             first occurrence on a line is a separator, so values may
    #             contain the delimiter.
    def initialize(map_log, delimiter)
      @map_log = map_log
      @delimiter = delimiter
    end

    # Returns the next group as [key, value, value, ...].
    #
    # Returns nil when no file is open (before `force`, or after the file
    # has been read to the end). Reaching the end of the file closes and
    # deletes it; the group collected so far is still returned ([nil] when
    # the file had no lines).
    def get_data
      return unless @lines

      current_key = nil
      current_values = []
      loop do
        begin
          line = @lines.peek.chomp
        rescue StopIteration
          finish_log
          break
        end

        # Limit of 2 keeps delimiters that belong to the value
        key, value = line.split(@delimiter, 2)
        current_key ||= key
        # A different key starts the next group; leave the line unread
        break if current_key != key

        current_values << value
        @lines.next
      end
      [current_key, *current_values]
    end

    # Opens the sorted log for reading unless one is already open.
    # Does nothing when the map log has no file to offer.
    def force
      return if @lines

      fn = log_file
      return unless fn

      @file = File.open(fn)
      @lines = @file.each_line
    end

    # Name of the sorted log file. The file name is taken from the map log
    # and sorted on first use, then memoized; nil results are not memoized.
    def log_file
      @log_file ||= begin
        fn = @map_log.reset
        if fn
          # Not read anywhere in this class
          @more = true
          sort(fn)
          fn
        end
      end
    end

    # Sorts the file in place with the system `sort` utility.
    #
    # The command is run without a shell, so file names with spaces or shell
    # metacharacters are safe, and with LC_ALL=C, so lines are compared
    # byte by byte. Locale-aware collation may ignore punctuation such as
    # the delimiter and interleave lines of different keys.
    #
    # Raises RuntimeError if the sort command fails.
    def sort(fn)
      sorted = system({ "LC_ALL" => "C" }, "sort", "-o", fn, fn)
      raise "sort failed for #{fn}" unless sorted
    end

    private

    # Close and delete the exhausted log file
    def finish_log
      path = @file.path
      @file.close
      File.delete(path)
      @lines = nil
    end
  end
end
@@ -0,0 +1,65 @@
1
module MapReduce
  # Pulls grouped data for a task from one or more masters and yields it
  # to the caller's block as (key, values).
  class Reducer
    # Supported values for the :type option
    CONNECTION_TYPES = [:em, :sync].freeze

    # Valid options:
    # * masters - list of master socket addresses,
    #             default is [::MapReduce::DEFAULT_SOCKET]
    # * type    - :em (callback based, default) or :sync (blocking calls)
    # * task    - task name sent with every reduce request
    #
    # Raises ArgumentError for any other type: the reduce loop and the
    # socket class must match, and an unknown type used to pair the
    # blocking loop with callback sockets.
    def initialize(opts = {})
      @masters = opts[:masters] || [::MapReduce::DEFAULT_SOCKET]
      @connection_type = opts[:type] || :em
      @task = opts[:task]

      unless CONNECTION_TYPES.include?(@connection_type)
        raise ArgumentError, "unknown connection type: #{@connection_type.inspect}"
      end
    end

    # Requests data from the masters until all of them are drained and
    # calls the block with (key, values) for every reply.
    #
    # Raises ArgumentError without a block, before any request is sent,
    # so no reply is fetched and then lost.
    def reduce(&blk)
      raise ArgumentError, "block required" unless blk

      if @connection_type == :sync
        sync_reduce(&blk)
      else
        em_reduce(&blk)
      end
    end

    # Blocking variant: returns once every master has answered with an
    # empty (nil key) reply. Masters are picked at random.
    def sync_reduce(&blk)
      raise ArgumentError, "block required" unless blk

      pending = sockets.dup
      while (sock = pending.sample)
        key, *values = sock.send_request(["reduce", @task])
        if key.nil?
          # This master has nothing left
          pending.delete sock
        else
          blk.call(key, values)
        end
      end
    end

    # Callback variant: returns immediately and continues from the reply
    # callbacks. When every master is drained the block is called one last
    # time with [nil], so a |key, values| block sees a nil key.
    #
    # all - sockets still to be asked; callers leave it nil
    def em_reduce(all = nil, &blk)
      raise ArgumentError, "block required" unless blk

      all ||= sockets.dup
      sock = all.sample
      unless sock
        blk.call([nil])
        return
      end

      sock.send_request(["reduce", @task]) do |message|
        key, *values = message
        if key.nil?
          # This master has nothing left
          all.delete sock
        else
          blk.call(key, values)
        end

        em_reduce(all, &blk)
      end
    end

    private

    # One connected request socket per master, created on first use
    def sockets
      @sockets ||= begin
        klass = if @connection_type == :sync
          EM::Protocols::Zmq2::ReqFiber
        else
          EM::Protocols::Zmq2::ReqCb
        end

        @masters.map do |addr|
          klass.new.tap { |s| s.connect(addr) }
        end
      end
    end
  end
end
@@ -1,102 +1,12 @@
1
- # Reply socket.
2
- # Master accepts "map", "map_finished", and "reduce" messages.
3
- # For "map" messages it didn't actually replies,
4
- # but for "reduce" requests it returns key with grouped values.
5
- #
6
1
  module MapReduce::Socket
7
2
  class Master < EM::Protocols::Zmq2::Rep
8
- # If worker is ready to reduce data, but we are still in MAP state
9
- # we will sleep for REDUCE_WAIT seconds till state is not REDUCE
10
- REDUCE_WAIT = 1
11
-
12
- def initialize(master, workers)
3
+ def initialize(master)
13
4
  @master = master
14
- @workers = workers
15
-
16
- @connections = {}
17
- @state = :map
18
-
19
5
  super()
20
6
  end
21
7
 
22
8
  def receive_request(message, envelope)
23
- @connections[envelope.first] = false
24
-
25
- type, key, msg = message
26
- case type
27
- when "map"
28
- map(envelope, key, msg)
29
- when "map_finished"
30
- map_finished(envelope)
31
- when "reduce"
32
- reduce(envelope)
33
- else
34
- MapReduce.logger.error("Wrong message type: #{type}")
35
- end
36
- end
37
-
38
- # Send data to log
39
- # Someone should never MAP data when master already in REDUCE state
40
- #
41
- def map(envelope, key, msg)
42
- if @state == :map
43
- @master.map(key, msg)
44
- ok(envelope)
45
- else
46
- MapReduce.logger.error("Someone tries to MAP data while state is REDUCE")
47
- not_ok(envelope, "You can't MAP while we are reducing")
48
- end
49
- end
50
-
51
- # When worker stops mapping data, it sends "map_finished" message.
52
- # When all workers will send "map_finished" message reduce will begin.
53
- #
54
- def map_finished(envelope)
55
- ok(envelope)
56
-
57
- @connections[envelope.first] ||= true
58
- @workers = @connections.size if @workers == :auto
59
-
60
- return unless @connections.all?{ |k,v| v }
61
- return unless @connections.size == @workers
62
-
63
- @state = :reduce
64
- @master.reduce!
65
- end
66
-
67
- # Wait till all workers stops sending MAP.
68
- # After all workers stopped we start REDUCE part of job.
69
- #
70
- def reduce(envelope)
71
- if @state == :reduce
72
- @master.reduce(envelope)
73
- else
74
- EM.add_timer(REDUCE_WAIT) do
75
- reduce(envelope)
76
- end
77
- end
78
- end
79
-
80
- # Simple OK reply
81
- #
82
- def ok(envelope)
83
- send_reply(["ok"], envelope)
84
- end
85
-
86
- # Simple NOT OK reply
87
- #
88
- def not_ok(envelope, error)
89
- send_reply(["error", error], envelope)
90
- end
91
-
92
- # Switch back to :map state if reduce finished
93
- #
94
- def send_reply(data, envelope)
95
- unless data
96
- @state = :map
97
- @connections = {}
98
- end
99
- super
9
+ @master.recieve_msg(message, envelope)
100
10
  end
101
11
  end
102
12
  end
@@ -1,9 +1,8 @@
1
- module MapReduce::Socket
2
- class WorkerSync < EM::Protocols::Zmq2::ReqCb
3
- alias_method :async_send_request, :send_request
1
+ module EM::Protocols::Zmq2
2
+ class ReqFiber < EM::Protocols::Zmq2::ReqCb
4
3
  def send_request(data, &blk)
5
4
  fib = Fiber.current
6
- async_send_request(data) do |message|
5
+ super(data) do |message|
7
6
  fib.resume(message)
8
7
  end
9
8
  if block_given?
@@ -1,3 +1,3 @@
1
1
  module MapReduce
2
- VERSION = "0.0.1.alpha4"
2
+ VERSION = "0.0.1.alpha5"
3
3
  end
@@ -3,32 +3,48 @@ require 'spec_helper'
3
3
  describe "MapReduce stack" do
4
4
  describe "single master" do
5
5
  before do
6
- @pid = fork do
7
- master = MapReduce::Master.new
6
+ @pid1 = fork do
7
+ master = MapReduce::Master.new socket: "tcp://127.0.0.1:15555"
8
+ trap("SIGINT") do
9
+ master.stop
10
+ exit
11
+ end
12
+ master.run
13
+ end
14
+ @pid2 = fork do
15
+ master = MapReduce::Master.new socket: "tcp://127.0.0.1:15556"
16
+ trap("SIGINT") do
17
+ master.stop
18
+ exit
19
+ end
8
20
  master.run
9
21
  end
10
22
  end
11
23
 
12
24
  after do
13
- Process.kill "TERM", @pid
25
+ Process.kill "INT", @pid1
26
+ Process.kill "INT", @pid2
14
27
  end
15
28
 
16
- it "should map and reduce some data in CB mode" do
17
- EM.run do
18
- data = {}
19
- worker = MapReduce::Worker.new
20
- worker.map("Petr", ["Radiohead", "Muse", "R.E.M."] * ',') do
21
- worker.map("Alex", ["Madonna", "Lady Gaga"] * ',') do
22
- worker.map("Petr", ["Radiohead", "The Beatles", "Aquarium"] * ',') do
23
- worker.map_finished do
24
- worker.reduce do |key, values|
29
+ describe ":em" do
30
+ it "should map/reduce with multiple masters" do
31
+ EM.run do
32
+ @mapper = MapReduce::Mapper.new task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
33
+ @reducer = MapReduce::Reducer.new task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
34
+ i = 0
35
+ [["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"]].each do |a|
36
+ @mapper.map(*a) do |res|
37
+ res.must_equal ["ok"]
38
+ if (i+=1) == 5
39
+ data = {}
40
+ @reducer.reduce do |key, values|
25
41
  if key
26
42
  data[key] = values
27
43
  else
28
- data.size.must_equal 2
29
- data["Petr"].must_equal [["Radiohead", "Muse", "R.E.M."] * ',', ["Radiohead", "The Beatles", "Aquarium"] * ',']
30
- data["Alex"].must_equal [["Madonna", "Lady Gaga"] * ',']
31
-
44
+ data.size.must_equal 3
45
+ data["Peter"].sort.must_equal ["Apple", "Lemon"].sort
46
+ data["Andrew"].sort.must_equal ["Peach", "Orange"].sort
47
+ data["Mary"].must_equal ["Plum"]
32
48
  EM.stop
33
49
  end
34
50
  end
@@ -39,64 +55,61 @@ describe "MapReduce stack" do
39
55
  end
40
56
  end
41
57
 
42
- it "should map and reduce some data in SYNC mode" do
43
- EM.synchrony do
44
- data = {}
45
- worker = MapReduce::Worker.new type: :sync
46
- worker.map("Petr", ["Radiohead", "Muse", "R.E.M."] * ',')
47
- worker.map("Alex", ["Madonna", "Lady Gaga"] * ',')
48
- worker.map("Petr", ["Radiohead", "The Beatles", "Aquarium"] * ',')
49
- worker.map_finished
50
- worker.reduce do |key, values|
51
- data[key] = values if key
58
+ describe ":sync" do
59
+ it "should map/reduce with multiple masters" do
60
+ EM.synchrony do
61
+ @mapper = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
62
+ @reducer = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
63
+ [["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"]].each do |a|
64
+ res = @mapper.map(*a)
65
+ res.must_equal ["ok"]
66
+ end
67
+ data = {}
68
+ @reducer.reduce do |k, values|
69
+ data[k] = values
70
+ end
71
+ data.size.must_equal 3
72
+ data["Peter"].sort.must_equal ["Apple", "Lemon"].sort
73
+ data["Andrew"].sort.must_equal ["Peach", "Orange"].sort
74
+ data["Mary"].must_equal ["Plum"]
75
+ EM.stop
52
76
  end
53
- data.size.must_equal 2
54
- data["Petr"].must_equal [["Radiohead", "Muse", "R.E.M."] * ',', ["Radiohead", "The Beatles", "Aquarium"] * ',']
55
- data["Alex"].must_equal [["Madonna", "Lady Gaga"] * ',']
56
-
57
- EM.stop
58
77
  end
59
- end
60
- end
61
78
 
62
- describe "multiple master" do
63
- before do
64
- @pid1 = fork do
65
- master = MapReduce::Master.new socket: "ipc:///dev/shm/sock1.sock"
66
- master.run
67
- end
68
- @pid2 = fork do
69
- master = MapReduce::Master.new socket: "ipc:///dev/shm/sock2.sock"
70
- master.run
71
- end
72
- end
79
+ it "should map/reduce-map/reduce with multiple masters" do
80
+ EM.synchrony do
81
+ @mapper1 = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
82
+ @reducer1 = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
83
+ @mapper2 = MapReduce::Mapper.new type: :sync, task: "Related", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
84
+ @reducer2 = MapReduce::Reducer.new type: :sync, task: "Related", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
73
85
 
74
- after do
75
- Process.kill "TERM", @pid1
76
- Process.kill "TERM", @pid2
77
- end
86
+ [["Peter", "Apple"], ["Andrew", "Peach"], ["Mary", "Plum"], ["Peter", "Lemon"], ["Andrew", "Orange"], ["Peter", "Peach"], ["Yura", "Peach"], ["Yura", "Apricot"], ["Yura", "Apple"]].each do |a|
87
+ res = @mapper1.map(*a)
88
+ res.must_equal ["ok"]
89
+ end
78
90
 
79
- it "should map and reduce some data in SYNC mode twice" do
80
- EM.synchrony do
81
- worker = MapReduce::Worker.new type: :sync, masters: ["ipc:///dev/shm/sock1.sock", "ipc:///dev/shm/sock2.sock"]
82
- 2.times do
83
- data = {}
84
- worker.map("Petr", ["Radiohead", "Muse", "R.E.M."] * ',')
85
- worker.map("Alex", ["Madonna", "Lady Gaga"] * ',')
86
- worker.map("Petr", ["Radiohead", "The Beatles", "Aquarium"] * ',')
87
- worker.map("Michael", ["Blur"] * ',')
88
- worker.map("Gosha", ["DDT", "Splin"] * ',')
89
- worker.map("Obama", ["Adele", "Rolling Stones"] * ',')
90
- worker.map_finished
91
- worker.reduce do |key, values|
92
- data[key] = values if key
91
+ @reducer1.reduce do |k, values|
92
+ values.each do |fruit|
93
+ related = values.dup
94
+ related.delete fruit
95
+ related.each do |r|
96
+ @mapper2.map(fruit, r)
97
+ end
98
+ end
93
99
  end
94
- data.size.must_equal 5
95
- data["Petr"].must_equal [["Radiohead", "Muse", "R.E.M."] * ',', ["Radiohead", "The Beatles", "Aquarium"] * ',']
96
- data["Alex"].must_equal [["Madonna", "Lady Gaga"] * ',']
97
- end
98
100
 
99
- EM.stop
101
+ fruits = {}
102
+ @reducer2.reduce do |fruit, related|
103
+ fruits[fruit] ||= []
104
+ fruits[fruit].push(*related)
105
+ end
106
+
107
+ fruits["Apple"].must_equal ["Apricot", "Lemon", "Peach", "Peach"]
108
+ fruits["Orange"].must_equal ["Peach"]
109
+ fruits["Plum"].must_equal nil
110
+
111
+ EM.stop
112
+ end
100
113
  end
101
114
  end
102
115
  end