map_reduce 0.0.1.alpha5 → 0.0.2
- data/lib/map_reduce/map_log.rb +3 -2
- data/lib/map_reduce/master.rb +4 -4
- data/lib/map_reduce/reduce_log.rb +5 -7
- data/lib/map_reduce/version.rb +1 -1
- data/spec/map_reduce/map_reduce_spec.rb +24 -0
- metadata +9 -8
- data/lib/map_reduce/master_old.rb +0 -182
- data/lib/map_reduce/worker.rb +0 -144
data/lib/map_reduce/map_log.rb CHANGED

@@ -1,6 +1,6 @@
 module MapReduce
   class MapLog
-    MAX_BUFFER_SIZE = 2 **
+    MAX_BUFFER_SIZE = 2 ** 21 # 2 MB

     def initialize(log_folder, task)
       @log_folder = log_folder
@@ -19,6 +19,7 @@ module MapReduce
       unless @log.empty?
         log_file << @log
         log_file.flush
+        @log.clear
       end
     end

@@ -35,7 +36,7 @@ module MapReduce
     def log_file
       @log_file ||= begin
         begin
-          fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{rand(1000)}.log")
+          fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{Process.pid}_#{rand(1000)}.log")
         end while File.exist?(fn)
         FileUtils.mkdir_p(@log_folder)
         File.open(fn, "a")
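Two changes here: the flush path now clears the in-memory buffer after writing it out, so the same entries are not written again on the next flush, and the log filename gains Process.pid, so two mapper processes flushing the same task in the same second cannot race for one file name. A minimal sketch of that naming loop pulled out of the class (open_map_log, log_folder and task are hypothetical stand-ins for the instance method and its state):

    require "fileutils"

    # Sketch of the collision-avoiding file naming in MapLog#log_file.
    def open_map_log(log_folder, task)
      begin
        fn = File.join(log_folder, "map_#{task}_#{Time.now.to_i}_#{Process.pid}_#{rand(1000)}.log")
      end while File.exist?(fn)   # retry until an unused name is found
      FileUtils.mkdir_p(log_folder)
      File.open(fn, "a")
    end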
data/lib/map_reduce/master.rb CHANGED

@@ -61,14 +61,14 @@ module MapReduce
         reduce_log(task, true).get_data
       end

-      reply(data, envelope)
-
       if data
         register(task, envelope, "reducer", status)
       else
         register(task, envelope, "reducer", "reduce_finished")
       end

+      reply(data, envelope)
+
       @after_reduce.call(data[0], data[1], task) if data && @after_reduce
     end

@@ -89,9 +89,9 @@ module MapReduce

     def reduce_log(task, force = false)
       @reduce_log ||= {}
-
+      @reduce_log[task] ||= MapReduce::ReduceLog.new(map_log(task), @delimiter)
       @reduce_log[task].force if force
-
+      @reduce_log[task]
     end

     def ok(envelope)
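The first hunk moves reply(data, envelope) after the reducer has been registered, so the worker's state is recorded before the reply goes out. The second hunk turns reduce_log into a lazy per-task cache: the ReduceLog for a task is built on first access and the cached instance is returned explicitly. A minimal sketch of that memoization pattern on its own (TaskLog is a hypothetical stand-in for MapReduce::ReduceLog):

    # Lazy per-task cache, mirroring Master#reduce_log after this change.
    class TaskLog
      def initialize(task)
        @task = task
      end

      def force
        @forced = true
      end
    end

    def reduce_log(task, force = false)
      @reduce_log ||= {}                        # one Hash shared by all tasks
      @reduce_log[task] ||= TaskLog.new(task)   # built only on first access
      @reduce_log[task].force if force
      @reduce_log[task]                         # always return the cached instance
    end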
data/lib/map_reduce/version.rb CHANGED

data/spec/map_reduce/map_reduce_spec.rb CHANGED

@@ -76,6 +76,30 @@ describe "MapReduce stack" do
     end
   end

+  it "should map -> reduce / reduce" do
+    EM.synchrony do
+      @mapper = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
+      @reducer = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
+
+      Fiber.new do
+        100.times do |i|
+          @mapper.map(i, 1)
+        end
+      end.resume
+      data = []
+      Fiber.new do
+        while data.size < 100
+          @reducer.reduce do |k, v|
+            data << k
+          end
+        end
+        data.sort.must_equal (0...100).to_a.map(&:to_s).sort
+
+        EM.stop
+      end.resume
+    end
+  end
+
   it "should map/reduce-map/reduce with multiple masters" do
     EM.synchrony do
       @mapper1 = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
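The new spec drives the synchronous (Fiber-based) Mapper and Reducer against two masters: one fiber emits 100 key/value pairs, another pulls them back until every key has been reduced. Outside the test suite the same calls look roughly like the sketch below. It is only a sketch: it assumes two Master processes are already listening on the given addresses, that em-synchrony is available, and it uses no API beyond the constructor options and map/reduce calls shown in the spec.

    require "em-synchrony"
    require "map_reduce"

    EM.synchrony do
      mapper  = MapReduce::Mapper.new type: :sync, task: "Fruits",
                masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
      reducer = MapReduce::Reducer.new type: :sync, task: "Fruits",
                masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]

      # Map phase: emit key/value pairs from one fiber.
      Fiber.new do
        %w[apple orange apple].each { |word| mapper.map(word, 1) }
      end.resume

      # Reduce phase: as in the spec, reduce may need to be called
      # repeatedly until every key has been yielded back.
      Fiber.new do
        reducer.reduce do |key, values|
          puts "#{key}: #{values.inspect}"
        end
        EM.stop
      end.resume
    end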
metadata CHANGED

@@ -1,15 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: map_reduce
 version: !ruby/object:Gem::Version
-  version: 0.0.
-  prerelease:
+  version: 0.0.2
+  prerelease:
 platform: ruby
 authors:
 - Petr Yanovich
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-
+date: 2013-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -92,13 +92,11 @@ files:
 - lib/map_reduce/map_log.rb
 - lib/map_reduce/mapper.rb
 - lib/map_reduce/master.rb
-- lib/map_reduce/master_old.rb
 - lib/map_reduce/reduce_log.rb
 - lib/map_reduce/reducer.rb
 - lib/map_reduce/socket/master.rb
 - lib/map_reduce/socket/req_fiber.rb
 - lib/map_reduce/version.rb
-- lib/map_reduce/worker.rb
 - map_reduce.gemspec
 - spec/map_reduce/map_reduce_spec.rb
 - spec/map_reduce/master_spec.rb
@@ -119,13 +117,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash:
+      hash: 1253589631458738880
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - ! '
+  - - ! '>='
     - !ruby/object:Gem::Version
-      version:
+      version: '0'
+      segments:
+      - 0
+      hash: 1253589631458738880
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25
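The metadata hunks record the jump from the 0.0.x prerelease to the final 0.0.2, drop the two deleted files from the gem's file list, and refresh the generated requirement hashes. To pick up this release in an application, the Gemfile entry would be:

    # Gemfile
    gem "map_reduce", "0.0.2"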
data/lib/map_reduce/master_old.rb DELETED

@@ -1,182 +0,0 @@
-require File.expand_path("../socket/master", __FILE__)
-
-module MapReduce
-  class Master
-    # How often data will be flushed to disk
-    FLUSH_TIMEOUT = 1
-    # How many lines should be parsed by one iteration of grouping
-    GROUP_LINES = 100
-    # How many seconds should we sleep if grouping is going faster then reducing
-    GROUP_TIMEOUT = 1
-    # How many keys should be stored before timeout happend
-    GROUP_MAX = 10_000
-
-    # Valid options:
-    # * socket - socket address to bind
-    #   default is 'ipc:///dev/shm/master.sock'
-    # * log_folder - folder to store recieved MAP data
-    #   default is '/tmp/mapreduce/'
-    # * workers - count of workers that will emit data.
-    #   default is :auto,
-    #   but in small jobs it is better to define in explicitly,
-    #   because if one worker will stop before others start
-    #   master will decide that map job is done and will start reducing
-    # * delimiter - master log stores data like "key{delimiter}values"
-    #   so to prevent collisions you can specify your own uniq delimiter
-    #   default is a pipe "|"
-    #
-    def initialize(opts = {})
-      # Socket addr to bind
-      @socket_addr = opts[:socket] || ::MapReduce::DEFAULT_SOCKET
-      # Folder to write logs
-      @log_folder = opts[:log_folder] || "/tmp/mapreduce/"
-      # How many MapReduce workers will emit data
-      @workers = opts[:workers] || 1
-      # Delimiter to store key/value pairs in log
-      @delimiter = opts[:delimiter] || "|"
-
-      @log = []
-      @data = []
-      @workers_envelopes = {}
-      @log_filename = File.join(@log_folder, "master-#{Process.pid}.log")
-      @sorted_log_filename = File.join(@log_folder, "master-#{Process.pid}_sorted.log")
-
-      FileUtils.mkdir_p(@log_folder)
-      FileUtils.touch(@log_filename)
-    end
-
-    # Start Eventloop
-    #
-    def run
-      EM.run do
-        # Init socket
-        master_socket
-
-        # Init flushing timer
-        flush
-      end
-    end
-
-    # Stop Eventloop
-    #
-    def stop
-      EM.stop
-    end
-
-    # Store data in log array till flush
-    #
-    def map(key, message)
-      @log << "#{key}#{@delimiter}#{message}"
-    end
-
-    # Send data back to worker.
-    # Last item in data is last unfinished session,
-    # so till the end of file reading we don't send it
-    #
-    def reduce(envelope)
-      if @data.size >= 2
-        data = @data.shift
-        data = data.flatten
-        master_socket.send_reply(data, envelope)
-      elsif @reduce_stop
-        data = @data.shift
-        data = data.flatten if data
-        master_socket.send_reply(data, envelope)
-      else
-        EM.add_timer(1) do
-          reduce(envelope)
-        end
-      end
-    end
-
-    # Openning log file for read/write
-    #
-    def log_file
-      @log_file ||= begin
-        File.open(@log_filename, "w+")
-      end
-    end
-
-    # Openning sorted log for reading
-    #
-    def sorted_log_file
-      @sorted_log_file ||= begin
-        File.open(@sorted_log_filename, "r")
-      end
-    end
-
-    # Flushing data to disk once per FLUSH_TIMEOUT seconds
-    #
-    def flush
-      if @log.any?
-        log_file << @log*"\n" << "\n"
-        log_file.flush
-        @log.clear
-      end
-
-      EM.add_timer(FLUSH_TIMEOUT) do
-        flush
-      end
-    end
-
-    # Sorting log.
-    # Linux sort is the fastest way to sort big file.
-    # Deleting original log after sort.
-    #
-    def sort
-      `sort #{@log_filename} -o #{@sorted_log_filename}`
-      FileUtils.rm(@log_filename)
-      @log_file = nil
-    end
-
-    # Start reducing part.
-    # First, flushing rest of log to disk.
-    # Then sort data.
-    # Then start to read/group data
-    #
-    def reduce!
-      flush
-      sort
-
-      iter = sorted_log_file.each_line
-      group iter
-    end
-
-    # Reading sorted data and grouping by key.
-    # If queue (@data) is growing faster then workers grad data we pause reading file.
-    #
-    def group(iter)
-      if @data.size >= GROUP_MAX
-        EM.add_timer(GROUP_TIMEOUT){ group(iter) }
-      else
-        GROUP_LINES.times do
-          line = iter.next.chomp
-          key, msg = line.split(@delimiter)
-
-          last = @data.last
-          if last && last[0] == key
-            last[1] << msg
-          else
-            @data << [key, [msg]]
-          end
-        end
-
-        EM.next_tick{ group(iter) }
-      end
-    rescue StopIteration => e
-      FileUtils.rm(@sorted_log_filename)
-      @sorted_log_file = nil
-      @reduce_stop = true
-    end
-
-    # Initializing and binding socket
-    #
-    def master_socket
-      @master_socket ||= begin
-        sock = MapReduce::Socket::Master.new self, @workers
-        sock.bind @socket_addr
-        sock
-      end
-    end
-  end
-end
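The removed Master buffered raw "key|value" lines, flushed them to a log, sorted that log with the system sort, and then streamed the sorted file back while grouping adjacent lines that share a key. The same sort-then-group idea, reduced to a standalone sketch (the method name and paths are hypothetical):

    require "fileutils"

    # Sort a "key|value"-per-line log with the external sort, then group
    # adjacent lines by key, as the deleted Master#sort / #group did.
    def group_sorted_log(log_path, sorted_path, delimiter = "|")
      system("sort", log_path, "-o", sorted_path)  # external sort scales to big files
      FileUtils.rm(log_path)

      groups = []
      File.foreach(sorted_path) do |line|
        key, msg = line.chomp.split(delimiter, 2)
        if (last = groups.last) && last[0] == key
          last[1] << msg                           # same key as the previous line
        else
          groups << [key, [msg]]                   # new key starts a new group
        end
      end
      groups                                       # => [[key, [values, ...]], ...]
    end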
data/lib/map_reduce/worker.rb DELETED

@@ -1,144 +0,0 @@
-# MapReduce Worker make two jobs:
-# First, it maps (emits) all data to masters;
-# Second, it reduces data returned form master;
-# After reducing he is ready to map data again.
-#
-module MapReduce
-  class Worker
-
-    # Valid options:
-    # * masters - socket addresses of masters,
-    #   default is 'ipc:///dev/shm/master.sock'
-    # * type - connection type:
-    #   ** :em - Eventmachine with callbacks (default)
-    #   ** :sync - Synchronous type on Fibers
-    #
-    def initialize(opts = {})
-      @socket_addrs = opts[:masters] || [::MapReduce::DEFAULT_SOCKET]
-
-      @type = opts[:type] ||= :em
-      @socket_class = case @type
-      when :em
-        require File.expand_path("../socket/worker_em", __FILE__)
-        MapReduce::Socket::WorkerEm
-      when :sync
-        require File.expand_path("../socket/worker_sync", __FILE__)
-        MapReduce::Socket::WorkerSync
-      else
-        fail "Wrong Connection type. Choose :em or :sync, not #{opts[:type]}"
-      end
-    end
-
-    # Sends key and value to master through socket.
-    # Key can't be nil.
-    #
-    def emit(key, value, &blk)
-      fail "Key can't be nil" if key.nil?
-
-      sock = pick_map_socket(key)
-      sock.send_request(["map", key, value], &blk)
-    end
-    alias :map :emit
-
-    # Explicitly stop MAP phase.
-    # Master will wait till all workers will send "map_finished" message.
-    #
-    def map_finished(&blk)
-      all = master_sockets.size
-      resp = 0
-
-      master_sockets.each do |sock, h|
-        sock.send_request(["map_finished"]) do |msg|
-          socket_state(sock, :reduce)
-          blk.call(["ok"]) if block_given? && (resp+=1) == all
-        end
-      end
-      ["ok"]
-    end
-
-    # Reduce operation.
-    # Sends request to all masters.
-    # If master returns nil it means that he is already empty:
-    # nothing to reduce.
-    # Reducing till any socket returns data.
-    # If nothing to reduce, we return nil to client.
-    #
-    def reduce(&blk)
-      if @type == :em
-        em_reduce(&blk)
-      else
-        sync_reduce(&blk)
-      end
-    end
-
-    def sync_reduce(&blk)
-      while sock = random_reduce_socket
-        key, *values = sock.send_request(["reduce"])
-        if key.nil?
-          socket_state(sock, :map)
-        else
-          blk.call(key, values)
-        end
-      end
-    end
-
-    def em_reduce(&blk)
-      sock = random_reduce_socket
-      if sock
-        sock.send_request(["reduce"]) do |message|
-          key, *values = message
-          if key.nil?
-            socket_state(sock, :map)
-          else
-            blk.call(key, values)
-          end
-
-          em_reduce(&blk)
-        end
-      else
-        blk.call([nil])
-      end
-    end
-
-    private
-
-    # Connect to each master.
-    #
-    def master_sockets
-      @master_sockets ||= begin
-        socks = {}
-        @socket_addrs.each_with_index do |addr, i|
-          sock = @socket_class.new
-          sock.connect addr
-          socks[sock] = { state: :map, ind: i }
-        end
-        socks
-      end
-    end
-
-    # Kind of sharding
-    #
-    def pick_map_socket(key)
-      shard = if master_sockets.size > 1
-        Digest::MD5.hexdigest(key.to_s).to_i(16) % master_sockets.size
-      else
-        0
-      end
-      master_sockets.keys[shard]
-    end
-
-    # Take random socket to get reduce message.
-    # Socket should be in :reduce state.
-    #
-    def random_reduce_socket
-      master_sockets.select{ |k,v| v[:state] == :reduce }.keys.sample
-    end
-
-    # Change socket's state to :map when it is empty
-    # and to :reduce when mapping is finished
-    #
-    def socket_state(sock, state)
-      master_sockets[sock][:state] = state
-    end
-  end
-end
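The removed Worker decided which master receives a given key by hashing the key with MD5 and taking the digest modulo the number of master sockets, so repeated emits of one key always land on the same master. The same routing rule in isolation (masters here is a hypothetical array of connected master sockets):

    require "digest/md5"

    # Key-to-master sharding as in the deleted Worker#pick_map_socket.
    def pick_master(masters, key)
      return masters.first if masters.size == 1
      shard = Digest::MD5.hexdigest(key.to_s).to_i(16) % masters.size
      masters[shard]
    end

    # The same key always routes to the same master:
    # pick_master(masters, "apple") == pick_master(masters, "apple")  # => true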