map_reduce 0.0.1.alpha5 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/map_reduce/map_log.rb +3 -2
- data/lib/map_reduce/master.rb +4 -4
- data/lib/map_reduce/reduce_log.rb +5 -7
- data/lib/map_reduce/version.rb +1 -1
- data/spec/map_reduce/map_reduce_spec.rb +24 -0
- metadata +9 -8
- data/lib/map_reduce/master_old.rb +0 -182
- data/lib/map_reduce/worker.rb +0 -144
data/lib/map_reduce/map_log.rb
CHANGED

@@ -1,6 +1,6 @@
 module MapReduce
   class MapLog
-    MAX_BUFFER_SIZE = 2 **
+    MAX_BUFFER_SIZE = 2 ** 21 # 2 MB
 
     def initialize(log_folder, task)
       @log_folder = log_folder

@@ -19,6 +19,7 @@ module MapReduce
       unless @log.empty?
         log_file << @log
         log_file.flush
+        @log.clear
       end
     end
 
@@ -35,7 +36,7 @@ module MapReduce
     def log_file
       @log_file ||= begin
         begin
-          fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{rand(1000)}.log")
+          fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{Process.pid}_#{rand(1000)}.log")
         end while File.exist?(fn)
         FileUtils.mkdir_p(@log_folder)
         File.open(fn, "a")
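Taken together, the map_log.rb hunks make the in-memory buffer safe to reuse (it is cleared after each flush) and make the log filename unique per process by adding Process.pid. Below is a minimal standalone sketch of that buffering pattern; TinyMapLog and its method names are illustrative stand-ins, not the gem's actual class:

    require "fileutils"

    # Minimal sketch of a buffered log writer: append to an in-memory buffer,
    # flush it to disk, and clear it so the same lines are not written twice.
    class TinyMapLog
      def initialize(log_folder, task)
        @log_folder = log_folder
        @task       = task
        @buffer     = []
      end

      def push(line)
        @buffer << line
      end

      def flush
        return if @buffer.empty?
        log_file << @buffer.join("\n") << "\n"
        log_file.flush
        @buffer.clear # without this, every flush would rewrite the old lines
      end

      private

      # The filename includes the PID (as in the 0.0.2 change) so that several
      # mapper processes started in the same second cannot pick the same file.
      def log_file
        @log_file ||= begin
          begin
            fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{Process.pid}_#{rand(1000)}.log")
          end while File.exist?(fn)
          FileUtils.mkdir_p(@log_folder)
          File.open(fn, "a")
        end
      end
    end

    log = TinyMapLog.new("/tmp/mapreduce_demo", "fruits")
    log.push("apple|1")
    log.flush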
data/lib/map_reduce/master.rb
CHANGED

@@ -61,14 +61,14 @@
         reduce_log(task, true).get_data
       end
 
-      reply(data, envelope)
-
       if data
         register(task, envelope, "reducer", status)
       else
         register(task, envelope, "reducer", "reduce_finished")
       end
 
+      reply(data, envelope)
+
       @after_reduce.call(data[0], data[1], task) if data && @after_reduce
     end
 
@@ -89,9 +89,9 @@
 
     def reduce_log(task, force = false)
       @reduce_log ||= {}
-
+      @reduce_log[task] ||= MapReduce::ReduceLog.new(map_log(task), @delimiter)
       @reduce_log[task].force if force
-
+      @reduce_log[task]
     end
 
     def ok(envelope)
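The first master.rb hunk moves reply(data, envelope) so the reducer is registered before the reply goes out; the second turns reduce_log into a memoize-and-return accessor. A minimal sketch of that per-task memoization pattern follows, assuming a stand-in TinyReduceLog rather than the gem's MapReduce::ReduceLog:

    # A stand-in for MapReduce::ReduceLog, just enough to show the
    # memoization pattern the second master.rb hunk introduces.
    class TinyReduceLog
      def initialize(name)
        @name   = name
        @forced = false
      end

      def force
        @forced = true
      end

      def forced?
        @forced
      end
    end

    class TinyMaster
      # Build the per-task log once, optionally force it, and always return it.
      def reduce_log(task, force = false)
        @reduce_log ||= {}
        @reduce_log[task] ||= TinyReduceLog.new(task)
        @reduce_log[task].force if force
        @reduce_log[task] # explicit return value, as added in 0.0.2
      end
    end

    m = TinyMaster.new
    puts m.reduce_log("fruits").equal?(m.reduce_log("fruits")) # => true (same object, memoized)
    puts m.reduce_log("fruits", true).forced?                  # => true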
data/lib/map_reduce/version.rb
CHANGED

data/spec/map_reduce/map_reduce_spec.rb
CHANGED

@@ -76,6 +76,30 @@ describe "MapReduce stack" do
     end
   end
 
+  it "should map -> reduce / reduce" do
+    EM.synchrony do
+      @mapper = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
+      @reducer = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
+
+      Fiber.new do
+        100.times do |i|
+          @mapper.map(i, 1)
+        end
+      end.resume
+      data = []
+      Fiber.new do
+        while data.size < 100
+          @reducer.reduce do |k, v|
+            data << k
+          end
+        end
+        data.sort.must_equal (0...100).to_a.map(&:to_s).sort
+
+        EM.stop
+      end.resume
+    end
+  end
+
   it "should map/reduce-map/reduce with multiple masters" do
     EM.synchrony do
       @mapper1 = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
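The new spec drives a mapper fiber and a reducer fiber cooperatively inside one EventMachine reactor, and it asserts string keys because the master round-trips every pair through its text log. A rough sketch of the same producer/consumer shape with plain Fibers and an in-memory array standing in for the masters, so it runs without EM or ZeroMQ:

    queue = []

    producer = Fiber.new do
      100.times { |i| queue << [i.to_s, [1]] } # "map": emit key/value pairs
    end

    results = []
    consumer = Fiber.new do
      results << queue.shift.first until queue.empty? # "reduce": drain grouped pairs
    end

    producer.resume
    consumer.resume
    puts results.sort == (0...100).to_a.map(&:to_s).sort # => true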
metadata
CHANGED

@@ -1,15 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: map_reduce
 version: !ruby/object:Gem::Version
-  version: 0.0.
-  prerelease:
+  version: 0.0.2
+  prerelease:
 platform: ruby
 authors:
 - Petr Yanovich
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-
+date: 2013-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler

@@ -92,13 +92,11 @@ files:
 - lib/map_reduce/map_log.rb
 - lib/map_reduce/mapper.rb
 - lib/map_reduce/master.rb
-- lib/map_reduce/master_old.rb
 - lib/map_reduce/reduce_log.rb
 - lib/map_reduce/reducer.rb
 - lib/map_reduce/socket/master.rb
 - lib/map_reduce/socket/req_fiber.rb
 - lib/map_reduce/version.rb
-- lib/map_reduce/worker.rb
 - map_reduce.gemspec
 - spec/map_reduce/map_reduce_spec.rb
 - spec/map_reduce/master_spec.rb

@@ -119,13 +117,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash:
+      hash: 1253589631458738880
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - ! '
+  - - ! '>='
   - !ruby/object:Gem::Version
-    version:
+      version: '0'
+      segments:
+      - 0
+      hash: 1253589631458738880
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25
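The restored required_rubygems_version block is the YAML serialization of an unconstrained requirement; in Ruby it corresponds roughly to:

    require "rubygems"

    # The ">= 0" requirement accepts any RubyGems version, e.g. the 1.8.25
    # that built this gem.
    req = Gem::Requirement.new(">= 0")
    puts req.satisfied_by?(Gem::Version.new("1.8.25")) # => true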
data/lib/map_reduce/master_old.rb
DELETED

@@ -1,182 +0,0 @@
-require File.expand_path("../socket/master", __FILE__)
-
-module MapReduce
-  class Master
-    # How often data will be flushed to disk
-    FLUSH_TIMEOUT = 1
-    # How many lines should be parsed by one iteration of grouping
-    GROUP_LINES = 100
-    # How many seconds should we sleep if grouping is going faster then reducing
-    GROUP_TIMEOUT = 1
-    # How many keys should be stored before timeout happend
-    GROUP_MAX = 10_000
-
-    # Valid options:
-    # * socket - socket address to bind
-    #   default is 'ipc:///dev/shm/master.sock'
-    # * log_folder - folder to store recieved MAP data
-    #   default is '/tmp/mapreduce/'
-    # * workers - count of workers that will emit data.
-    #   default is :auto,
-    #   but in small jobs it is better to define in explicitly,
-    #   because if one worker will stop before others start
-    #   master will decide that map job is done and will start reducing
-    # * delimiter - master log stores data like "key{delimiter}values"
-    #   so to prevent collisions you can specify your own uniq delimiter
-    #   default is a pipe "|"
-    #
-    def initialize(opts = {})
-      # Socket addr to bind
-      @socket_addr = opts[:socket] || ::MapReduce::DEFAULT_SOCKET
-      # Folder to write logs
-      @log_folder = opts[:log_folder] || "/tmp/mapreduce/"
-      # How many MapReduce workers will emit data
-      @workers = opts[:workers] || 1
-      # Delimiter to store key/value pairs in log
-      @delimiter = opts[:delimiter] || "|"
-
-      @log = []
-      @data = []
-      @workers_envelopes = {}
-      @log_filename = File.join(@log_folder, "master-#{Process.pid}.log")
-      @sorted_log_filename = File.join(@log_folder, "master-#{Process.pid}_sorted.log")
-
-      FileUtils.mkdir_p(@log_folder)
-      FileUtils.touch(@log_filename)
-    end
-
-    # Start Eventloop
-    #
-    def run
-      EM.run do
-        # Init socket
-        master_socket
-
-        # Init flushing timer
-        flush
-      end
-    end
-
-    # Stop Eventloop
-    #
-    def stop
-      EM.stop
-    end
-
-    # Store data in log array till flush
-    #
-    def map(key, message)
-      @log << "#{key}#{@delimiter}#{message}"
-    end
-
-    # Send data back to worker.
-    # Last item in data is last unfinished session,
-    # so till the end of file reading we don't send it
-    #
-    def reduce(envelope)
-      if @data.size >= 2
-        data = @data.shift
-        data = data.flatten
-        master_socket.send_reply(data, envelope)
-      elsif @reduce_stop
-        data = @data.shift
-        data = data.flatten if data
-        master_socket.send_reply(data, envelope)
-      else
-        EM.add_timer(1) do
-          reduce(envelope)
-        end
-      end
-    end
-
-    # Openning log file for read/write
-    #
-    def log_file
-      @log_file ||= begin
-        File.open(@log_filename, "w+")
-      end
-    end
-
-    # Openning sorted log for reading
-    #
-    def sorted_log_file
-      @sorted_log_file ||= begin
-        File.open(@sorted_log_filename, "r")
-      end
-    end
-
-    # Flushing data to disk once per FLUSH_TIMEOUT seconds
-    #
-    def flush
-      if @log.any?
-        log_file << @log*"\n" << "\n"
-        log_file.flush
-        @log.clear
-      end
-
-      EM.add_timer(FLUSH_TIMEOUT) do
-        flush
-      end
-    end
-
-    # Sorting log.
-    # Linux sort is the fastest way to sort big file.
-    # Deleting original log after sort.
-    #
-    def sort
-      `sort #{@log_filename} -o #{@sorted_log_filename}`
-      FileUtils.rm(@log_filename)
-      @log_file = nil
-    end
-
-    # Start reducing part.
-    # First, flushing rest of log to disk.
-    # Then sort data.
-    # Then start to read/group data
-    #
-    def reduce!
-      flush
-      sort
-
-      iter = sorted_log_file.each_line
-      group iter
-    end
-
-    # Reading sorted data and grouping by key.
-    # If queue (@data) is growing faster then workers grad data we pause reading file.
-    #
-    def group(iter)
-      if @data.size >= GROUP_MAX
-        EM.add_timer(GROUP_TIMEOUT){ group(iter) }
-      else
-        GROUP_LINES.times do
-          line = iter.next.chomp
-          key, msg = line.split(@delimiter)
-
-          last = @data.last
-          if last && last[0] == key
-            last[1] << msg
-          else
-            @data << [key, [msg]]
-          end
-        end
-
-        EM.next_tick{ group(iter) }
-      end
-    rescue StopIteration => e
-      FileUtils.rm(@sorted_log_filename)
-      @sorted_log_file = nil
-      @reduce_stop = true
-    end
-
-    # Initializing and binding socket
-    #
-    def master_socket
-      @master_socket ||= begin
-        sock = MapReduce::Socket::Master.new self, @workers
-        sock.bind @socket_addr
-        sock
-      end
-    end
-  end
-end
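The deleted master_old.rb shows the previous design: buffer "key{delimiter}value" lines to a log, sort the file with the system sort, then stream it and group adjacent lines that share a key. A self-contained sketch of that grouping step over an already-sorted array (standing in for the sorted log file), not the gem's event-driven implementation:

    # Group adjacent "key|value" lines from an already-sorted source into
    # [key, [values]] pairs, the same way the old master's group() did.
    def group_sorted(lines, delimiter = "|")
      grouped = []
      lines.each do |line|
        key, msg = line.chomp.split(delimiter)
        last = grouped.last
        if last && last[0] == key
          last[1] << msg          # same key as the previous line: extend its value list
        else
          grouped << [key, [msg]] # new key: start a new group
        end
      end
      grouped
    end

    sorted = ["apple|1", "apple|3", "banana|2"] # as produced by `sort` on the log
    p group_sorted(sorted) # => [["apple", ["1", "3"]], ["banana", ["2"]]]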
data/lib/map_reduce/worker.rb
DELETED

@@ -1,144 +0,0 @@
-# MapReduce Worker make two jobs:
-# First, it maps (emits) all data to masters;
-# Second, it reduces data returned form master;
-# After reducing he is ready to map data again.
-#
-module MapReduce
-  class Worker
-
-    # Valid options:
-    # * masters - socket addresses of masters,
-    #   default is 'ipc:///dev/shm/master.sock'
-    # * type - connection type:
-    # ** :em - Eventmachine with callbacks (default)
-    # ** :sync - Synchronous type on Fibers
-    #
-    def initialize(opts = {})
-      @socket_addrs = opts[:masters] || [::MapReduce::DEFAULT_SOCKET]
-
-      @type = opts[:type] ||= :em
-      @socket_class = case @type
-      when :em
-        require File.expand_path("../socket/worker_em", __FILE__)
-        MapReduce::Socket::WorkerEm
-      when :sync
-        require File.expand_path("../socket/worker_sync", __FILE__)
-        MapReduce::Socket::WorkerSync
-      else
-        fail "Wrong Connection type. Choose :em or :sync, not #{opts[:type]}"
-      end
-    end
-
-    # Sends key and value to master through socket.
-    # Key can't be nil.
-    #
-    def emit(key, value, &blk)
-      fail "Key can't be nil" if key.nil?
-
-      sock = pick_map_socket(key)
-      sock.send_request(["map", key, value], &blk)
-    end
-    alias :map :emit
-
-    # Explicitly stop MAP phase.
-    # Master will wait till all workers will send "map_finished" message.
-    #
-    def map_finished(&blk)
-      all = master_sockets.size
-      resp = 0
-
-      master_sockets.each do |sock, h|
-        sock.send_request(["map_finished"]) do |msg|
-          socket_state(sock, :reduce)
-          blk.call(["ok"]) if block_given? && (resp+=1) == all
-        end
-      end
-      ["ok"]
-    end
-
-    # Reduce operation.
-    # Sends request to all masters.
-    # If master returns nil it means that he is already empty:
-    # nothing to reduce.
-    # Reducing till any socket returns data.
-    # If nothing to reduce, we return nil to client.
-    #
-    def reduce(&blk)
-      if @type == :em
-        em_reduce(&blk)
-      else
-        sync_reduce(&blk)
-      end
-    end
-
-    def sync_reduce(&blk)
-      while sock = random_reduce_socket
-        key, *values = sock.send_request(["reduce"])
-        if key.nil?
-          socket_state(sock, :map)
-        else
-          blk.call(key, values)
-        end
-      end
-    end
-
-    def em_reduce(&blk)
-      sock = random_reduce_socket
-      if sock
-        sock.send_request(["reduce"]) do |message|
-          key, *values = message
-          if key.nil?
-            socket_state(sock, :map)
-          else
-            blk.call(key, values)
-          end
-
-          em_reduce(&blk)
-        end
-      else
-        blk.call([nil])
-      end
-    end
-
-    private
-
-    # Connect to each master.
-    #
-    def master_sockets
-      @master_sockets ||= begin
-        socks = {}
-        @socket_addrs.each_with_index do |addr, i|
-          sock = @socket_class.new
-          sock.connect addr
-          socks[sock] = { state: :map, ind: i }
-        end
-        socks
-      end
-    end
-
-    # Kind of sharding
-    #
-    def pick_map_socket(key)
-      shard = if master_sockets.size > 1
-        Digest::MD5.hexdigest(key.to_s).to_i(16) % master_sockets.size
-      else
-        0
-      end
-      master_sockets.keys[shard]
-    end
-
-    # Take random socket to get reduce message.
-    # Socket should be in :reduce state.
-    #
-    def random_reduce_socket
-      master_sockets.select{ |k,v| v[:state] == :reduce }.keys.sample
-    end
-
-    # Change socket's state to :map when it is empty
-    # and to :reduce when mapping is finished
-    #
-    def socket_state(sock, state)
-      master_sockets[sock][:state] = state
-    end
-  end
-end
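The deleted worker routed each emitted key to a master by hashing it with MD5 and taking the remainder modulo the number of masters, so a given key always reached the same master. A small sketch of that routing, with plain address strings standing in for the gem's socket objects:

    require "digest/md5"

    # Deterministically map a key onto one of N masters: hash the key,
    # then take the hash modulo the master count.
    def pick_master(key, masters)
      return masters.first if masters.size <= 1
      shard = Digest::MD5.hexdigest(key.to_s).to_i(16) % masters.size
      masters[shard]
    end

    masters = ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
    puts pick_master("apple", masters)
    puts pick_master("apple", masters) == pick_master("apple", masters) # => true, stable routing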