map_reduce 0.0.1.alpha5 → 0.0.2

@@ -1,6 +1,6 @@
 module MapReduce
   class MapLog
-    MAX_BUFFER_SIZE = 2 ** 20
+    MAX_BUFFER_SIZE = 2 ** 21 # 2 MB
 
     def initialize(log_folder, task)
      @log_folder = log_folder
@@ -19,6 +19,7 @@ module MapReduce
       unless @log.empty?
         log_file << @log
         log_file.flush
+        @log.clear
       end
     end
 
@@ -35,7 +36,7 @@ module MapReduce
     def log_file
       @log_file ||= begin
         begin
-          fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{rand(1000)}.log")
+          fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{Process.pid}_#{rand(1000)}.log")
         end while File.exist?(fn)
         FileUtils.mkdir_p(@log_folder)
         File.open(fn, "a")
@@ -61,14 +61,14 @@ module MapReduce
         reduce_log(task, true).get_data
       end
 
-      reply(data, envelope)
-
       if data
         register(task, envelope, "reducer", status)
       else
         register(task, envelope, "reducer", "reduce_finished")
       end
 
+      reply(data, envelope)
+
       @after_reduce.call(data[0], data[1], task) if data && @after_reduce
     end
 
@@ -89,9 +89,9 @@ module MapReduce
 
     def reduce_log(task, force = false)
       @reduce_log ||= {}
-      log = @reduce_log[task] ||= MapReduce::ReduceLog.new(map_log(task), @delimiter)
+      @reduce_log[task] ||= MapReduce::ReduceLog.new(map_log(task), @delimiter)
       @reduce_log[task].force if force
-      log
+      @reduce_log[task]
     end
 
     def ok(envelope)
@@ -43,13 +43,11 @@ module MapReduce
     end
 
     def log_file
-      @log_file ||= begin
-        fn = @map_log.reset
-        if fn
-          @more = true
-          sort(fn)
-          fn
-        end
+      fn = @map_log.reset
+      if fn
+        @more = true
+        sort(fn)
+        fn
       end
     end
 
@@ -1,3 +1,3 @@
 module MapReduce
-  VERSION = "0.0.1.alpha5"
+  VERSION = "0.0.2"
 end
@@ -76,6 +76,30 @@ describe "MapReduce stack" do
     end
   end
 
+  it "should map -> reduce / reduce" do
+    EM.synchrony do
+      @mapper = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
+      @reducer = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
+
+      Fiber.new do
+        100.times do |i|
+          @mapper.map(i, 1)
+        end
+      end.resume
+      data = []
+      Fiber.new do
+        while data.size < 100
+          @reducer.reduce do |k, v|
+            data << k
+          end
+        end
+        data.sort.must_equal (0...100).to_a.map(&:to_s).sort
+
+        EM.stop
+      end.resume
+    end
+  end
+
   it "should map/reduce-map/reduce with multiple masters" do
     EM.synchrony do
       @mapper1 = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
metadata CHANGED
@@ -1,15 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: map_reduce
 version: !ruby/object:Gem::Version
-  version: 0.0.1.alpha5
-  prerelease: 6
+  version: 0.0.2
+  prerelease:
 platform: ruby
 authors:
 - Petr Yanovich
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-02 00:00:00.000000000 Z
+date: 2013-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -92,13 +92,11 @@ files:
 - lib/map_reduce/map_log.rb
 - lib/map_reduce/mapper.rb
 - lib/map_reduce/master.rb
-- lib/map_reduce/master_old.rb
 - lib/map_reduce/reduce_log.rb
 - lib/map_reduce/reducer.rb
 - lib/map_reduce/socket/master.rb
 - lib/map_reduce/socket/req_fiber.rb
 - lib/map_reduce/version.rb
-- lib/map_reduce/worker.rb
 - map_reduce.gemspec
 - spec/map_reduce/map_reduce_spec.rb
 - spec/map_reduce/master_spec.rb
@@ -119,13 +117,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 3538709234591680598
+      hash: 1253589631458738880
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - ! '>'
+  - - ! '>='
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
+      segments:
+      - 0
+      hash: 1253589631458738880
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25
@@ -1,182 +0,0 @@
-require File.expand_path("../socket/master", __FILE__)
-
-module MapReduce
-  class Master
-    # How often data will be flushed to disk
-    FLUSH_TIMEOUT = 1
-    # How many lines should be parsed by one iteration of grouping
-    GROUP_LINES = 100
-    # How many seconds should we sleep if grouping is going faster then reducing
-    GROUP_TIMEOUT = 1
-    # How many keys should be stored before timeout happend
-    GROUP_MAX = 10_000
-
-    # Valid options:
-    # * socket - socket address to bind
-    #   default is 'ipc:///dev/shm/master.sock'
-    # * log_folder - folder to store recieved MAP data
-    #   default is '/tmp/mapreduce/'
-    # * workers - count of workers that will emit data.
-    #   default is :auto,
-    #   but in small jobs it is better to define in explicitly,
-    #   because if one worker will stop before others start
-    #   master will decide that map job is done and will start reducing
-    # * delimiter - master log stores data like "key{delimiter}values"
-    #   so to prevent collisions you can specify your own uniq delimiter
-    #   default is a pipe "|"
-    #
-    def initialize(opts = {})
-      # Socket addr to bind
-      @socket_addr = opts[:socket] || ::MapReduce::DEFAULT_SOCKET
-      # Folder to write logs
-      @log_folder = opts[:log_folder] || "/tmp/mapreduce/"
-      # How many MapReduce workers will emit data
-      @workers = opts[:workers] || 1
-      # Delimiter to store key/value pairs in log
-      @delimiter = opts[:delimiter] || "|"
-
-      @log = []
-      @data = []
-      @workers_envelopes = {}
-      @log_filename = File.join(@log_folder, "master-#{Process.pid}.log")
-      @sorted_log_filename = File.join(@log_folder, "master-#{Process.pid}_sorted.log")
-
-      FileUtils.mkdir_p(@log_folder)
-      FileUtils.touch(@log_filename)
-    end
-
-    # Start Eventloop
-    #
-    def run
-      EM.run do
-        # Init socket
-        master_socket
-
-        # Init flushing timer
-        flush
-      end
-    end
-
-    # Stop Eventloop
-    #
-    def stop
-      EM.stop
-    end
-
-    # Store data in log array till flush
-    #
-    def map(key, message)
-      @log << "#{key}#{@delimiter}#{message}"
-    end
-
-    # Send data back to worker.
-    # Last item in data is last unfinished session,
-    # so till the end of file reading we don't send it
-    #
-    def reduce(envelope)
-      if @data.size >= 2
-        data = @data.shift
-        data = data.flatten
-        master_socket.send_reply(data, envelope)
-      elsif @reduce_stop
-        data = @data.shift
-        data = data.flatten if data
-        master_socket.send_reply(data, envelope)
-      else
-        EM.add_timer(1) do
-          reduce(envelope)
-        end
-      end
-    end
-
-    # Openning log file for read/write
-    #
-    def log_file
-      @log_file ||= begin
-        File.open(@log_filename, "w+")
-      end
-    end
-
-    # Openning sorted log for reading
-    #
-    def sorted_log_file
-      @sorted_log_file ||= begin
-        File.open(@sorted_log_filename, "r")
-      end
-    end
-
-    # Flushing data to disk once per FLUSH_TIMEOUT seconds
-    #
-    def flush
-      if @log.any?
-        log_file << @log*"\n" << "\n"
-        log_file.flush
-        @log.clear
-      end
-
-      EM.add_timer(FLUSH_TIMEOUT) do
-        flush
-      end
-    end
-
-    # Sorting log.
-    # Linux sort is the fastest way to sort big file.
-    # Deleting original log after sort.
-    #
-    def sort
-      `sort #{@log_filename} -o #{@sorted_log_filename}`
-      FileUtils.rm(@log_filename)
-      @log_file = nil
-    end
-
-    # Start reducing part.
-    # First, flushing rest of log to disk.
-    # Then sort data.
-    # Then start to read/group data
-    #
-    def reduce!
-      flush
-      sort
-
-      iter = sorted_log_file.each_line
-      group iter
-    end
-
-    # Reading sorted data and grouping by key.
-    # If queue (@data) is growing faster then workers grad data we pause reading file.
-    #
-    def group(iter)
-      if @data.size >= GROUP_MAX
-        EM.add_timer(GROUP_TIMEOUT){ group(iter) }
-      else
-        GROUP_LINES.times do
-          line = iter.next.chomp
-          key, msg = line.split(@delimiter)
-
-          last = @data.last
-          if last && last[0] == key
-            last[1] << msg
-          else
-            @data << [key, [msg]]
-          end
-        end
-
-        EM.next_tick{ group(iter) }
-      end
-    rescue StopIteration => e
-      FileUtils.rm(@sorted_log_filename)
-      @sorted_log_file = nil
-      @reduce_stop = true
-    end
-
-    # Initializing and binding socket
-    #
-    def master_socket
-      @master_socket ||= begin
-        sock = MapReduce::Socket::Master.new self, @workers
-        sock.bind @socket_addr
-        sock
-      end
-    end
-  end
-end
@@ -1,144 +0,0 @@
-# MapReduce Worker make two jobs:
-# First, it maps (emits) all data to masters;
-# Second, it reduces data returned form master;
-# After reducing he is ready to map data again.
-#
-module MapReduce
-  class Worker
-
-    # Valid options:
-    # * masters - socket addresses of masters,
-    #   default is 'ipc:///dev/shm/master.sock'
-    # * type - connection type:
-    # ** :em - Eventmachine with callbacks (default)
-    # ** :sync - Synchronous type on Fibers
-    #
-    def initialize(opts = {})
-      @socket_addrs = opts[:masters] || [::MapReduce::DEFAULT_SOCKET]
-
-      @type = opts[:type] ||= :em
-      @socket_class = case @type
-      when :em
-        require File.expand_path("../socket/worker_em", __FILE__)
-        MapReduce::Socket::WorkerEm
-      when :sync
-        require File.expand_path("../socket/worker_sync", __FILE__)
-        MapReduce::Socket::WorkerSync
-      else
-        fail "Wrong Connection type. Choose :em or :sync, not #{opts[:type]}"
-      end
-    end
-
-    # Sends key and value to master through socket.
-    # Key can't be nil.
-    #
-    def emit(key, value, &blk)
-      fail "Key can't be nil" if key.nil?
-
-      sock = pick_map_socket(key)
-      sock.send_request(["map", key, value], &blk)
-    end
-    alias :map :emit
-
-    # Explicitly stop MAP phase.
-    # Master will wait till all workers will send "map_finished" message.
-    #
-    def map_finished(&blk)
-      all = master_sockets.size
-      resp = 0
-
-      master_sockets.each do |sock, h|
-        sock.send_request(["map_finished"]) do |msg|
-          socket_state(sock, :reduce)
-          blk.call(["ok"]) if block_given? && (resp+=1) == all
-        end
-      end
-      ["ok"]
-    end
-
-    # Reduce operation.
-    # Sends request to all masters.
-    # If master returns nil it means that he is already empty:
-    # nothing to reduce.
-    # Reducing till any socket returns data.
-    # If nothing to reduce, we return nil to client.
-    #
-    def reduce(&blk)
-      if @type == :em
-        em_reduce(&blk)
-      else
-        sync_reduce(&blk)
-      end
-    end
-
-    def sync_reduce(&blk)
-      while sock = random_reduce_socket
-        key, *values = sock.send_request(["reduce"])
-        if key.nil?
-          socket_state(sock, :map)
-        else
-          blk.call(key, values)
-        end
-      end
-    end
-
-    def em_reduce(&blk)
-      sock = random_reduce_socket
-      if sock
-        sock.send_request(["reduce"]) do |message|
-          key, *values = message
-          if key.nil?
-            socket_state(sock, :map)
-          else
-            blk.call(key, values)
-          end
-
-          em_reduce(&blk)
-        end
-      else
-        blk.call([nil])
-      end
-    end
-
-    private
-
-    # Connect to each master.
-    #
-    def master_sockets
-      @master_sockets ||= begin
-        socks = {}
-        @socket_addrs.each_with_index do |addr, i|
-          sock = @socket_class.new
-          sock.connect addr
-          socks[sock] = { state: :map, ind: i }
-        end
-        socks
-      end
-    end
-
-    # Kind of sharding
-    #
-    def pick_map_socket(key)
-      shard = if master_sockets.size > 1
-        Digest::MD5.hexdigest(key.to_s).to_i(16) % master_sockets.size
-      else
-        0
-      end
-      master_sockets.keys[shard]
-    end
-
-    # Take random socket to get reduce message.
-    # Socket should be in :reduce state.
-    #
-    def random_reduce_socket
-      master_sockets.select{ |k,v| v[:state] == :reduce }.keys.sample
-    end
-
-    # Change socket's state to :map when it is empty
-    # and to :reduce when mapping is finished
-    #
-    def socket_state(sock, state)
-      master_sockets[sock][:state] = state
-    end
-  end
-end