map_reduce 0.0.1.alpha5 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 module MapReduce
   class MapLog
-    MAX_BUFFER_SIZE = 2 ** 20
+    MAX_BUFFER_SIZE = 2 ** 21 # 2 MB
 
     def initialize(log_folder, task)
       @log_folder = log_folder
@@ -19,6 +19,7 @@ module MapReduce
       unless @log.empty?
         log_file << @log
         log_file.flush
+        @log.clear
       end
     end
 
@@ -35,7 +36,7 @@ module MapReduce
     def log_file
       @log_file ||= begin
         begin
-          fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{rand(1000)}.log")
+          fn = File.join(@log_folder, "map_#{@task}_#{Time.now.to_i}_#{Process.pid}_#{rand(1000)}.log")
         end while File.exist?(fn)
         FileUtils.mkdir_p(@log_folder)
         File.open(fn, "a")
@@ -61,14 +61,14 @@ module MapReduce
         reduce_log(task, true).get_data
       end
 
-      reply(data, envelope)
-
       if data
         register(task, envelope, "reducer", status)
       else
         register(task, envelope, "reducer", "reduce_finished")
       end
 
+      reply(data, envelope)
+
       @after_reduce.call(data[0], data[1], task) if data && @after_reduce
     end
 
@@ -89,9 +89,9 @@ module MapReduce
 
     def reduce_log(task, force = false)
       @reduce_log ||= {}
-      log = @reduce_log[task] ||= MapReduce::ReduceLog.new(map_log(task), @delimiter)
+      @reduce_log[task] ||= MapReduce::ReduceLog.new(map_log(task), @delimiter)
       @reduce_log[task].force if force
-      log
+      @reduce_log[task]
     end
 
     def ok(envelope)
@@ -43,13 +43,11 @@ module MapReduce
     end
 
     def log_file
-      @log_file ||= begin
-        fn = @map_log.reset
-        if fn
-          @more = true
-          sort(fn)
-          fn
-        end
+      fn = @map_log.reset
+      if fn
+        @more = true
+        sort(fn)
+        fn
       end
     end
 
@@ -1,3 +1,3 @@
 module MapReduce
-  VERSION = "0.0.1.alpha5"
+  VERSION = "0.0.2"
 end
@@ -76,6 +76,30 @@ describe "MapReduce stack" do
     end
   end
 
+  it "should map -> reduce / reduce" do
+    EM.synchrony do
+      @mapper = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
+      @reducer = MapReduce::Reducer.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
+
+      Fiber.new do
+        100.times do |i|
+          @mapper.map(i, 1)
+        end
+      end.resume
+      data = []
+      Fiber.new do
+        while data.size < 100
+          @reducer.reduce do |k, v|
+            data << k
+          end
+        end
+        data.sort.must_equal (0...100).to_a.map(&:to_s).sort
+
+        EM.stop
+      end.resume
+    end
+  end
+
   it "should map/reduce-map/reduce with multiple masters" do
     EM.synchrony do
       @mapper1 = MapReduce::Mapper.new type: :sync, task: "Fruits", masters: ["tcp://127.0.0.1:15555", "tcp://127.0.0.1:15556"]
metadata CHANGED
@@ -1,15 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: map_reduce
 version: !ruby/object:Gem::Version
-  version: 0.0.1.alpha5
-  prerelease: 6
+  version: 0.0.2
+  prerelease:
 platform: ruby
 authors:
 - Petr Yanovich
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-02 00:00:00.000000000 Z
+date: 2013-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -92,13 +92,11 @@ files:
 - lib/map_reduce/map_log.rb
 - lib/map_reduce/mapper.rb
 - lib/map_reduce/master.rb
-- lib/map_reduce/master_old.rb
 - lib/map_reduce/reduce_log.rb
 - lib/map_reduce/reducer.rb
 - lib/map_reduce/socket/master.rb
 - lib/map_reduce/socket/req_fiber.rb
 - lib/map_reduce/version.rb
-- lib/map_reduce/worker.rb
 - map_reduce.gemspec
 - spec/map_reduce/map_reduce_spec.rb
 - spec/map_reduce/master_spec.rb
@@ -119,13 +117,16 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: 3538709234591680598
+      hash: 1253589631458738880
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - ! '>'
+  - - ! '>='
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
+      segments:
+      - 0
+      hash: 1253589631458738880
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.25
@@ -1,182 +0,0 @@
-require File.expand_path("../socket/master", __FILE__)
-
-module MapReduce
-  class Master
-    # How often data will be flushed to disk
-    FLUSH_TIMEOUT = 1
-    # How many lines should be parsed by one iteration of grouping
-    GROUP_LINES = 100
-    # How many seconds we should sleep if grouping is going faster than reducing
-    GROUP_TIMEOUT = 1
-    # How many keys should be stored before a timeout happens
-    GROUP_MAX = 10_000
-
-    # Valid options:
-    # * socket - socket address to bind
-    #   default is 'ipc:///dev/shm/master.sock'
-    # * log_folder - folder to store received MAP data
-    #   default is '/tmp/mapreduce/'
-    # * workers - count of workers that will emit data.
-    #   default is :auto,
-    #   but in small jobs it is better to define it explicitly,
-    #   because if one worker stops before the others start,
-    #   the master will decide that the map job is done and will start reducing
-    # * delimiter - master log stores data like "key{delimiter}values",
-    #   so to prevent collisions you can specify your own unique delimiter;
-    #   default is a pipe "|"
-    #
-    def initialize(opts = {})
-      # Socket addr to bind
-      @socket_addr = opts[:socket] || ::MapReduce::DEFAULT_SOCKET
-      # Folder to write logs
-      @log_folder = opts[:log_folder] || "/tmp/mapreduce/"
-      # How many MapReduce workers will emit data
-      @workers = opts[:workers] || 1
-      # Delimiter to store key/value pairs in log
-      @delimiter = opts[:delimiter] || "|"
-
-      @log = []
-      @data = []
-      @workers_envelopes = {}
-      @log_filename = File.join(@log_folder, "master-#{Process.pid}.log")
-      @sorted_log_filename = File.join(@log_folder, "master-#{Process.pid}_sorted.log")
-
-      FileUtils.mkdir_p(@log_folder)
-      FileUtils.touch(@log_filename)
-    end
-
-    # Start Eventloop
-    #
-    def run
-      EM.run do
-        # Init socket
-        master_socket
-
-        # Init flushing timer
-        flush
-      end
-    end
-
-    # Stop Eventloop
-    #
-    def stop
-      EM.stop
-    end
-
-    # Store data in log array till flush
-    #
-    def map(key, message)
-      @log << "#{key}#{@delimiter}#{message}"
-    end
-
-    # Send data back to worker.
-    # The last item in data is the last unfinished session,
-    # so we don't send it till the end of file reading.
-    #
-    def reduce(envelope)
-      if @data.size >= 2
-        data = @data.shift
-        data = data.flatten
-        master_socket.send_reply(data, envelope)
-      elsif @reduce_stop
-        data = @data.shift
-        data = data.flatten if data
-        master_socket.send_reply(data, envelope)
-      else
-        EM.add_timer(1) do
-          reduce(envelope)
-        end
-      end
-    end
-
-    # Opening log file for read/write
-    #
-    def log_file
-      @log_file ||= begin
-        File.open(@log_filename, "w+")
-      end
-    end
-
-    # Opening sorted log for reading
-    #
-    def sorted_log_file
-      @sorted_log_file ||= begin
-        File.open(@sorted_log_filename, "r")
-      end
-    end
-
-    # Flushing data to disk once per FLUSH_TIMEOUT seconds
-    #
-    def flush
-      if @log.any?
-        log_file << @log*"\n" << "\n"
-        log_file.flush
-        @log.clear
-      end
-
-      EM.add_timer(FLUSH_TIMEOUT) do
-        flush
-      end
-    end
-
-    # Sorting log.
-    # Linux sort is the fastest way to sort a big file.
-    # Deleting original log after sort.
-    #
-    def sort
-      `sort #{@log_filename} -o #{@sorted_log_filename}`
-      FileUtils.rm(@log_filename)
-      @log_file = nil
-    end
-
-    # Start the reducing part.
-    # First, flush the rest of the log to disk.
-    # Then sort the data.
-    # Then start to read/group the data.
-    #
-    def reduce!
-      flush
-      sort
-
-      iter = sorted_log_file.each_line
-      group iter
-    end
-
-    # Reading sorted data and grouping by key.
-    # If the queue (@data) grows faster than workers grab data, we pause reading the file.
-    #
-    def group(iter)
-      if @data.size >= GROUP_MAX
-        EM.add_timer(GROUP_TIMEOUT){ group(iter) }
-      else
-        GROUP_LINES.times do
-          line = iter.next.chomp
-          key, msg = line.split(@delimiter)
-
-          last = @data.last
-          if last && last[0] == key
-            last[1] << msg
-          else
-            @data << [key, [msg]]
-          end
-        end
-
-        EM.next_tick{ group(iter) }
-      end
-    rescue StopIteration => e
-      FileUtils.rm(@sorted_log_filename)
-      @sorted_log_file = nil
-      @reduce_stop = true
-    end
-
-    # Initializing and binding socket
-    #
-    def master_socket
-      @master_socket ||= begin
-        sock = MapReduce::Socket::Master.new self, @workers
-        sock.bind @socket_addr
-        sock
-      end
-    end
-  end
-end
@@ -1,144 +0,0 @@
-# The MapReduce Worker does two jobs:
-# First, it maps (emits) all data to masters;
-# Second, it reduces data returned from the master;
-# After reducing it is ready to map data again.
-#
-module MapReduce
-  class Worker
-
-    # Valid options:
-    # * masters - socket addresses of masters,
-    #   default is 'ipc:///dev/shm/master.sock'
-    # * type - connection type:
-    #   ** :em - Eventmachine with callbacks (default)
-    #   ** :sync - Synchronous type on Fibers
-    #
-    def initialize(opts = {})
-      @socket_addrs = opts[:masters] || [::MapReduce::DEFAULT_SOCKET]
-
-      @type = opts[:type] ||= :em
-      @socket_class = case @type
-      when :em
-        require File.expand_path("../socket/worker_em", __FILE__)
-        MapReduce::Socket::WorkerEm
-      when :sync
-        require File.expand_path("../socket/worker_sync", __FILE__)
-        MapReduce::Socket::WorkerSync
-      else
-        fail "Wrong Connection type. Choose :em or :sync, not #{opts[:type]}"
-      end
-    end
-
-    # Sends key and value to master through socket.
-    # Key can't be nil.
-    #
-    def emit(key, value, &blk)
-      fail "Key can't be nil" if key.nil?
-
-      sock = pick_map_socket(key)
-      sock.send_request(["map", key, value], &blk)
-    end
-    alias :map :emit
-
-    # Explicitly stop MAP phase.
-    # The master will wait till all workers have sent the "map_finished" message.
-    #
-    def map_finished(&blk)
-      all = master_sockets.size
-      resp = 0
-
-      master_sockets.each do |sock, h|
-        sock.send_request(["map_finished"]) do |msg|
-          socket_state(sock, :reduce)
-          blk.call(["ok"]) if block_given? && (resp+=1) == all
-        end
-      end
-      ["ok"]
-    end
-
-    # Reduce operation.
-    # Sends request to all masters.
-    # If a master returns nil it means that it is already empty:
-    # nothing to reduce.
-    # Reducing continues while any socket returns data.
-    # If there is nothing to reduce, we return nil to the client.
-    #
-    def reduce(&blk)
-      if @type == :em
-        em_reduce(&blk)
-      else
-        sync_reduce(&blk)
-      end
-    end
-
-    def sync_reduce(&blk)
-      while sock = random_reduce_socket
-        key, *values = sock.send_request(["reduce"])
-        if key.nil?
-          socket_state(sock, :map)
-        else
-          blk.call(key, values)
-        end
-      end
-    end
-
-    def em_reduce(&blk)
-      sock = random_reduce_socket
-      if sock
-        sock.send_request(["reduce"]) do |message|
-          key, *values = message
-          if key.nil?
-            socket_state(sock, :map)
-          else
-            blk.call(key, values)
-          end
-
-          em_reduce(&blk)
-        end
-      else
-        blk.call([nil])
-      end
-    end
-
-    private
-
-    # Connect to each master.
-    #
-    def master_sockets
-      @master_sockets ||= begin
-        socks = {}
-        @socket_addrs.each_with_index do |addr, i|
-          sock = @socket_class.new
-          sock.connect addr
-          socks[sock] = { state: :map, ind: i }
-        end
-        socks
-      end
-    end
-
-    # Kind of sharding
-    #
-    def pick_map_socket(key)
-      shard = if master_sockets.size > 1
-        Digest::MD5.hexdigest(key.to_s).to_i(16) % master_sockets.size
-      else
-        0
-      end
-      master_sockets.keys[shard]
-    end
-
-    # Take random socket to get reduce message.
-    # Socket should be in :reduce state.
-    #
-    def random_reduce_socket
-      master_sockets.select{ |k,v| v[:state] == :reduce }.keys.sample
-    end
-
-    # Change socket's state to :map when it is empty
-    # and to :reduce when mapping is finished
-    #
-    def socket_state(sock, state)
-      master_sockets[sock][:state] = state
-    end
-  end
-end