pmux 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/.gitignore +8 -0
  2. data/README.md +36 -0
  3. data/Rakefile +4 -0
  4. data/bin/pmux +5 -0
  5. data/lib/pmux/application.rb +166 -0
  6. data/lib/pmux/cleaner.rb +28 -0
  7. data/lib/pmux/fiber18.rb +64 -0
  8. data/lib/pmux/fixcmd.rb +25 -0
  9. data/lib/pmux/gatherer.rb +23 -0
  10. data/lib/pmux/handler.rb +262 -0
  11. data/lib/pmux/job.rb +101 -0
  12. data/lib/pmux/joblogger.rb +46 -0
  13. data/lib/pmux/mapper.rb +151 -0
  14. data/lib/pmux/mros.rb +207 -0
  15. data/lib/pmux/multi_session.rb +309 -0
  16. data/lib/pmux/pipeio.rb +19 -0
  17. data/lib/pmux/plugin.rb +23 -0
  18. data/lib/pmux/q.rb +3 -0
  19. data/lib/pmux/reducer.rb +90 -0
  20. data/lib/pmux/storage_adapter.rb +105 -0
  21. data/lib/pmux/task_dispatcher.rb +167 -0
  22. data/lib/pmux/task_queue.rb +11 -0
  23. data/lib/pmux/task_scheduler.rb +166 -0
  24. data/lib/pmux/util_daemon.rb +18 -0
  25. data/lib/pmux/util_logger.rb +137 -0
  26. data/lib/pmux/version.rb +3 -0
  27. data/lib/pmux/worker.rb +91 -0
  28. data/lib/pmux/writer.rb +19 -0
  29. data/lib/pmux.rb +27 -0
  30. data/pmux.gemspec +24 -0
  31. data/test/mock_mros.rb +284 -0
  32. data/test/mock_pipeio.rb +26 -0
  33. data/test/mock_world.rb +193 -0
  34. data/test/mock_xattr.rb +10 -0
  35. data/test/runner.rb +10 -0
  36. data/test/test_application.rb +13 -0
  37. data/test/test_fixcmd.rb +17 -0
  38. data/test/test_handler.rb +15 -0
  39. data/test/test_i_mapreduce.rb +169 -0
  40. data/test/test_i_mros.rb +28 -0
  41. data/test/test_i_msession.rb +27 -0
  42. data/test/test_job.rb +35 -0
  43. data/test/test_joblogger.rb +16 -0
  44. data/test/test_mapper.rb +60 -0
  45. data/test/test_pipeio.rb +24 -0
  46. data/test/test_storage_adapter.rb +63 -0
  47. data/test/test_task_queue.rb +87 -0
  48. data/test/test_task_scheduler.rb +39 -0
  49. data/test/txt/0.log +105 -0
  50. data/test/txt/1.log +105 -0
  51. data/test/txt/2.log +105 -0
  52. data/test/txt/3.log +105 -0
  53. data/test/txt/4.log +105 -0
  54. data/test/txt/5.log +105 -0
  55. data/test/txt/6.log +105 -0
  56. data/test/txt/7.log +105 -0
  57. data/test/txt/8.log +105 -0
  58. data/test/unittest_helper.rb +57 -0
  59. metadata +153 -0
data/lib/pmux/job.rb ADDED
@@ -0,0 +1,101 @@
+ require 'forwardable'
+
+ module Pmux
+   class Job
+     extend Forwardable
+     def_delegators :@h, :[], :[]=, :delete
+     attr_reader :tasks, :num_t, :num_r
+     attr_reader :taskhash
+     attr_reader :reducers
+
+     def initialize params, files
+       @params = params
+       @files = files
+
+       @task_id = 0
+       @num_r = @params[:num_r] || 0
+       @params[:job_name] ||= %Q{"#{@params[:mapper]}"}
+
+       @taskhash = {}
+       @done_taskhash = {}
+       @tasks = mk_tasks files
+       @num_t = @tasks.size + @num_r
+       @h = {:start_time=>Time.now,
+         :map_tasks=>@tasks.size, :reduce_tasks=>@num_r,
+       }
+     end
+
+     def mk_tasks files
+       job_id = self.id
+       files.map {|file|
+         @task_id += 1
+         @taskhash[@task_id] = {:job_id=>job_id, :task_id=>@task_id,
+           :file=>file,
+           :mapper=>@params[:mapper], #:reducer=>@params[:reducer],
+           :num_r=>@num_r, :ff=>@params[:ff],
+           :separator=>@params[:separator],
+           :hist=>[],
+         }
+       }
+     end
+
+     def mk_reducer_addrs addrs, num_r=nil
+       num_r ||= @num_r
+       step = addrs.size.to_f / num_r
+       @reducers = (0..num_r-1).map {|ind| addrs[step*ind]}
+       @reducers
+     end
+
+     def mk_reduce_tasks
+       pindex = 0
+       job_id = self.id
+       @tasks = reducers.map {|reducer_node_addr|
+         @task_id += 1
+         #task = make_reduce_task pindex, reducer_node_addr
+         task = {:pindex=>pindex, :job_id=>self.id, :task_id=>@task_id,
+           :node_addr=>reducer_node_addr,
+           :reducer=>@params[:reducer],
+           :hist=>[],
+         }
+         @taskhash[@task_id] = task
+         pindex += 1
+         task
+       }
+     end
+
+     def id
+       self.object_id.abs.to_s
+     end
+
+     def get_task_by_id task_id
+       @taskhash[task_id]
+     end
+
+     def delete_task_by_id task_id
+       if (task = @taskhash[task_id])
+         task[:status] = :DONE
+         @done_taskhash[task_id] = task
+         @taskhash.delete task_id
+       end
+     end
+
+     def completed?
+       @taskhash.empty?
+     end
+
+     def to_jlheader
+       h = {:id=>id, :files_first=>@files.first, :tasksize=>@tasks.size,
+         :params=>@params,
+         :start_time=>@h[:start_time],
+         :map_tasks=>@h[:map_tasks], :reduce_tasks=>@h[:reduce_tasks],
+         :storage_name=>@params[:storage_name],
+         :mapper=>@params[:mapper], :reducer=>@params[:reducer],
+         :num_r=>@params[:num_r],
+       }
+     end
+
+     def to_jlfooter
+       h = {:end_time=>@h[:end_time]}
+     end
+   end
+ end
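
The Job class above is plain bookkeeping: map tasks are built from the input file list, reduce tasks from the chosen reducer addresses, and a job counts as complete once its task hash has been emptied. A minimal usage sketch, inferred from the code rather than from any gem documentation (parameter values and file names are invented for illustration):

    # Illustrative only; mirrors the Job API defined above.
    require 'pmux/job'

    params = {:mapper=>'wc -l', :reducer=>'cat', :num_r=>1, :separator=>"\t"}
    job = Pmux::Job.new params, ['a.log', 'b.log']
    job.tasks.each {|t| puts "map task #{t[:task_id]} -> #{t[:file]}"}

    job.mk_reducer_addrs ['node1', 'node2']  # pick reducer nodes from an address list
    job.mk_reduce_tasks                      # replaces @tasks with the reduce tasks

    job.taskhash.keys.each {|tid| job.delete_task_by_id tid}
    puts job.completed?                      # => true once the task hash is empty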
data/lib/pmux/joblogger.rb ADDED
@@ -0,0 +1,46 @@
+ require 'yaml'
+
+ module Pmux
+   class Joblogger
+     attr_reader :path
+
+     def initialize dir, job
+       @dir = dir
+       @job = job
+       if dir and File.directory? dir
+         @path = "#{dir}/#{job.id}.yml"
+         @f = open(@path, 'w')
+       end
+     end
+
+     def dump_header
+       dump @job.to_jlheader
+     end
+
+     def dump_footer
+       dump @job.to_jlfooter
+     end
+
+     def dump obj
+       @f.print YAML.dump(obj) if @f
+     end
+
+     def sep
+       return unless @f
+       @f.puts '---'
+       @f.flush
+     end
+
+     def add key, obj
+       return unless @f
+       @f.puts "#{key}:"
+       for k, v in obj
+         @f.puts " #{k}: #{v.inspect}"
+       end
+     end
+
+     def close
+       @f.close if @f
+     end
+   end
+ end
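
Joblogger is a thin YAML writer around a Job: it opens <dir>/<job id>.yml when the directory exists and otherwise turns every call into a no-op. A hypothetical sketch (the directory and log keys are chosen only for illustration):

    # Illustrative only; assumes /tmp exists and pmux's lib dir is on the load path.
    require 'pmux/job'
    require 'pmux/joblogger'

    job = Pmux::Job.new({:mapper=>'cat', :num_r=>0}, ['input.txt'])
    logger = Pmux::Joblogger.new '/tmp', job   # writes /tmp/<job id>.yml
    logger.dump_header                         # YAML of Job#to_jlheader
    logger.add 'task_1', :status=>:DONE, :elapsed=>0.42
    logger.sep                                 # '---' document separator, then flush
    logger.close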
data/lib/pmux/mapper.rb ADDED
@@ -0,0 +1,151 @@
+ module Pmux
+   class Mapper
+     attr_accessor :tmp_dir
+     attr_accessor :num_r
+     attr_reader :ifbase
+     attr_reader :exitstatus
+
+     def initialize task, tmp_dir, loop=nil
+       @task = task
+       @tmp_dir = tmp_dir
+       @exitstatus = nil
+       job_id, task_id, fusion_id =
+         task.values_at 'job_id', 'task_id', 'fusion_id'
+       @path = task['path']
+       @num_r = task['num_r'] || 0
+       @loop = loop
+       @on_receive = nil
+       @on_success = nil
+       @on_error = nil
+       if false
+       else
+         @ifbase = "#{tmp_dir}/m#{fusion_id or task_id}"
+       end
+     end
+
+     def do_map_task; end
+     def result_size; end
+     def result_body; end
+   end
+
+   class StreamingMapper < Mapper
+     include FixCmdLine
+     CHUNK_SIZE = 8192
+
+     def on_success &block
+       @on_success = block
+     end
+     def on_error &block
+       @on_error = block
+     end
+
+     def do_map_task
+       mapper_cmd = @task['mapper'] || 'cat'
+       err_path = "#{tmp_dir}/.err.#{$$}"
+       err_msg = nil
+       if @num_r <= 1
+         cmd_line = fix_cmd_line mapper_cmd,
+           @path, "#{@ifbase}-0", err_path, tmp_dir
+         Log.debug "system: #{cmd_line}"
+         system cmd_line
+       else # @num_r >= 2
+         partitioner = TextPartitioner.new @ifbase, @num_r,
+           :separator=>@task['separator']
+         cmd_line = fix_cmd_line mapper_cmd, @path, nil, err_path, tmp_dir
+         IO.popen(cmd_line, 'r') {|io|
+           until io.eof?
+             data = io.read CHUNK_SIZE
+             partitioner.emit data
+           end
+         }
+         partitioner.close
+       end
+       @exitstatus = $?.exitstatus
+       if File.size? err_path
+         err_msg = File.read(err_path).chomp!
+         raise RuntimeError, err_msg
+       end
+       if @exitstatus > 1
+         raise RuntimeError, "failed to execute mapper: #{cmd_line}"
+       end
+       @ifbase
+     end
+
+     def do_streaming_map_task
+       mapper_cmd = @task['mapper'] || 'cat'
+       err_path = "#{tmp_dir}/.err.#{object_id}"
+       err_msg = nil
+       if @num_r <= 1
+         cmd_line = fix_cmd_line mapper_cmd,
+           @path, nil, err_path, tmp_dir
+         Log.debug "pipe: #{cmd_line}"
+         pipeio = PipeIO.new cmd_line
+         out = open("#{@ifbase}-0", 'a')
+         pipeio.on_receive {|data| out.write data}
+       else # @num_r >= 2
+         partitioner = TextPartitioner.new @ifbase, @num_r,
+           :separator=>@task['separator']
+         cmd_line = fix_cmd_line mapper_cmd, @path, nil, err_path, tmp_dir
+         pipeio = PipeIO.new cmd_line
+         pipeio.on_receive {|data| partitioner.emit data}
+       end
+       on_success = @on_success
+       on_error = @on_error
+       pipeio.on_close {
+         if out
+           out.close rescue nil
+         end
+         if partitioner
+           partitioner.close
+         end
+         #@exitstatus = $?.exitstatus
+         if File.size? err_path
+           err_msg = File.read(err_path).chomp!
+           e = RuntimeError.new err_msg
+           e.set_backtrace ['mapper']
+           on_error.call e if on_error
+         else
+           on_success.call if on_success
+         end
+       }
+       @loop.attach pipeio
+     end
+
+     def result_size
+       @num_r <= 1 ? File.size?("#{@ifbase}-0") : nil
+     end
+
+     def result_body
+       @num_r <= 1 ? File.read("#{@ifbase}-0") : nil
+     end
+   end
+
+   class TextPartitioner
+     def initialize ifbase, num_r, options={}
+       @ifbase = ifbase
+       @num_r = num_r
+       @ifiles = (0..(num_r-1)).map {|n| open("#{ifbase}-#{n}", 'w')}
+       @rbuf = ''
+       if (sep = options[:separator])
+         @separator_re = Regexp.new sep
+       else
+         @separator_re = /\t/
+       end
+     end
+
+     def emit data
+       @rbuf << data
+       while true
+         break unless @rbuf =~ /\n/
+         line, s = @rbuf.split /^/, 2
+         key, = line.split @separator_re, 2
+         @ifiles[key.hash % @num_r].write line
+         @rbuf.replace(s || '')
+       end
+     end
+
+     def close
+       @ifiles.each {|io| io.close}
+     end
+   end
+ end
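
Of the three classes above, TextPartitioner is the piece that decides which reduce partition a mapped line belongs to: it buffers incoming chunks, and for every complete line hashes the text before the first separator and appends the line to one of num_r intermediate files. A standalone sketch (paths and input are examples; requiring the top-level 'pmux' file is assumed to pull in mapper.rb and its dependencies):

    # Illustrative only.
    require 'pmux'

    part = Pmux::TextPartitioner.new '/tmp/m1', 2, :separator=>"\t"
    part.emit "alpha\t1\nbeta\t2\n"   # complete lines are routed immediately
    part.emit "gam"                   # a partial line stays buffered...
    part.emit "ma\t3\n"               # ...until its newline arrives
    part.close                        # /tmp/m1-0 and /tmp/m1-1 now hold the partitions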
data/lib/pmux/mros.rb ADDED
@@ -0,0 +1,207 @@
+ # msgpack-rpc over ssh
+
+ require 'net/ssh'
+ if require 'msgpack/rpc'
+   require 'msgpack/rpc/transport/unix'
+   MR = MessagePack::RPC
+ end
+ if defined? Rev
+   Coolio = Rev
+ end
+ require 'net/scp'
+
+ class Net::SSH::Compat
+   class Watcher < Coolio::IOWatcher
+     def initialize ruby_io, flags, fiber, watchers
+       @ruby_io = ruby_io
+       @fiber = fiber
+       @watchers = watchers
+       super ruby_io, flags
+     end
+
+     def on_readable
+       cleanup
+       @fiber.resume([[@ruby_io]])
+     end
+     def on_writable
+       cleanup
+       @fiber.resume([[], [@ruby_io]])
+     end
+     def cleanup
+       for w in @watchers
+         w.detach
+       end
+     end
+   end
+
+   class TimerWatcher < Coolio::TimerWatcher
+     def initialize interval, fiber, watchers
+       @fiber = fiber
+       @watchers = watchers
+       super(interval, false)
+     end
+
+     def on_timer
+       for w in @watchers; w.detach; end
+       @fiber.resume(nil)
+     end
+   end
+
+   class <<self
+     @@loop = Coolio::Loop.default
+
+     def coolio_loop
+       @@loop
+     end
+     def coolio_loop=(loop)
+       @@loop = loop
+     end
+
+     alias :io_select0 :io_select
+     def io_select reads, writes=nil, excepts=nil, timeout=nil
+       if timeout and timeout.zero?
+         io_select0 reads, writes, excepts, 0
+       else
+         writes ||= []
+         loop = @@loop
+
+         watchers = []
+         rw = reads & writes
+         for io in (reads | writes)
+           if rw.include? io
+             flag = :rw
+           elsif reads.include? io
+             flag = :r
+           else
+             flag = :w
+           end
+           watcher = Watcher.new io, flag, Fiber.current, watchers
+           watchers.push watcher
+           watcher.attach loop
+         end
+         if timeout
+           watcher = TimerWatcher.new timeout, Fiber.current, watchers
+           watchers.push watcher
+           watcher.attach loop
+         end
+         Fiber.yield
+       end
+     end
+   end
+ end
+
+ class Net::SSH::Connection::Session
+   attr_accessor :coolio_loop
+
+   def floop wait=nil
+     while true
+       break if closed?
+       #break unless busy?
+       loop wait
+       @coolio_loop.waitings[Fiber.current] = self
+       Fiber.yield
+     end
+   end
+ end
+
+ module Coolio
+   class Loop
+     attr_reader :waitings
+     def start_ssh *args, &block
+       @waitings ||= {}
+       fiber = Fiber.new {
+         begin
+           ssh = Net::SSH.start *args
+           ssh.coolio_loop = self
+           block.call ssh
+         rescue => e
+           block.call e
+         end
+       }
+       fiber.resume
+     end
+
+     def run
+       raise RuntimeError, "no watchers for this loop" if @watchers.empty?
+
+       @running = true
+       while @running and not @active_watchers.zero?
+         run_once
+         if @waitings and !@waitings.empty?
+           busy_sessions = @waitings.select {|f, s| s.busy?}
+           for fiber, ssh in busy_sessions
+             @waitings.delete fiber
+             fiber.resume
+           end
+         end
+       end
+       @running = false
+     end
+
+     def set_timer(interval, repeating=false, &block)
+       timer = TimerWatcher.new interval, repeating
+       timer.on_timer {
+         block.call
+         timer.detach unless repeating
+       }
+       timer.attach self
+     end
+   end
+ end
+
+ module MR
+   class PipeTransport
+     def initialize ruby_in, ruby_out, ruby_err
+       @ruby_in = ruby_in
+       @ruby_out = ruby_out
+       @ruby_err = ruby_err
+     end
+
+     def listen server
+       pout = PipeOut.new @ruby_out
+       pin = PipeIn.new @ruby_in, server, pout
+       perr = PipeOut.new @ruby_err
+       server.loop.attach pout
+       server.loop.attach pin
+       server.loop.attach perr
+     end
+
+     def close
+     end
+
+     class PipeOut < Coolio::IO
+       def initialize ruby_io
+         @ruby_io = ruby_io
+         super ruby_io
+       end
+
+       def on_readable
+       end
+
+       def send_data data
+         @ruby_io.write data
+       end
+     end
+
+     class PipeIn < Coolio::IO
+       include MR::MessageReceiver
+
+       def initialize ruby_io, server, pout
+         @ruby_io = ruby_io
+         super ruby_io
+         @server = server
+         @pout = pout
+         @pac = MessagePack::Unpacker.new
+       end
+       def on_read data
+         @pac.feed_each(data) {|obj| on_message obj}
+       end
+       def on_request msgid, method, param
+         @server.on_request @pout, msgid, method, param
+       end
+       def on_close
+         on_request 0, 'quit', []
+       end
+     end
+   end
+ end
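
mros.rb wires msgpack-rpc and Net::SSH into the Coolio (cool.io) event loop: io_select is rerouted through Coolio watchers so an SSH handshake started by Coolio::Loop#start_ssh can yield its Fiber instead of blocking, Session#floop repeatedly pumps a session and parks its Fiber on the loop's waiting list, and the patched run resumes parked sessions while they are still busy. The small set_timer helper is the easiest piece to exercise in isolation; a hedged sketch (interval and message are arbitrary, and the gem's dependencies such as net-ssh, net-scp and msgpack-rpc must be installed):

    # Illustrative only.
    require 'pmux/mros'

    evloop = Coolio::Loop.default
    evloop.set_timer(1.0) { puts 'one-shot timer fired' }  # detaches itself after firing
    evloop.run   # the patched run returns once no watchers remain attached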