pwrake 2.0.1 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +22 -9
- data/bin/gfwhere-pipe +33 -9
- data/bin/pwrake +5 -2
- data/bin/pwrake_branch +5 -3
- data/lib/pwrake/branch/branch.rb +95 -86
- data/lib/pwrake/branch/branch_application.rb +4 -0
- data/lib/pwrake/branch/communicator.rb +173 -0
- data/lib/pwrake/branch/communicator_set.rb +100 -0
- data/lib/pwrake/branch/fiber_queue.rb +10 -0
- data/lib/pwrake/branch/shell.rb +68 -24
- data/lib/pwrake/branch/shell_profiler.rb +2 -0
- data/lib/pwrake/gfarm/gfarm_postprocess.rb +8 -7
- data/lib/pwrake/logger.rb +5 -0
- data/lib/pwrake/master/master.rb +190 -87
- data/lib/pwrake/master/master_application.rb +8 -0
- data/lib/pwrake/nbio.rb +525 -0
- data/lib/pwrake/option/host_map.rb +36 -4
- data/lib/pwrake/option/option.rb +7 -1
- data/lib/pwrake/option/option_filesystem.rb +13 -3
- data/lib/pwrake/queue/locality_aware_queue.rb +41 -6
- data/lib/pwrake/queue/queue_array.rb +31 -11
- data/lib/pwrake/queue/task_queue.rb +15 -18
- data/lib/pwrake/report/report.rb +2 -0
- data/lib/pwrake/task/task_algorithm.rb +4 -1
- data/lib/pwrake/task/task_manager.rb +2 -0
- data/lib/pwrake/task/task_property.rb +1 -0
- data/lib/pwrake/task/task_wrapper.rb +40 -21
- data/lib/pwrake/version.rb +1 -1
- data/lib/pwrake/worker/invoker.rb +4 -29
- data/pwrake.gemspec +3 -2
- metadata +24 -12
- data/lib/pwrake/branch.rb +0 -22
- data/lib/pwrake/branch/worker_communicator.rb +0 -104
- data/lib/pwrake/iomux/channel.rb +0 -70
- data/lib/pwrake/iomux/handler.rb +0 -124
- data/lib/pwrake/iomux/handler_set.rb +0 -35
- data/lib/pwrake/iomux/runner.rb +0 -62
- data/lib/pwrake/master.rb +0 -30
@@ -1,3 +1,6 @@
|
|
1
|
+
require "pwrake/logger"
|
2
|
+
require "pwrake/branch/branch"
|
3
|
+
|
1
4
|
module Pwrake
|
2
5
|
|
3
6
|
# The TaskManager module is a mixin for managing tasks.
|
@@ -36,6 +39,7 @@ module Pwrake
|
|
36
39
|
def run_branch_in_thread(r,w,opts)
|
37
40
|
#standard_exception_handling do
|
38
41
|
@branch = Branch.new(opts,r,w)
|
42
|
+
@branch.init_logger
|
39
43
|
begin
|
40
44
|
@branch.run
|
41
45
|
rescue => e
|
@@ -0,0 +1,173 @@
|
|
1
|
+
module Pwrake
|
2
|
+
|
3
|
+
# One logical channel to a worker process: lines are written through a
# shared writer tagged with this channel's id, and read back from a
# per-channel queue.
class CommChannel

  # host   - host name this channel talks to
  # id     - channel id passed to the writer with every line
  # queue  - object responding to #deq and #halt (incoming lines)
  # writer - object responding to #put_line(line, id) and #halt
  # ios    - IO objects closed by the fault-injection hook in #put_line
  def initialize(host,id,queue,writer,ios=[])
    @host = host
    @id = id
    @queue = queue
    @writer = writer
    @ios = ios
  end

  attr_reader :host, :id

  # Sends one line through the writer, tagged with this channel's id.
  #
  # When the global $cause_fault is set, it is cleared and the IOs given
  # at construction are closed first (a fault-injection hook).
  def put_line(s)
    if $cause_fault
      $cause_fault = nil
      Log.warn("closing writer io caller=\n#{caller.join("\n")}")
      # The IO list may be shared with other channels and may hold nil
      # entries, so skip anything that is missing or already closed.
      Array(@ios).each do |io|
        next if io.nil? || io.closed?
        io.close
      end
    end
    @writer.put_line(s,@id)
  end

  # Returns the next item dequeued from this channel's queue.
  def get_line
    @queue.deq
  end

  # Halts both the incoming queue and the writer.
  def halt
    @queue.halt
    @writer.halt
  end
end
|
33
|
+
|
34
|
+
# Owns the connection to one worker process: spawns it (locally or via
# ssh), wires its stdin/stdout/stderr pipes to NBIO readers/writers, and
# tears everything down on failure.
class Communicator

  # Raised when the worker handshake produces an unusable reply.
  class ConnectError < IOError; end

  attr_reader :id, :host, :ncore, :channel
  attr_reader :reader, :writer, :handler
  attr_reader :shells

  # set      - owning set; must respond to #selector and #delete(comm)
  # id, host - identity of the worker
  # ncore    - requested core count (may be replaced by the worker's reply)
  # selector - used by #common_line for heartbeat bookkeeping
  # option   - option hash; :ssh_option is read when building the ssh command
  def initialize(set,id,host,ncore,selector,option)
    @set = set
    @id = id
    @host = host
    @ncore = @ncore_given = ncore
    @selector = selector
    @option = option
    @shells = {}
  end

  def inspect
    "#<#{self.class} @id=#{@id},@host=#{@host},@ncore=#{@ncore}>"
  end

  # Creates a CommChannel backed by a new queue obtained from the reader.
  def new_channel
    i,q = @reader.new_queue
    CommChannel.new(@host,i,q,@writer,[@ior,@iow,@ioe])
  end

  # Starts the worker process, sends it the worker code plus the
  # marshalled ncore and option values, then reads lines until the worker
  # reports "ncore:<n>".
  #
  # Returns false once ncore has been received or #common_line asked to
  # stop. Any StandardError raised on the way is passed to #dropout.
  def connect(worker_code)
    # ARGF.read(n) counts bytes, so the length must be the byte size;
    # a character count would truncate multibyte worker code and leave
    # its tail in front of the Marshal data that follows.
    rb_cmd = "ruby -e 'eval ARGF.read(#{worker_code.bytesize})'"
    if ['localhost','localhost.localdomain','127.0.0.1'].include? @host
      cmd = rb_cmd
    else
      # NOTE(review): host and ssh_option are interpolated into a shell
      # command line unescaped; they are assumed to come from trusted
      # configuration.
      cmd = "ssh -x -T #{@option[:ssh_option]} #{@host} \"#{rb_cmd}\""
    end
    spawn_worker(cmd)
    sel = @set.selector
    @reader = NBIO::MultiReader.new(sel,@ior)
    @rd_err = NBIO::Reader.new(sel,@ioe)
    @writer = NBIO::Writer.new(sel,@iow)
    @handler = NBIO::Handler.new(@reader,@writer,@host)
    #
    @writer.write(worker_code)
    @writer.write(Marshal.dump(@ncore))
    @writer.write(Marshal.dump(@option))
    # read ncore
    while s = @reader.get_line
      # Only strings can carry the ncore reply; anything else (e.g. an
      # exception object) is handed to common_line instead of being
      # matched with =~, which would raise TypeError.
      if s.is_a?(String) && /^ncore:(.*)$/ =~ s
        a = $1
        Log.debug "ncore=#{a} @#{@host}"
        if /^(\d+)$/ =~ a
          @ncore = $1.to_i
          return false
        else
          raise ConnectError, "invalid for ncore: #{a.inspect}"
        end
      else
        return false if !common_line(s)
      end
    end
    raise ConnectError, "fail to connect #{cmd.inspect}"
  rescue => e
    dropout(e)
  end

  # Handles a line that is not specific to one protocol phase.
  #
  # Returns true when reading should continue, false after "exited" or
  # after an Exception object (which also triggers #dropout).
  # Raises ConnectError for objects that are neither String nor Exception.
  def common_line(s)
    x = "Communicator#common_line(id=#{@id},host=#{@host})"
    case s
    when /^heartbeat$/
      Log.debug "#{x}: #{s.inspect}"
      @selector.heartbeat(@reader.io)
    when /^exited$/
      Log.debug "#{x}: #{s.inspect}"
      return false
    when /^log:(.*)$/
      Log.info "#{x}: log>#{$1}"
    when String
      Log.warn "#{x}: out>#{s.inspect}"
    when Exception
      Log.warn "#{x}: err>#{s.class}: #{s.message}"
      dropout(s)
      return false
    else
      raise ConnectError, "#{x}: invalid for read: #{s.inspect}"
    end
    true
  end

  # Calls finish_task_q on every shell registered in #shells.
  def finish_shells
    @shells.keys.each{|sh| sh.finish_task_q}
  end

  # Shuts this communicator down: finishes its shells, asks the handler
  # to exit, reports whatever the worker wrote to stderr, logs +exc+ if
  # given, and always removes itself from the owning set.
  def dropout(exc=nil)
    # Error output
    err_out = []
    begin
      finish_shells
      # @handler and @rd_err are only assigned inside #connect, so they
      # are nil when the connection failed before reaching that point.
      @handler.exit if @handler
      if @rd_err
        while s = @rd_err.get_line
          err_out << s
        end
      end
    rescue => e
      Log.error(Log.bt(e))
    end
    # Error output
    if !err_out.empty?
      $stderr.puts err_out.join("\n")
      Log.error((["process error output:"]+err_out).join("\n "))
    end
    # Exception
    Log.error(Log.bt(exc)) if exc
  ensure
    @set.delete(self)
  end

  # Closes the worker's stdin and echoes its remaining stdout and stderr
  # lines to standard output with "out=" / "err=" prefixes.
  def finish
    @iow.close
    while s=@ior.gets
      puts "out=#{s.chomp}"
    end
    while s=@ioe.gets
      puts "err=#{s.chomp}"
    end
  end

  private

  # Creates the three pipes and spawns +cmd+ attached to them. The
  # child-side pipe ends are closed in the parent even when spawn raises.
  def spawn_worker(cmd)
    @ior,child_out = IO.pipe
    @ioe,child_err = IO.pipe
    child_in,@iow = IO.pipe
    begin
      @pid = Kernel.spawn(cmd,:pgroup=>true,:out=>child_out,:err=>child_err,:in=>child_in)
    ensure
      [child_out,child_err,child_in].each{|io| io.close}
    end
  end

end
|
173
|
+
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
require "forwardable"
|
2
|
+
require "pwrake/branch/communicator"
|
3
|
+
|
4
|
+
module Pwrake
|
5
|
+
# Collection of Communicator objects keyed by id, plus helpers that act
# on all of them (drop, finish shells, kill, exit).
class CommunicatorSet

  extend Forwardable

  # master_rd - reader delivering the host list lines (see #create_communicators)
  # selector  - selector whose #run drives the I/O
  # option    - option hash; :heartbeat (if set) plus 15 becomes the
  #             timeout passed to selector.run
  def initialize(master_rd,selector,option)
    @master_rd = master_rd
    @selector = selector
    @option = option
    @communicators = {}
    @initial_communicators = []
    # #delete appends here and can be reached before #run is first
    # called, so the list must exist from construction.
    @error_host = []
    if hb = @option[:heartbeat]
      @heartbeat_timeout = hb + 15
    end
  end

  attr_reader :selector

  def_delegators :@communicators, :each, :each_value, :values, :size

  # Reads "host_list_begin", then "host:<id> <host> [<ncore>]" lines
  # until "host_list_end", creating one Communicator per host line.
  # Raises RuntimeError on any line that does not fit that protocol.
  def create_communicators
    Fiber.new do
      s = @master_rd.get_line
      # to_s turns an EOF (nil) into an empty string so it fails with the
      # protocol error below rather than NoMethodError.
      first = s.to_s.chomp
      if first != "host_list_begin"
        raise "Branch#setup_worker: recv=#{first} expected=host_list_begin"
      end

      while s = @master_rd.get_line
        s.chomp!
        break if s == "host_list_end"
        if /^host:(\d+) (\S+) ([+-]?\d+)?$/ =~ s
          id, host, ncore = $1,$2,$3
          ncore &&= ncore.to_i
          @communicators[id] = Communicator.new(self,id,host,ncore,@selector,@option)
        else
          raise "Branch#setup_worker: recv=#{s.chomp} expected=host:id hostname ncore"
        end
      end
    end.resume
    @selector.run(@heartbeat_timeout)
    @initial_communicators = @communicators.dup
  end

  # Registers +comm+ under its id.
  def add(comm)
    @communicators[comm.id] = comm
  end

  # Removes +comm+ and records its host in the retired-host list.
  def delete(comm)
    @communicators.delete(comm.id)
    @error_host << comm.host
  end

  # Drops the communicator with the given id, if it is still registered.
  def drop(id)
    comm = @communicators[id]
    Log.debug "drop:id=#{id} comm=#{comm.inspect} @communicators.keys=#{@communicators.keys}"
    comm.dropout if comm
  end

  # Drops every registered communicator. Iterates over a snapshot of the
  # keys because dropout removes entries from the hash.
  def drop_all
    Log.debug "drop_all"
    @communicators.keys.each do |id|
      @communicators[id].dropout
    end
  end

  def finish_shells
    Log.debug "finish_shells"
    @communicators.keys.each do |id|
      @communicators[id].finish_shells
    end
  end

  # Runs the selector and logs which hosts were retired while it ran.
  # +message+ only labels the log entry.
  def run(message)
    @error_host = []
    n1 = @communicators.size
    @selector.run(@heartbeat_timeout)
    n2 = @communicators.size
    if n1 != n2
      Log.info "# of communicators: #{n1}->#{n2} during #{message.inspect}"
      Log.info "retired hosts=[#{@error_host.join(',')}]"
    end
  end

  # Handlers of all registered communicators.
  def handler_set
    @communicators.each_value.map{|comm| comm.handler}
  end

  def kill(sig)
    NBIO::Handler.kill(handler_set,sig)
  end

  def exit
    NBIO::Handler.exit(handler_set)
  end

end
|
100
|
+
end
|
@@ -2,6 +2,9 @@ require 'fiber'
|
|
2
2
|
|
3
3
|
module Pwrake
|
4
4
|
|
5
|
+
class FiberQueueError < StandardError
|
6
|
+
end
|
7
|
+
|
5
8
|
class FiberQueue
|
6
9
|
|
7
10
|
def initialize
|
@@ -11,6 +14,9 @@ module Pwrake
|
|
11
14
|
end
|
12
15
|
|
13
16
|
def enq(x)
|
17
|
+
if @finished
|
18
|
+
raise FiberQueueError,"cannot enq to already finished queue"
|
19
|
+
end
|
14
20
|
@q.push(x)
|
15
21
|
f = @waiter.shift
|
16
22
|
f.resume if f
|
@@ -25,6 +31,10 @@ module Pwrake
|
|
25
31
|
return @q.shift
|
26
32
|
end
|
27
33
|
|
34
|
+
def deq_nonblock
|
35
|
+
@q.shift
|
36
|
+
end
|
37
|
+
|
28
38
|
def finish
|
29
39
|
@finished = true
|
30
40
|
while f = @waiter.shift
|
data/lib/pwrake/branch/shell.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'pwrake/branch/shell_profiler'
|
2
|
+
|
1
3
|
module Pwrake
|
2
4
|
|
3
5
|
class DummyMutex
|
@@ -20,15 +22,16 @@ module Pwrake
|
|
20
22
|
BY_FIBER[Fiber.current]
|
21
23
|
end
|
22
24
|
|
23
|
-
def initialize(chan,task_q,opt={})
|
25
|
+
def initialize(chan,comm,task_q,opt={})
|
24
26
|
@chan = chan
|
25
|
-
@
|
27
|
+
@id = chan.id
|
28
|
+
@host = chan.host
|
29
|
+
@comm = comm
|
26
30
|
@task_q = task_q
|
27
31
|
@lock = DummyMutex.new
|
28
|
-
@id = chan.id
|
29
|
-
#
|
30
32
|
@option = opt
|
31
33
|
@work_dir = @option[:work_dir] || Dir.pwd
|
34
|
+
@comm.shells[self] = true
|
32
35
|
end
|
33
36
|
|
34
37
|
attr_reader :id, :host, :status, :profile
|
@@ -36,11 +39,12 @@ module Pwrake
|
|
36
39
|
def open
|
37
40
|
if @opened
|
38
41
|
Log.warn "already opened: host=#{@host} id=#{@id}"
|
42
|
+
return
|
39
43
|
end
|
44
|
+
@opened = true
|
40
45
|
_puts("open")
|
41
46
|
if (s = _gets) == "open"
|
42
47
|
OPEN_LIST[__id__] = self
|
43
|
-
@opened = true
|
44
48
|
true
|
45
49
|
else
|
46
50
|
Log.error("Shell#open failed: recieve #{s.inspect}")
|
@@ -48,19 +52,24 @@ module Pwrake
|
|
48
52
|
end
|
49
53
|
end
|
50
54
|
|
51
|
-
def
|
55
|
+
def exit
|
52
56
|
if !@opened
|
53
|
-
Log.
|
57
|
+
Log.debug "already exited: host=#{@host} id=#{@id}"
|
58
|
+
return
|
54
59
|
end
|
60
|
+
@opened = false
|
55
61
|
_puts("exit")
|
56
62
|
if (s = _gets) == "exit"
|
57
63
|
OPEN_LIST.delete(__id__)
|
58
|
-
|
64
|
+
Log.debug("Shell#exit: recieve #{s.inspect}")
|
59
65
|
true
|
60
66
|
else
|
61
|
-
Log.
|
67
|
+
Log.debug("Shell#exit: recieve #{s.inspect}")
|
62
68
|
false
|
63
69
|
end
|
70
|
+
rescue IOError,Errno::EPIPE => e
|
71
|
+
Log.debug("Shell#exit: #{Log.bt(e)}")
|
72
|
+
false
|
64
73
|
end
|
65
74
|
|
66
75
|
def set_current_task(task_id,task_name)
|
@@ -106,20 +115,17 @@ module Pwrake
|
|
106
115
|
|
107
116
|
def _gets
|
108
117
|
s = @chan.get_line
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
Log.debug e
|
115
|
-
end
|
118
|
+
Log.debug "Shell#_gets(host=#{@host},id=#{@id}): #{s.inspect}"
|
119
|
+
case s
|
120
|
+
when Exception
|
121
|
+
@chan.halt
|
122
|
+
Log.error Log.bt(s)
|
116
123
|
end
|
117
124
|
s
|
118
125
|
end
|
119
126
|
|
120
127
|
def _system(cmd)
|
121
128
|
@cmd = cmd
|
122
|
-
#raise "@chan is closed" if @chan.closed?
|
123
129
|
@lock.synchronize do
|
124
130
|
_puts(cmd)
|
125
131
|
status = io_read_loop{}
|
@@ -129,11 +135,10 @@ module Pwrake
|
|
129
135
|
|
130
136
|
def _backquote(cmd)
|
131
137
|
@cmd = cmd
|
132
|
-
#raise "@chan is closed" if @chan.closed?
|
133
138
|
a = []
|
134
139
|
@lock.synchronize do
|
135
140
|
_puts(cmd)
|
136
|
-
status = io_read_loop{|x| a << x}
|
141
|
+
@status = io_read_loop{|x| a << x}
|
137
142
|
end
|
138
143
|
a.join("\n")
|
139
144
|
end
|
@@ -141,9 +146,9 @@ module Pwrake
|
|
141
146
|
def _execute(cmd,quote=nil,&block)
|
142
147
|
@cmd = cmd
|
143
148
|
if !@opened
|
144
|
-
raise "
|
149
|
+
raise "non opened"
|
145
150
|
end
|
146
|
-
status = nil
|
151
|
+
@status = nil
|
147
152
|
start_time = Time.now
|
148
153
|
begin
|
149
154
|
_puts(cmd)
|
@@ -184,6 +189,18 @@ module Pwrake
|
|
184
189
|
end
|
185
190
|
return status
|
186
191
|
end
|
192
|
+
when "exit"
|
193
|
+
msg = "Shell#io_read_loop: exit"
|
194
|
+
$stderr.puts(msg)
|
195
|
+
Log.error(msg)
|
196
|
+
@chan.halt
|
197
|
+
return "exit"
|
198
|
+
when IOError
|
199
|
+
@chan.halt
|
200
|
+
return "ioerror"
|
201
|
+
when NBIO::TimeoutError
|
202
|
+
@chan.halt
|
203
|
+
return "timeout"
|
187
204
|
end
|
188
205
|
msg = "Shell#io_read_loop: Invalid result: #{s.inspect}"
|
189
206
|
$stderr.puts(msg)
|
@@ -193,7 +210,8 @@ module Pwrake
|
|
193
210
|
|
194
211
|
public
|
195
212
|
|
196
|
-
def create_fiber(
|
213
|
+
def create_fiber(master_w)
|
214
|
+
@master_w = master_w
|
197
215
|
if !@opened
|
198
216
|
Log.warn "not opened: host=#{@host} id=#{@id}"
|
199
217
|
end
|
@@ -218,13 +236,39 @@ module Pwrake
|
|
218
236
|
Rake.application.display_error_message(e)
|
219
237
|
Log.error e
|
220
238
|
result = "taskfail:#{@id}:#{task.name}"
|
239
|
+
ensure
|
240
|
+
master_w.put_line result
|
221
241
|
end
|
222
|
-
hdl.put_line result
|
223
242
|
end
|
224
|
-
ensure
|
225
243
|
Log.debug "shell id=#{@id} fiber end"
|
244
|
+
master_w.put_line "retire:#{@comm.id}"
|
245
|
+
@comm.shells.delete(self)
|
246
|
+
exit
|
247
|
+
if @comm.shells.empty?
|
248
|
+
@comm.dropout
|
249
|
+
end
|
250
|
+
@chan.halt
|
251
|
+
rescue => e
|
252
|
+
m = Log.bt(e)
|
253
|
+
#$stderr.puts m
|
254
|
+
Log.error(m)
|
255
|
+
end
|
256
|
+
end
|
257
|
+
end
|
258
|
+
|
259
|
+
# Marks the task queue as finished and reports every task still waiting
# in it to the master as failed, then halts the channel.
#
# Raises RuntimeError if a queued entry is not of the form "<digits>:<name>".
def finish_task_q
  @task_q.finish
  #Log.debug "finish_task_q: @task_q=#{@task_q.inspect}"
  while task_str = @task_q.deq_nonblock
    if /^(\d+):(.*)$/ =~ task_str
      task_name = $2
    else
      raise RuntimeError, "invalid task_str: #{task_str}"
    end
    # Build the message once so the log line below can refer to it; the
    # previous code logged an undefined local and raised NameError here.
    result = "taskfail:#{@id}:#{task_name}"
    @master_w.put_line result
    Log.warn "unexecuted task: #{result}"
  end
  @chan.halt
end
|
229
273
|
|
230
274
|
end
|