pwrake 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -9
- data/bin/gfwhere-pipe +33 -9
- data/bin/pwrake +5 -2
- data/bin/pwrake_branch +5 -3
- data/lib/pwrake/branch/branch.rb +95 -86
- data/lib/pwrake/branch/branch_application.rb +4 -0
- data/lib/pwrake/branch/communicator.rb +173 -0
- data/lib/pwrake/branch/communicator_set.rb +100 -0
- data/lib/pwrake/branch/fiber_queue.rb +10 -0
- data/lib/pwrake/branch/shell.rb +68 -24
- data/lib/pwrake/branch/shell_profiler.rb +2 -0
- data/lib/pwrake/gfarm/gfarm_postprocess.rb +8 -7
- data/lib/pwrake/logger.rb +5 -0
- data/lib/pwrake/master/master.rb +190 -87
- data/lib/pwrake/master/master_application.rb +8 -0
- data/lib/pwrake/nbio.rb +525 -0
- data/lib/pwrake/option/host_map.rb +36 -4
- data/lib/pwrake/option/option.rb +7 -1
- data/lib/pwrake/option/option_filesystem.rb +13 -3
- data/lib/pwrake/queue/locality_aware_queue.rb +41 -6
- data/lib/pwrake/queue/queue_array.rb +31 -11
- data/lib/pwrake/queue/task_queue.rb +15 -18
- data/lib/pwrake/report/report.rb +2 -0
- data/lib/pwrake/task/task_algorithm.rb +4 -1
- data/lib/pwrake/task/task_manager.rb +2 -0
- data/lib/pwrake/task/task_property.rb +1 -0
- data/lib/pwrake/task/task_wrapper.rb +40 -21
- data/lib/pwrake/version.rb +1 -1
- data/lib/pwrake/worker/invoker.rb +4 -29
- data/pwrake.gemspec +3 -2
- metadata +24 -12
- data/lib/pwrake/branch.rb +0 -22
- data/lib/pwrake/branch/worker_communicator.rb +0 -104
- data/lib/pwrake/iomux/channel.rb +0 -70
- data/lib/pwrake/iomux/handler.rb +0 -124
- data/lib/pwrake/iomux/handler_set.rb +0 -35
- data/lib/pwrake/iomux/runner.rb +0 -62
- data/lib/pwrake/master.rb +0 -30
@@ -1,3 +1,6 @@
|
|
1
|
+
require "pwrake/logger"
|
2
|
+
require "pwrake/branch/branch"
|
3
|
+
|
1
4
|
module Pwrake
|
2
5
|
|
3
6
|
# The TaskManager module is a mixin for managing tasks.
|
@@ -36,6 +39,7 @@ module Pwrake
|
|
36
39
|
def run_branch_in_thread(r,w,opts)
|
37
40
|
#standard_exception_handling do
|
38
41
|
@branch = Branch.new(opts,r,w)
|
42
|
+
@branch.init_logger
|
39
43
|
begin
|
40
44
|
@branch.run
|
41
45
|
rescue => e
|
@@ -0,0 +1,173 @@
|
|
1
|
+
module Pwrake
|
2
|
+
|
3
|
+
class CommChannel
|
4
|
+
|
5
|
+
def initialize(host,id,queue,writer,ios=[])
|
6
|
+
@host = host
|
7
|
+
@id = id
|
8
|
+
@queue = queue
|
9
|
+
@writer = writer
|
10
|
+
@ios = ios
|
11
|
+
end
|
12
|
+
|
13
|
+
attr_reader :host, :id
|
14
|
+
|
15
|
+
def put_line(s)
|
16
|
+
if $cause_fault
|
17
|
+
$cause_fault = nil
|
18
|
+
Log.warn("closing writer io caller=\n#{caller.join("\n")}")
|
19
|
+
@ios.each{|io| io.close}
|
20
|
+
end
|
21
|
+
@writer.put_line(s,@id)
|
22
|
+
end
|
23
|
+
|
24
|
+
def get_line
|
25
|
+
@queue.deq
|
26
|
+
end
|
27
|
+
|
28
|
+
def halt
|
29
|
+
@queue.halt
|
30
|
+
@writer.halt
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
class Communicator
|
35
|
+
|
36
|
+
class ConnectError < IOError; end
|
37
|
+
|
38
|
+
attr_reader :id, :host, :ncore, :channel
|
39
|
+
attr_reader :reader, :writer, :handler
|
40
|
+
attr_reader :shells
|
41
|
+
|
42
|
+
def initialize(set,id,host,ncore,selector,option)
|
43
|
+
@set = set
|
44
|
+
@id = id
|
45
|
+
@host = host
|
46
|
+
@ncore = @ncore_given = ncore
|
47
|
+
@selector = selector
|
48
|
+
@option = option
|
49
|
+
@shells = {}
|
50
|
+
end
|
51
|
+
|
52
|
+
def inspect
|
53
|
+
"#<#{self.class} @id=#{@id},@host=#{@host},@ncore=#{@ncore}>"
|
54
|
+
end
|
55
|
+
|
56
|
+
def new_channel
|
57
|
+
i,q = @reader.new_queue
|
58
|
+
CommChannel.new(@host,i,q,@writer,[@ior,@iow,@ioe])
|
59
|
+
end
|
60
|
+
|
61
|
+
def connect(worker_code)
|
62
|
+
rb_cmd = "ruby -e 'eval ARGF.read(#{worker_code.size})'"
|
63
|
+
if ['localhost','localhost.localdomain','127.0.0.1'].include? @host
|
64
|
+
#if /^localhost/ =~ @host
|
65
|
+
cmd = rb_cmd
|
66
|
+
else
|
67
|
+
cmd = "ssh -x -T #{@option[:ssh_option]} #{@host} \"#{rb_cmd}\""
|
68
|
+
end
|
69
|
+
#
|
70
|
+
@ior,w0 = IO.pipe
|
71
|
+
@ioe,w1 = IO.pipe
|
72
|
+
r2,@iow = IO.pipe
|
73
|
+
@pid = Kernel.spawn(cmd,:pgroup=>true,:out=>w0,:err=>w1,:in=>r2)
|
74
|
+
w0.close
|
75
|
+
w1.close
|
76
|
+
r2.close
|
77
|
+
sel = @set.selector
|
78
|
+
@reader = NBIO::MultiReader.new(sel,@ior)
|
79
|
+
@rd_err = NBIO::Reader.new(sel,@ioe)
|
80
|
+
@writer = NBIO::Writer.new(sel,@iow)
|
81
|
+
@handler = NBIO::Handler.new(@reader,@writer,@host)
|
82
|
+
#
|
83
|
+
@writer.write(worker_code)
|
84
|
+
@writer.write(Marshal.dump(@ncore))
|
85
|
+
@writer.write(Marshal.dump(@option))
|
86
|
+
# read ncore
|
87
|
+
while s = @reader.get_line
|
88
|
+
if /^ncore:(.*)$/ =~ s
|
89
|
+
a = $1
|
90
|
+
Log.debug "ncore=#{a} @#{@host}"
|
91
|
+
if /^(\d+)$/ =~ a
|
92
|
+
@ncore = $1.to_i
|
93
|
+
return false
|
94
|
+
else
|
95
|
+
raise ConnectError, "invalid for ncore: #{a.inspect}"
|
96
|
+
end
|
97
|
+
else
|
98
|
+
return false if !common_line(s)
|
99
|
+
end
|
100
|
+
end
|
101
|
+
raise ConnectError, "fail to connect #{cmd.inspect}"
|
102
|
+
rescue => e
|
103
|
+
dropout(e)
|
104
|
+
end
|
105
|
+
|
106
|
+
def common_line(s)
|
107
|
+
x = "Communicator#common_line(id=#{@id},host=#{@host})"
|
108
|
+
case s
|
109
|
+
when /^heartbeat$/
|
110
|
+
Log.debug "#{x}: #{s.inspect}"
|
111
|
+
@selector.heartbeat(@reader.io)
|
112
|
+
when /^exited$/
|
113
|
+
Log.debug "#{x}: #{s.inspect}"
|
114
|
+
return false
|
115
|
+
when /^log:(.*)$/
|
116
|
+
Log.info "#{x}: log>#{$1}"
|
117
|
+
when String
|
118
|
+
Log.warn "#{x}: out>#{s.inspect}"
|
119
|
+
when Exception
|
120
|
+
Log.warn "#{x}: err>#{s.class}: #{s.message}"
|
121
|
+
dropout(s)
|
122
|
+
return false
|
123
|
+
else
|
124
|
+
raise ConnectError, "#{x}: invalid for read: #{s.inspect}"
|
125
|
+
end
|
126
|
+
true
|
127
|
+
end
|
128
|
+
|
129
|
+
def finish_shells
|
130
|
+
@shells.keys.each{|sh| sh.finish_task_q}
|
131
|
+
end
|
132
|
+
|
133
|
+
def dropout(exc=nil)
|
134
|
+
# Error output
|
135
|
+
err_out = []
|
136
|
+
begin
|
137
|
+
finish_shells
|
138
|
+
@handler.exit
|
139
|
+
while s = @rd_err.get_line
|
140
|
+
err_out << s
|
141
|
+
end
|
142
|
+
rescue => e
|
143
|
+
m = Log.bt(e)
|
144
|
+
#$stderr.puts m
|
145
|
+
Log.error(m)
|
146
|
+
end
|
147
|
+
# Error output
|
148
|
+
if !err_out.empty?
|
149
|
+
$stderr.puts err_out.join("\n")
|
150
|
+
Log.error((["process error output:"]+err_out).join("\n "))
|
151
|
+
end
|
152
|
+
# Exception
|
153
|
+
if exc
|
154
|
+
m = Log.bt(exc)
|
155
|
+
#$stderr.puts m
|
156
|
+
Log.error m
|
157
|
+
end
|
158
|
+
ensure
|
159
|
+
@set.delete(self)
|
160
|
+
end
|
161
|
+
|
162
|
+
def finish
|
163
|
+
@iow.close
|
164
|
+
while s=@ior.gets
|
165
|
+
puts "out=#{s.chomp}"
|
166
|
+
end
|
167
|
+
while s=@ioe.gets
|
168
|
+
puts "err=#{s.chomp}"
|
169
|
+
end
|
170
|
+
end
|
171
|
+
|
172
|
+
end
|
173
|
+
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
require "forwardable"
|
2
|
+
require "pwrake/branch/communicator"
|
3
|
+
|
4
|
+
module Pwrake
|
5
|
+
class CommunicatorSet
|
6
|
+
|
7
|
+
extend Forwardable
|
8
|
+
|
9
|
+
def initialize(master_rd,selector,option)
|
10
|
+
@master_rd = master_rd
|
11
|
+
@selector = selector
|
12
|
+
@option = option
|
13
|
+
@communicators = {}
|
14
|
+
@initial_communicators = []
|
15
|
+
if hb = @option[:heartbeat]
|
16
|
+
@heartbeat_timeout = hb + 15
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
attr_reader :selector
|
21
|
+
|
22
|
+
def_delegators :@communicators, :each, :each_value, :values, :size
|
23
|
+
|
24
|
+
def create_communicators
|
25
|
+
Fiber.new do
|
26
|
+
s = @master_rd.get_line
|
27
|
+
if s.chomp != "host_list_begin"
|
28
|
+
raise "Branch#setup_worker: recv=#{s.chomp} expected=host_list_begin"
|
29
|
+
end
|
30
|
+
|
31
|
+
while s = @master_rd.get_line
|
32
|
+
s.chomp!
|
33
|
+
break if s == "host_list_end"
|
34
|
+
if /^host:(\d+) (\S+) ([+-]?\d+)?$/ =~ s
|
35
|
+
id, host, ncore = $1,$2,$3
|
36
|
+
ncore &&= ncore.to_i
|
37
|
+
@communicators[id] = Communicator.new(self,id,host,ncore,@selector,@option)
|
38
|
+
else
|
39
|
+
raise "Branch#setup_worker: recv=#{s.chomp} expected=host:id hostname ncore"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end.resume
|
43
|
+
@selector.run(@heartbeat_timeout)
|
44
|
+
@initial_communicators = @communicators.dup
|
45
|
+
end
|
46
|
+
|
47
|
+
def add(comm)
|
48
|
+
@communicators[comm.id] = comm
|
49
|
+
end
|
50
|
+
|
51
|
+
def delete(comm)
|
52
|
+
@communicators.delete(comm.id)
|
53
|
+
@error_host << comm.host
|
54
|
+
end
|
55
|
+
|
56
|
+
def drop(id)
|
57
|
+
comm = @communicators[id]
|
58
|
+
Log.debug "drop:id=#{id} comm=#{comm.inspect} @communicators.keys=#{@communicators.keys}"
|
59
|
+
comm.dropout if comm
|
60
|
+
end
|
61
|
+
|
62
|
+
def drop_all
|
63
|
+
Log.debug "drop_all"
|
64
|
+
@communicators.keys.each do |id|
|
65
|
+
@communicators[id].dropout
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def finish_shells
|
70
|
+
Log.debug "finish_shells"
|
71
|
+
@communicators.keys.each do |id|
|
72
|
+
@communicators[id].finish_shells
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def run(message)
|
77
|
+
@error_host = []
|
78
|
+
n1 = @communicators.size
|
79
|
+
@selector.run(@heartbeat_timeout)
|
80
|
+
n2 = @communicators.size
|
81
|
+
if n1 != n2
|
82
|
+
Log.info "# of communicators: #{n1}->#{n2} during #{message.inspect}"
|
83
|
+
Log.info "retired hosts=[#{@error_host.join(',')}]"
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
def handler_set
|
88
|
+
@communicators.each_value.map{|comm| comm.handler}
|
89
|
+
end
|
90
|
+
|
91
|
+
def kill(sig)
|
92
|
+
NBIO::Handler.kill(handler_set,sig)
|
93
|
+
end
|
94
|
+
|
95
|
+
def exit
|
96
|
+
NBIO::Handler.exit(handler_set)
|
97
|
+
end
|
98
|
+
|
99
|
+
end
|
100
|
+
end
|
@@ -2,6 +2,9 @@ require 'fiber'
|
|
2
2
|
|
3
3
|
module Pwrake
|
4
4
|
|
5
|
+
class FiberQueueError < StandardError
|
6
|
+
end
|
7
|
+
|
5
8
|
class FiberQueue
|
6
9
|
|
7
10
|
def initialize
|
@@ -11,6 +14,9 @@ module Pwrake
|
|
11
14
|
end
|
12
15
|
|
13
16
|
def enq(x)
|
17
|
+
if @finished
|
18
|
+
raise FiberQueueError,"cannot enq to already finished queue"
|
19
|
+
end
|
14
20
|
@q.push(x)
|
15
21
|
f = @waiter.shift
|
16
22
|
f.resume if f
|
@@ -25,6 +31,10 @@ module Pwrake
|
|
25
31
|
return @q.shift
|
26
32
|
end
|
27
33
|
|
34
|
+
def deq_nonblock
|
35
|
+
@q.shift
|
36
|
+
end
|
37
|
+
|
28
38
|
def finish
|
29
39
|
@finished = true
|
30
40
|
while f = @waiter.shift
|
data/lib/pwrake/branch/shell.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'pwrake/branch/shell_profiler'
|
2
|
+
|
1
3
|
module Pwrake
|
2
4
|
|
3
5
|
class DummyMutex
|
@@ -20,15 +22,16 @@ module Pwrake
|
|
20
22
|
BY_FIBER[Fiber.current]
|
21
23
|
end
|
22
24
|
|
23
|
-
def initialize(chan,task_q,opt={})
|
25
|
+
def initialize(chan,comm,task_q,opt={})
|
24
26
|
@chan = chan
|
25
|
-
@
|
27
|
+
@id = chan.id
|
28
|
+
@host = chan.host
|
29
|
+
@comm = comm
|
26
30
|
@task_q = task_q
|
27
31
|
@lock = DummyMutex.new
|
28
|
-
@id = chan.id
|
29
|
-
#
|
30
32
|
@option = opt
|
31
33
|
@work_dir = @option[:work_dir] || Dir.pwd
|
34
|
+
@comm.shells[self] = true
|
32
35
|
end
|
33
36
|
|
34
37
|
attr_reader :id, :host, :status, :profile
|
@@ -36,11 +39,12 @@ module Pwrake
|
|
36
39
|
def open
|
37
40
|
if @opened
|
38
41
|
Log.warn "already opened: host=#{@host} id=#{@id}"
|
42
|
+
return
|
39
43
|
end
|
44
|
+
@opened = true
|
40
45
|
_puts("open")
|
41
46
|
if (s = _gets) == "open"
|
42
47
|
OPEN_LIST[__id__] = self
|
43
|
-
@opened = true
|
44
48
|
true
|
45
49
|
else
|
46
50
|
Log.error("Shell#open failed: recieve #{s.inspect}")
|
@@ -48,19 +52,24 @@ module Pwrake
|
|
48
52
|
end
|
49
53
|
end
|
50
54
|
|
51
|
-
def
|
55
|
+
def exit
|
52
56
|
if !@opened
|
53
|
-
Log.
|
57
|
+
Log.debug "already exited: host=#{@host} id=#{@id}"
|
58
|
+
return
|
54
59
|
end
|
60
|
+
@opened = false
|
55
61
|
_puts("exit")
|
56
62
|
if (s = _gets) == "exit"
|
57
63
|
OPEN_LIST.delete(__id__)
|
58
|
-
|
64
|
+
Log.debug("Shell#exit: recieve #{s.inspect}")
|
59
65
|
true
|
60
66
|
else
|
61
|
-
Log.
|
67
|
+
Log.debug("Shell#exit: recieve #{s.inspect}")
|
62
68
|
false
|
63
69
|
end
|
70
|
+
rescue IOError,Errno::EPIPE => e
|
71
|
+
Log.debug("Shell#exit: #{Log.bt(e)}")
|
72
|
+
false
|
64
73
|
end
|
65
74
|
|
66
75
|
def set_current_task(task_id,task_name)
|
@@ -106,20 +115,17 @@ module Pwrake
|
|
106
115
|
|
107
116
|
def _gets
|
108
117
|
s = @chan.get_line
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
Log.debug e
|
115
|
-
end
|
118
|
+
Log.debug "Shell#_gets(host=#{@host},id=#{@id}): #{s.inspect}"
|
119
|
+
case s
|
120
|
+
when Exception
|
121
|
+
@chan.halt
|
122
|
+
Log.error Log.bt(s)
|
116
123
|
end
|
117
124
|
s
|
118
125
|
end
|
119
126
|
|
120
127
|
def _system(cmd)
|
121
128
|
@cmd = cmd
|
122
|
-
#raise "@chan is closed" if @chan.closed?
|
123
129
|
@lock.synchronize do
|
124
130
|
_puts(cmd)
|
125
131
|
status = io_read_loop{}
|
@@ -129,11 +135,10 @@ module Pwrake
|
|
129
135
|
|
130
136
|
def _backquote(cmd)
|
131
137
|
@cmd = cmd
|
132
|
-
#raise "@chan is closed" if @chan.closed?
|
133
138
|
a = []
|
134
139
|
@lock.synchronize do
|
135
140
|
_puts(cmd)
|
136
|
-
status = io_read_loop{|x| a << x}
|
141
|
+
@status = io_read_loop{|x| a << x}
|
137
142
|
end
|
138
143
|
a.join("\n")
|
139
144
|
end
|
@@ -141,9 +146,9 @@ module Pwrake
|
|
141
146
|
def _execute(cmd,quote=nil,&block)
|
142
147
|
@cmd = cmd
|
143
148
|
if !@opened
|
144
|
-
raise "
|
149
|
+
raise "non opened"
|
145
150
|
end
|
146
|
-
status = nil
|
151
|
+
@status = nil
|
147
152
|
start_time = Time.now
|
148
153
|
begin
|
149
154
|
_puts(cmd)
|
@@ -184,6 +189,18 @@ module Pwrake
|
|
184
189
|
end
|
185
190
|
return status
|
186
191
|
end
|
192
|
+
when "exit"
|
193
|
+
msg = "Shell#io_read_loop: exit"
|
194
|
+
$stderr.puts(msg)
|
195
|
+
Log.error(msg)
|
196
|
+
@chan.halt
|
197
|
+
return "exit"
|
198
|
+
when IOError
|
199
|
+
@chan.halt
|
200
|
+
return "ioerror"
|
201
|
+
when NBIO::TimeoutError
|
202
|
+
@chan.halt
|
203
|
+
return "timeout"
|
187
204
|
end
|
188
205
|
msg = "Shell#io_read_loop: Invalid result: #{s.inspect}"
|
189
206
|
$stderr.puts(msg)
|
@@ -193,7 +210,8 @@ module Pwrake
|
|
193
210
|
|
194
211
|
public
|
195
212
|
|
196
|
-
def create_fiber(
|
213
|
+
def create_fiber(master_w)
|
214
|
+
@master_w = master_w
|
197
215
|
if !@opened
|
198
216
|
Log.warn "not opened: host=#{@host} id=#{@id}"
|
199
217
|
end
|
@@ -218,13 +236,39 @@ module Pwrake
|
|
218
236
|
Rake.application.display_error_message(e)
|
219
237
|
Log.error e
|
220
238
|
result = "taskfail:#{@id}:#{task.name}"
|
239
|
+
ensure
|
240
|
+
master_w.put_line result
|
221
241
|
end
|
222
|
-
hdl.put_line result
|
223
242
|
end
|
224
|
-
ensure
|
225
243
|
Log.debug "shell id=#{@id} fiber end"
|
244
|
+
master_w.put_line "retire:#{@comm.id}"
|
245
|
+
@comm.shells.delete(self)
|
246
|
+
exit
|
247
|
+
if @comm.shells.empty?
|
248
|
+
@comm.dropout
|
249
|
+
end
|
250
|
+
@chan.halt
|
251
|
+
rescue => e
|
252
|
+
m = Log.bt(e)
|
253
|
+
#$stderr.puts m
|
254
|
+
Log.error(m)
|
255
|
+
end
|
256
|
+
end
|
257
|
+
end
|
258
|
+
|
259
|
+
def finish_task_q
|
260
|
+
@task_q.finish
|
261
|
+
#Log.debug "finish_task_q: @task_q=#{@task_q.inspect}"
|
262
|
+
while task_str = @task_q.deq_nonblock
|
263
|
+
if /^(\d+):(.*)$/ =~ task_str
|
264
|
+
task_id, task_name = $1.to_i, $2
|
265
|
+
else
|
266
|
+
raise RuntimeError, "invalid task_str: #{task_str}"
|
226
267
|
end
|
268
|
+
@master_w.put_line "taskfail:#{@id}:#{task_name}"
|
269
|
+
Log.warn "unexecuted task: #{result}"
|
227
270
|
end
|
271
|
+
@chan.halt
|
228
272
|
end
|
229
273
|
|
230
274
|
end
|