pwrake 2.1.3 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +25 -12
- data/bin/pwrake-mpi +41 -0
- data/bin/pwrake-mpi-run +16 -0
- data/lib/pwrake/branch/branch.rb +17 -29
- data/lib/pwrake/branch/branch_application.rb +31 -41
- data/lib/pwrake/branch/communicator.rb +32 -11
- data/lib/pwrake/branch/communicator_set.rb +6 -0
- data/lib/pwrake/logger.rb +29 -1
- data/lib/pwrake/master/master.rb +51 -64
- data/lib/pwrake/master/master_application.rb +4 -9
- data/lib/pwrake/mpi/branch.rb +76 -0
- data/lib/pwrake/mpi/worker.rb +42 -0
- data/lib/pwrake/nbio.rb +60 -62
- data/lib/pwrake/option/host_map.rb +50 -9
- data/lib/pwrake/option/option.rb +55 -66
- data/lib/pwrake/option/option_default_filesystem.rb +48 -0
- data/lib/pwrake/option/option_gfarm.rb +1 -0
- data/lib/pwrake/option/option_gfarm2fs.rb +101 -0
- data/lib/pwrake/queue/locality_aware_queue.rb +7 -11
- data/lib/pwrake/report/task_stat.rb +4 -5
- data/lib/pwrake/task/task_wrapper.rb +57 -34
- data/lib/pwrake/version.rb +1 -1
- data/lib/pwrake/worker/executor.rb +32 -14
- data/lib/pwrake/worker/invoker.rb +61 -34
- data/lib/pwrake/worker/worker_main.rb +5 -5
- data/lib/pwrake/worker/writer.rb +27 -20
- metadata +11 -5
- data/lib/pwrake/option/option_filesystem.rb +0 -123
- data/lib/pwrake/worker/load.rb +0 -14
- data/lib/pwrake/worker/reader.rb +0 -73
@@ -6,9 +6,7 @@ module Pwrake
|
|
6
6
|
# group_map = {gid1=>[hid1,hid2,...], ...}
|
7
7
|
@size_q = 0
|
8
8
|
@q = {}
|
9
|
-
@hostinfo_by_name = {}
|
10
9
|
@hostinfo_by_id.each do |id,h|
|
11
|
-
@hostinfo_by_name[h.name] = h
|
12
10
|
@q[id] = @array_class.new(h.ncore)
|
13
11
|
end
|
14
12
|
@q_group = {}
|
@@ -26,23 +24,21 @@ module Pwrake
|
|
26
24
|
@n_turn = @disable_steal ? 1 : 2
|
27
25
|
end
|
28
26
|
|
29
|
-
|
30
27
|
def enq_impl(t)
|
31
28
|
hints = t && t.suggest_location
|
32
29
|
Log.debug "enq #{t.name} hints=#{hints.inspect}"
|
33
30
|
if hints.nil? || hints.empty?
|
34
31
|
@q_remote.push(t)
|
35
32
|
else
|
36
|
-
|
33
|
+
kv = {}
|
37
34
|
hints.each do |h|
|
38
|
-
|
39
|
-
if host_info && q = @q[host_info.id]
|
40
|
-
t.assigned.push(host_info.id)
|
41
|
-
q.push(t)
|
42
|
-
stored = true
|
43
|
-
end
|
35
|
+
HostMap.ipmatch_for_name(h).each{|id| kv[id] = true}
|
44
36
|
end
|
45
|
-
if
|
37
|
+
if !kv.empty?
|
38
|
+
kv.each_key do |id|
|
39
|
+
t.assigned.push(id)
|
40
|
+
@q[id].push(t)
|
41
|
+
end
|
46
42
|
@size_q += 1
|
47
43
|
else
|
48
44
|
@q_remote.push(t)
|
@@ -39,12 +39,10 @@ module Pwrake
|
|
39
39
|
|
40
40
|
def task_locality
|
41
41
|
file_size = {}
|
42
|
-
file_host = {}
|
43
42
|
h = {}
|
44
43
|
@task_table.each do |row|
|
45
44
|
name = row['task_name']
|
46
45
|
file_size[name] = row['file_size'].to_i
|
47
|
-
file_host[name] = (row['file_host']||'').split('|')
|
48
46
|
exec_host = row['exec_host'] || ""
|
49
47
|
h[exec_host] = true
|
50
48
|
end
|
@@ -54,15 +52,16 @@ module Pwrake
|
|
54
52
|
if row['executed']=='1'
|
55
53
|
name = row['task_name']
|
56
54
|
exec_host = row['exec_host']
|
57
|
-
loc =
|
55
|
+
loc = (row['write_loc'] == "L")
|
58
56
|
count(exec_host, loc, :out_num, 1)
|
59
57
|
count(exec_host, loc, :out_size, file_size[name])
|
60
58
|
|
61
59
|
preq_files = (row['preq']||'').split('|')
|
62
|
-
|
60
|
+
preq_loc = row['preq_loc']||''
|
61
|
+
preq_files.each_with_index do |preq,i|
|
63
62
|
sz = file_size[preq]
|
64
63
|
if sz && sz > 0
|
65
|
-
loc =
|
64
|
+
loc = (preq_loc[i] == "L")
|
66
65
|
count(exec_host, loc, :in_num, 1)
|
67
66
|
count(exec_host, loc, :in_size, sz)
|
68
67
|
end
|
@@ -25,10 +25,10 @@ module Pwrake
|
|
25
25
|
@input_file_mtime = nil
|
26
26
|
@rank = nil
|
27
27
|
@priority = nil
|
28
|
-
@lock_rank = Monitor.new
|
29
28
|
@executed = false
|
30
29
|
@assigned = []
|
31
30
|
@exec_host = nil
|
31
|
+
@exec_host_id = nil
|
32
32
|
@tried_hosts = []
|
33
33
|
@n_retry = @property.retry || Rake.application.pwrake_options["RETRY"] || 1
|
34
34
|
end
|
@@ -41,7 +41,7 @@ module Pwrake
|
|
41
41
|
attr_reader :assigned
|
42
42
|
attr_reader :tried_hosts
|
43
43
|
attr_accessor :executed
|
44
|
-
attr_accessor :exec_host
|
44
|
+
attr_accessor :exec_host, :exec_host_id
|
45
45
|
attr_accessor :shell_id, :status
|
46
46
|
|
47
47
|
def self.format_time(t)
|
@@ -53,8 +53,8 @@ module Pwrake
|
|
53
53
|
fn = File.join(dir,option['TASK_CSV_FILE'])
|
54
54
|
@@task_logger = CSV.open(fn,'w')
|
55
55
|
@@task_logger.puts %w[
|
56
|
-
task_id task_name start_time end_time elap_time preq preq_host
|
57
|
-
exec_host shell_id has_action executed file_size file_mtime file_host
|
56
|
+
task_id task_name start_time end_time elap_time preq preq_host preq_loc
|
57
|
+
exec_host shell_id has_action executed file_size file_mtime file_host write_loc
|
58
58
|
]
|
59
59
|
end
|
60
60
|
end
|
@@ -78,14 +78,14 @@ module Pwrake
|
|
78
78
|
@n_retry == 0
|
79
79
|
end
|
80
80
|
|
81
|
-
def postprocess(
|
81
|
+
def postprocess(postproc)
|
82
82
|
@executed = true if !@task.actions.empty?
|
83
83
|
#tm_taskend = Time.now
|
84
84
|
if is_file_task?
|
85
85
|
#t = Time.now
|
86
86
|
if File.exist?(name)
|
87
87
|
@file_stat = File::Stat.new(name)
|
88
|
-
@location =
|
88
|
+
@location = postproc.run(self)
|
89
89
|
end
|
90
90
|
end
|
91
91
|
#Log.debug "postprocess time=#{Time.now-tm_taskend}"
|
@@ -114,11 +114,11 @@ module Pwrake
|
|
114
114
|
def log_task
|
115
115
|
@time_end = Time.now
|
116
116
|
#
|
117
|
-
|
117
|
+
sug_host = suggest_location()
|
118
118
|
shell = Pwrake::Shell.current
|
119
119
|
#
|
120
|
-
if
|
121
|
-
Rake.application.count(
|
120
|
+
if sug_host && !sug_host.empty? && shell && !actions.empty?
|
121
|
+
Rake.application.count( sug_host, shell.host )
|
122
122
|
end
|
123
123
|
return if !@@task_logger
|
124
124
|
#
|
@@ -127,20 +127,34 @@ module Pwrake
|
|
127
127
|
RANK_STAT.add_sample(rank,elap)
|
128
128
|
end
|
129
129
|
#
|
130
|
+
# locality check
|
131
|
+
loc_na = true
|
132
|
+
preq_loc = prerequisites.map do |preq|
|
133
|
+
locs = Rake.application[preq].wrapper.location
|
134
|
+
if loc = file_locality(locs)
|
135
|
+
loc_na = false
|
136
|
+
loc
|
137
|
+
else
|
138
|
+
"n"
|
139
|
+
end
|
140
|
+
end.join("")
|
141
|
+
preq_loc = nil if loc_na
|
142
|
+
write_loc = file_locality(@location)
|
143
|
+
#
|
130
144
|
if @file_stat
|
131
|
-
fstat = [@file_stat.size, @file_stat.mtime, self.location.join('|')]
|
145
|
+
fstat = [@file_stat.size, @file_stat.mtime, self.location.join('|'), write_loc]
|
132
146
|
else
|
133
|
-
fstat = [nil]*
|
147
|
+
fstat = [nil]*4
|
134
148
|
end
|
135
149
|
#
|
136
|
-
# task_id task_name start_time end_time elap_time preq preq_host
|
137
|
-
# exec_host shell_id has_action executed file_size file_mtime file_host
|
150
|
+
# task_id task_name start_time end_time elap_time preq preq_host preq_loc
|
151
|
+
# exec_host shell_id has_action executed file_size file_mtime file_host write_loc
|
138
152
|
#
|
139
153
|
row = [ @task_id, name, @time_start, @time_end, elap,
|
140
|
-
prerequisites,
|
154
|
+
prerequisites, sug_host, preq_loc, @exec_host, @shell_id,
|
141
155
|
(actions.empty?) ? 0 : 1,
|
142
156
|
(@executed) ? 1 : 0,
|
143
|
-
|
157
|
+
] + fstat
|
144
158
|
row.map!{|x|
|
145
159
|
if x.kind_of?(Time)
|
146
160
|
TaskWrapper.format_time(x)
|
@@ -166,6 +180,17 @@ module Pwrake
|
|
166
180
|
end
|
167
181
|
end
|
168
182
|
|
183
|
+
def file_locality(nodes)
|
184
|
+
if nodes.empty? || !@exec_host_id
|
185
|
+
nil # not available
|
186
|
+
elsif nodes.any?{|node|
|
187
|
+
HostMap.ipmatch_for_name(node).include?(@exec_host_id)}
|
188
|
+
"L" # Local
|
189
|
+
else
|
190
|
+
"R" # Remote
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
169
194
|
def is_file_task?
|
170
195
|
@task.kind_of?(Rake::FileTask)
|
171
196
|
end
|
@@ -223,28 +248,26 @@ module Pwrake
|
|
223
248
|
end
|
224
249
|
|
225
250
|
def rank
|
226
|
-
|
227
|
-
if
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
max_rank = r
|
236
|
-
end
|
237
|
-
end
|
238
|
-
if has_output_file?
|
239
|
-
step = 1
|
240
|
-
else
|
241
|
-
step = 0
|
251
|
+
if @rank.nil?
|
252
|
+
if subsequents.nil? || subsequents.empty?
|
253
|
+
@rank = 0
|
254
|
+
else
|
255
|
+
max_rank = 0
|
256
|
+
subsequents.each do |subsq|
|
257
|
+
r = subsq.wrapper.rank
|
258
|
+
if max_rank < r
|
259
|
+
max_rank = r
|
242
260
|
end
|
243
|
-
@rank = max_rank + step
|
244
261
|
end
|
245
|
-
|
262
|
+
if has_output_file?
|
263
|
+
step = 1
|
264
|
+
else
|
265
|
+
step = 0
|
266
|
+
end
|
267
|
+
@rank = max_rank + step
|
246
268
|
end
|
247
|
-
|
269
|
+
Log.debug "Task[#{name}] rank=#{@rank.inspect}"
|
270
|
+
end
|
248
271
|
@rank
|
249
272
|
end
|
250
273
|
|
data/lib/pwrake/version.rb
CHANGED
@@ -7,17 +7,37 @@ module Pwrake
|
|
7
7
|
@id = id
|
8
8
|
@out = Writer.instance
|
9
9
|
@log = LogExecutor.instance
|
10
|
-
@queue =
|
10
|
+
@queue = FiberQueue.new
|
11
11
|
@rd_list = []
|
12
12
|
@dir = dir_class.new
|
13
13
|
@dir.open
|
14
14
|
@dir.open_messages.each{|m| @log.info(m)}
|
15
15
|
@out.puts "#{@id}:open"
|
16
|
+
|
17
|
+
r,w = IO.pipe
|
18
|
+
@command_pipe_r = NBIO::Reader.new(@selector,r)
|
19
|
+
@command_pipe_w = NBIO::Writer.new(@selector,w)
|
20
|
+
@start_process_fiber = Fiber.new do
|
21
|
+
while line = @queue.deq
|
22
|
+
cmd = line
|
23
|
+
while /\\$/ =~ line # line continues
|
24
|
+
line = @queue.deq
|
25
|
+
break if !line
|
26
|
+
cmd += line
|
27
|
+
end
|
28
|
+
break if @stopped
|
29
|
+
cmd.chomp!
|
30
|
+
if !cmd.empty?
|
31
|
+
start_process(cmd)
|
32
|
+
end
|
33
|
+
Fiber.yield
|
34
|
+
end
|
35
|
+
end
|
16
36
|
end
|
17
37
|
|
18
38
|
def stop
|
19
39
|
@stopped = true
|
20
|
-
@queue.
|
40
|
+
@queue.finish
|
21
41
|
end
|
22
42
|
|
23
43
|
def close
|
@@ -43,13 +63,12 @@ module Pwrake
|
|
43
63
|
|
44
64
|
def execute(cmd)
|
45
65
|
return if @stopped
|
46
|
-
@queue.
|
47
|
-
|
66
|
+
@queue.enq(cmd)
|
67
|
+
@start_process_fiber.resume
|
48
68
|
end
|
49
69
|
|
50
|
-
def start_process
|
70
|
+
def start_process(command)
|
51
71
|
return if @thread # running
|
52
|
-
command = @queue.shift
|
53
72
|
return if !command # empty queue
|
54
73
|
@spawn_in, @sh_in = IO.pipe
|
55
74
|
@sh_out, @spawn_out = IO.pipe
|
@@ -71,20 +90,19 @@ module Pwrake
|
|
71
90
|
@spawn_err.close
|
72
91
|
end
|
73
92
|
|
74
|
-
@rd_out = Reader.new(@sh_out
|
75
|
-
@rd_err = Reader.new(@sh_err
|
93
|
+
@rd_out = NBIO::Reader.new(@selector,@sh_out)
|
94
|
+
@rd_err = NBIO::Reader.new(@selector,@sh_err)
|
76
95
|
@rd_list = [@rd_out,@rd_err]
|
77
96
|
|
78
|
-
|
79
|
-
|
97
|
+
Fiber.new{callback(@rd_err,"e")}.resume
|
98
|
+
Fiber.new{callback(@rd_out,"o")}.resume
|
80
99
|
end
|
81
100
|
|
82
|
-
def callback(rd)
|
101
|
+
def callback(rd,mode)
|
83
102
|
while s = rd.gets
|
84
|
-
@out.puts "#{@id}:#{
|
103
|
+
@out.puts "#{@id}:#{mode}:#{s.chomp}"
|
85
104
|
end
|
86
105
|
if rd.eof?
|
87
|
-
@selector.delete_reader(rd.io)
|
88
106
|
@rd_list.delete(rd)
|
89
107
|
if @rd_list.empty? # process_end
|
90
108
|
@thread = @pid = nil
|
@@ -93,7 +111,7 @@ module Pwrake
|
|
93
111
|
@sh_in.close
|
94
112
|
@sh_out.close
|
95
113
|
@sh_err.close
|
96
|
-
|
114
|
+
@start_process_fiber.resume # next process
|
97
115
|
end
|
98
116
|
end
|
99
117
|
rescue => exc
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require "socket"
|
2
|
+
|
1
3
|
module Pwrake
|
2
4
|
|
3
5
|
class Invoker
|
@@ -12,37 +14,65 @@ module Pwrake
|
|
12
14
|
end
|
13
15
|
end
|
14
16
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
18
|
-
|
17
|
+
def get_io
|
18
|
+
[IO, $stdin, $stdout]
|
19
|
+
end
|
20
|
+
|
21
|
+
def setup_connection
|
22
|
+
ioc, ior, iow = get_io()
|
23
|
+
# read @ncore and @option
|
24
|
+
@ncore,len = ior.read(8).unpack("V2")
|
25
|
+
@option = Marshal.load(ior.read(len))
|
26
|
+
# set pipe to branch-master
|
27
|
+
@selector = NBIO::Selector.new(ioc)
|
28
|
+
@rd = NBIO::Reader.new(@selector,ior)
|
29
|
+
@out = Writer.instance
|
30
|
+
@out.out = iow
|
31
|
+
end
|
32
|
+
|
33
|
+
def initialize
|
34
|
+
setup_connection
|
35
|
+
@dir_class = Pwrake.const_get(@option[:shared_directory])
|
36
|
+
@dir_class.init(@option)
|
19
37
|
@ex_list = {}
|
20
|
-
@out = Writer.instance # firstly replace $stderr
|
21
38
|
@log = LogExecutor.instance
|
22
39
|
@log.init(@option)
|
23
40
|
@log.open(@dir_class)
|
24
41
|
@out.add_logger(@log)
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
42
|
+
send_ipaddr
|
43
|
+
send_ncore
|
44
|
+
# does NOT exit when writing to broken pipe
|
45
|
+
Signal.trap("PIPE", "SIG_IGN")
|
46
|
+
end
|
47
|
+
|
48
|
+
def send_ipaddr
|
49
|
+
# get IP addresses
|
50
|
+
v = Socket.getifaddrs.
|
51
|
+
select{|a| a.addr.ip? && (a.flags & Socket::IFF_MULTICAST != 0)}
|
52
|
+
# write IP addresses
|
53
|
+
v.each do |a|
|
54
|
+
@out.puts "ip:#{a.addr.ip_address}"
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def send_ncore
|
59
|
+
if @ncore.kind_of?(Integer)
|
60
|
+
if @ncore <= 0
|
61
|
+
@ncore += processor_count()
|
30
62
|
end
|
31
63
|
if @ncore <= 0
|
32
|
-
m = "Out of range: ncore=#{ncore.inspect}"
|
64
|
+
m = "Out of range: ncore=#{@ncore.inspect}"
|
33
65
|
@out.puts "ncore:"+m
|
34
66
|
raise ArgumentError,m
|
35
67
|
end
|
36
|
-
elsif ncore.nil?
|
68
|
+
elsif @ncore.nil?
|
37
69
|
@ncore = processor_count()
|
38
70
|
else
|
39
|
-
m = "Invalid argument: ncore=#{ncore.inspect}"
|
71
|
+
m = "Invalid argument: ncore=#{@ncore.inspect}"
|
40
72
|
@out.puts "ncore:"+m
|
41
73
|
raise ArgumentError,m
|
42
74
|
end
|
43
75
|
@out.puts "ncore:#{@ncore}"
|
44
|
-
# does NOT exit when writing to broken pipe
|
45
|
-
Signal.trap("PIPE", "SIG_IGN")
|
46
76
|
end
|
47
77
|
|
48
78
|
def get_line(io)
|
@@ -57,10 +87,10 @@ module Pwrake
|
|
57
87
|
|
58
88
|
def run
|
59
89
|
setup_option
|
60
|
-
setup_loop
|
61
|
-
@
|
62
|
-
|
63
|
-
@selector.
|
90
|
+
Fiber.new{setup_loop}.resume
|
91
|
+
@selector.run
|
92
|
+
Fiber.new{command_callback}.resume
|
93
|
+
@selector.run
|
64
94
|
rescue => exc
|
65
95
|
@log.error(([exc.to_s]+exc.backtrace).join("\n"))
|
66
96
|
ensure
|
@@ -70,15 +100,14 @@ module Pwrake
|
|
70
100
|
def setup_option
|
71
101
|
@log.info @option.inspect
|
72
102
|
@out.heartbeat = @option[:heartbeat]
|
73
|
-
@shell_cmd = @option[:shell_command]
|
74
|
-
@shell_rc = @option[:shell_rc] || []
|
75
103
|
(@option[:pass_env]||{}).each do |k,v|
|
76
104
|
ENV[k] = v
|
77
105
|
end
|
78
106
|
end
|
79
107
|
|
80
108
|
def setup_loop
|
81
|
-
|
109
|
+
loop do
|
110
|
+
line = get_line(@rd)
|
82
111
|
case line
|
83
112
|
when /^(\d+):open$/o
|
84
113
|
$1.split.each do |id|
|
@@ -92,12 +121,16 @@ module Pwrake
|
|
92
121
|
end
|
93
122
|
end
|
94
123
|
end
|
95
|
-
raise RuntimeError,"incomplete setup_loop"
|
96
124
|
end
|
97
125
|
|
98
|
-
def command_callback
|
99
|
-
while line = get_line(rd)
|
126
|
+
def command_callback
|
127
|
+
while line = get_line(@rd)
|
100
128
|
case line
|
129
|
+
when /^(\d+):exit$/o
|
130
|
+
id = $1
|
131
|
+
ex = @ex_list.delete(id)
|
132
|
+
ex.close
|
133
|
+
ex.join
|
101
134
|
when /^(\d+):(.*)$/o
|
102
135
|
id,cmd = $1,$2
|
103
136
|
@ex_list[id].execute(cmd.chomp)
|
@@ -105,7 +138,7 @@ module Pwrake
|
|
105
138
|
break if common_line(line)
|
106
139
|
end
|
107
140
|
end
|
108
|
-
if rd.eof?
|
141
|
+
if @rd.eof?
|
109
142
|
# connection lost
|
110
143
|
raise RuntimeError,"lost connection to master"
|
111
144
|
end
|
@@ -114,7 +147,6 @@ module Pwrake
|
|
114
147
|
def common_line(line)
|
115
148
|
case line
|
116
149
|
when /^exit$/o
|
117
|
-
@selector.delete_reader($stdin)
|
118
150
|
return true
|
119
151
|
#
|
120
152
|
when /^kill:(.*)$/o
|
@@ -129,7 +161,7 @@ module Pwrake
|
|
129
161
|
return false
|
130
162
|
#
|
131
163
|
else
|
132
|
-
msg = "invalid line: #{line}"
|
164
|
+
msg = "invalid line: #{line.inspect}"
|
133
165
|
@log.fatal msg
|
134
166
|
raise RuntimeError,msg
|
135
167
|
end
|
@@ -142,12 +174,7 @@ module Pwrake
|
|
142
174
|
@ex_list.each_value{|ex| ex.close}
|
143
175
|
@ex_list.each_value{|ex| ex.join}
|
144
176
|
@log.info "worker:end:#{@ex_list.keys.inspect}"
|
145
|
-
|
146
|
-
Timeout.timeout(20){@log.close}
|
147
|
-
rescue => e
|
148
|
-
$stdout.puts e
|
149
|
-
$stdout.puts e.backtrace.join("\n")
|
150
|
-
end
|
177
|
+
Timeout.timeout(20){@log.close}
|
151
178
|
ensure
|
152
179
|
@out.puts "exited"
|
153
180
|
end
|