pwrake 2.1.3 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +25 -12
- data/bin/pwrake-mpi +41 -0
- data/bin/pwrake-mpi-run +16 -0
- data/lib/pwrake/branch/branch.rb +17 -29
- data/lib/pwrake/branch/branch_application.rb +31 -41
- data/lib/pwrake/branch/communicator.rb +32 -11
- data/lib/pwrake/branch/communicator_set.rb +6 -0
- data/lib/pwrake/logger.rb +29 -1
- data/lib/pwrake/master/master.rb +51 -64
- data/lib/pwrake/master/master_application.rb +4 -9
- data/lib/pwrake/mpi/branch.rb +76 -0
- data/lib/pwrake/mpi/worker.rb +42 -0
- data/lib/pwrake/nbio.rb +60 -62
- data/lib/pwrake/option/host_map.rb +50 -9
- data/lib/pwrake/option/option.rb +55 -66
- data/lib/pwrake/option/option_default_filesystem.rb +48 -0
- data/lib/pwrake/option/option_gfarm.rb +1 -0
- data/lib/pwrake/option/option_gfarm2fs.rb +101 -0
- data/lib/pwrake/queue/locality_aware_queue.rb +7 -11
- data/lib/pwrake/report/task_stat.rb +4 -5
- data/lib/pwrake/task/task_wrapper.rb +57 -34
- data/lib/pwrake/version.rb +1 -1
- data/lib/pwrake/worker/executor.rb +32 -14
- data/lib/pwrake/worker/invoker.rb +61 -34
- data/lib/pwrake/worker/worker_main.rb +5 -5
- data/lib/pwrake/worker/writer.rb +27 -20
- metadata +11 -5
- data/lib/pwrake/option/option_filesystem.rb +0 -123
- data/lib/pwrake/worker/load.rb +0 -14
- data/lib/pwrake/worker/reader.rb +0 -73
@@ -6,9 +6,7 @@ module Pwrake
|
|
6
6
|
# group_map = {gid1=>[hid1,hid2,...], ...}
|
7
7
|
@size_q = 0
|
8
8
|
@q = {}
|
9
|
-
@hostinfo_by_name = {}
|
10
9
|
@hostinfo_by_id.each do |id,h|
|
11
|
-
@hostinfo_by_name[h.name] = h
|
12
10
|
@q[id] = @array_class.new(h.ncore)
|
13
11
|
end
|
14
12
|
@q_group = {}
|
@@ -26,23 +24,21 @@ module Pwrake
|
|
26
24
|
@n_turn = @disable_steal ? 1 : 2
|
27
25
|
end
|
28
26
|
|
29
|
-
|
30
27
|
def enq_impl(t)
|
31
28
|
hints = t && t.suggest_location
|
32
29
|
Log.debug "enq #{t.name} hints=#{hints.inspect}"
|
33
30
|
if hints.nil? || hints.empty?
|
34
31
|
@q_remote.push(t)
|
35
32
|
else
|
36
|
-
|
33
|
+
kv = {}
|
37
34
|
hints.each do |h|
|
38
|
-
|
39
|
-
if host_info && q = @q[host_info.id]
|
40
|
-
t.assigned.push(host_info.id)
|
41
|
-
q.push(t)
|
42
|
-
stored = true
|
43
|
-
end
|
35
|
+
HostMap.ipmatch_for_name(h).each{|id| kv[id] = true}
|
44
36
|
end
|
45
|
-
if
|
37
|
+
if !kv.empty?
|
38
|
+
kv.each_key do |id|
|
39
|
+
t.assigned.push(id)
|
40
|
+
@q[id].push(t)
|
41
|
+
end
|
46
42
|
@size_q += 1
|
47
43
|
else
|
48
44
|
@q_remote.push(t)
|
@@ -39,12 +39,10 @@ module Pwrake
|
|
39
39
|
|
40
40
|
def task_locality
|
41
41
|
file_size = {}
|
42
|
-
file_host = {}
|
43
42
|
h = {}
|
44
43
|
@task_table.each do |row|
|
45
44
|
name = row['task_name']
|
46
45
|
file_size[name] = row['file_size'].to_i
|
47
|
-
file_host[name] = (row['file_host']||'').split('|')
|
48
46
|
exec_host = row['exec_host'] || ""
|
49
47
|
h[exec_host] = true
|
50
48
|
end
|
@@ -54,15 +52,16 @@ module Pwrake
|
|
54
52
|
if row['executed']=='1'
|
55
53
|
name = row['task_name']
|
56
54
|
exec_host = row['exec_host']
|
57
|
-
loc =
|
55
|
+
loc = (row['write_loc'] == "L")
|
58
56
|
count(exec_host, loc, :out_num, 1)
|
59
57
|
count(exec_host, loc, :out_size, file_size[name])
|
60
58
|
|
61
59
|
preq_files = (row['preq']||'').split('|')
|
62
|
-
|
60
|
+
preq_loc = row['preq_loc']||''
|
61
|
+
preq_files.each_with_index do |preq,i|
|
63
62
|
sz = file_size[preq]
|
64
63
|
if sz && sz > 0
|
65
|
-
loc =
|
64
|
+
loc = (preq_loc[i] == "L")
|
66
65
|
count(exec_host, loc, :in_num, 1)
|
67
66
|
count(exec_host, loc, :in_size, sz)
|
68
67
|
end
|
@@ -25,10 +25,10 @@ module Pwrake
|
|
25
25
|
@input_file_mtime = nil
|
26
26
|
@rank = nil
|
27
27
|
@priority = nil
|
28
|
-
@lock_rank = Monitor.new
|
29
28
|
@executed = false
|
30
29
|
@assigned = []
|
31
30
|
@exec_host = nil
|
31
|
+
@exec_host_id = nil
|
32
32
|
@tried_hosts = []
|
33
33
|
@n_retry = @property.retry || Rake.application.pwrake_options["RETRY"] || 1
|
34
34
|
end
|
@@ -41,7 +41,7 @@ module Pwrake
|
|
41
41
|
attr_reader :assigned
|
42
42
|
attr_reader :tried_hosts
|
43
43
|
attr_accessor :executed
|
44
|
-
attr_accessor :exec_host
|
44
|
+
attr_accessor :exec_host, :exec_host_id
|
45
45
|
attr_accessor :shell_id, :status
|
46
46
|
|
47
47
|
def self.format_time(t)
|
@@ -53,8 +53,8 @@ module Pwrake
|
|
53
53
|
fn = File.join(dir,option['TASK_CSV_FILE'])
|
54
54
|
@@task_logger = CSV.open(fn,'w')
|
55
55
|
@@task_logger.puts %w[
|
56
|
-
task_id task_name start_time end_time elap_time preq preq_host
|
57
|
-
exec_host shell_id has_action executed file_size file_mtime file_host
|
56
|
+
task_id task_name start_time end_time elap_time preq preq_host preq_loc
|
57
|
+
exec_host shell_id has_action executed file_size file_mtime file_host write_loc
|
58
58
|
]
|
59
59
|
end
|
60
60
|
end
|
@@ -78,14 +78,14 @@ module Pwrake
|
|
78
78
|
@n_retry == 0
|
79
79
|
end
|
80
80
|
|
81
|
-
def postprocess(
|
81
|
+
def postprocess(postproc)
|
82
82
|
@executed = true if !@task.actions.empty?
|
83
83
|
#tm_taskend = Time.now
|
84
84
|
if is_file_task?
|
85
85
|
#t = Time.now
|
86
86
|
if File.exist?(name)
|
87
87
|
@file_stat = File::Stat.new(name)
|
88
|
-
@location =
|
88
|
+
@location = postproc.run(self)
|
89
89
|
end
|
90
90
|
end
|
91
91
|
#Log.debug "postprocess time=#{Time.now-tm_taskend}"
|
@@ -114,11 +114,11 @@ module Pwrake
|
|
114
114
|
def log_task
|
115
115
|
@time_end = Time.now
|
116
116
|
#
|
117
|
-
|
117
|
+
sug_host = suggest_location()
|
118
118
|
shell = Pwrake::Shell.current
|
119
119
|
#
|
120
|
-
if
|
121
|
-
Rake.application.count(
|
120
|
+
if sug_host && !sug_host.empty? && shell && !actions.empty?
|
121
|
+
Rake.application.count( sug_host, shell.host )
|
122
122
|
end
|
123
123
|
return if !@@task_logger
|
124
124
|
#
|
@@ -127,20 +127,34 @@ module Pwrake
|
|
127
127
|
RANK_STAT.add_sample(rank,elap)
|
128
128
|
end
|
129
129
|
#
|
130
|
+
# locality check
|
131
|
+
loc_na = true
|
132
|
+
preq_loc = prerequisites.map do |preq|
|
133
|
+
locs = Rake.application[preq].wrapper.location
|
134
|
+
if loc = file_locality(locs)
|
135
|
+
loc_na = false
|
136
|
+
loc
|
137
|
+
else
|
138
|
+
"n"
|
139
|
+
end
|
140
|
+
end.join("")
|
141
|
+
preq_loc = nil if loc_na
|
142
|
+
write_loc = file_locality(@location)
|
143
|
+
#
|
130
144
|
if @file_stat
|
131
|
-
fstat = [@file_stat.size, @file_stat.mtime, self.location.join('|')]
|
145
|
+
fstat = [@file_stat.size, @file_stat.mtime, self.location.join('|'), write_loc]
|
132
146
|
else
|
133
|
-
fstat = [nil]*
|
147
|
+
fstat = [nil]*4
|
134
148
|
end
|
135
149
|
#
|
136
|
-
# task_id task_name start_time end_time elap_time preq preq_host
|
137
|
-
# exec_host shell_id has_action executed file_size file_mtime file_host
|
150
|
+
# task_id task_name start_time end_time elap_time preq preq_host preq_loc
|
151
|
+
# exec_host shell_id has_action executed file_size file_mtime file_host write_loc
|
138
152
|
#
|
139
153
|
row = [ @task_id, name, @time_start, @time_end, elap,
|
140
|
-
prerequisites,
|
154
|
+
prerequisites, sug_host, preq_loc, @exec_host, @shell_id,
|
141
155
|
(actions.empty?) ? 0 : 1,
|
142
156
|
(@executed) ? 1 : 0,
|
143
|
-
|
157
|
+
] + fstat
|
144
158
|
row.map!{|x|
|
145
159
|
if x.kind_of?(Time)
|
146
160
|
TaskWrapper.format_time(x)
|
@@ -166,6 +180,17 @@ module Pwrake
|
|
166
180
|
end
|
167
181
|
end
|
168
182
|
|
183
|
+
def file_locality(nodes)
|
184
|
+
if nodes.empty? || !@exec_host_id
|
185
|
+
nil # not available
|
186
|
+
elsif nodes.any?{|node|
|
187
|
+
HostMap.ipmatch_for_name(node).include?(@exec_host_id)}
|
188
|
+
"L" # Local
|
189
|
+
else
|
190
|
+
"R" # Remote
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
169
194
|
def is_file_task?
|
170
195
|
@task.kind_of?(Rake::FileTask)
|
171
196
|
end
|
@@ -223,28 +248,26 @@ module Pwrake
|
|
223
248
|
end
|
224
249
|
|
225
250
|
def rank
|
226
|
-
|
227
|
-
if
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
max_rank = r
|
236
|
-
end
|
237
|
-
end
|
238
|
-
if has_output_file?
|
239
|
-
step = 1
|
240
|
-
else
|
241
|
-
step = 0
|
251
|
+
if @rank.nil?
|
252
|
+
if subsequents.nil? || subsequents.empty?
|
253
|
+
@rank = 0
|
254
|
+
else
|
255
|
+
max_rank = 0
|
256
|
+
subsequents.each do |subsq|
|
257
|
+
r = subsq.wrapper.rank
|
258
|
+
if max_rank < r
|
259
|
+
max_rank = r
|
242
260
|
end
|
243
|
-
@rank = max_rank + step
|
244
261
|
end
|
245
|
-
|
262
|
+
if has_output_file?
|
263
|
+
step = 1
|
264
|
+
else
|
265
|
+
step = 0
|
266
|
+
end
|
267
|
+
@rank = max_rank + step
|
246
268
|
end
|
247
|
-
|
269
|
+
Log.debug "Task[#{name}] rank=#{@rank.inspect}"
|
270
|
+
end
|
248
271
|
@rank
|
249
272
|
end
|
250
273
|
|
data/lib/pwrake/version.rb
CHANGED
@@ -7,17 +7,37 @@ module Pwrake
|
|
7
7
|
@id = id
|
8
8
|
@out = Writer.instance
|
9
9
|
@log = LogExecutor.instance
|
10
|
-
@queue =
|
10
|
+
@queue = FiberQueue.new
|
11
11
|
@rd_list = []
|
12
12
|
@dir = dir_class.new
|
13
13
|
@dir.open
|
14
14
|
@dir.open_messages.each{|m| @log.info(m)}
|
15
15
|
@out.puts "#{@id}:open"
|
16
|
+
|
17
|
+
r,w = IO.pipe
|
18
|
+
@command_pipe_r = NBIO::Reader.new(@selector,r)
|
19
|
+
@command_pipe_w = NBIO::Writer.new(@selector,w)
|
20
|
+
@start_process_fiber = Fiber.new do
|
21
|
+
while line = @queue.deq
|
22
|
+
cmd = line
|
23
|
+
while /\\$/ =~ line # line continues
|
24
|
+
line = @queue.deq
|
25
|
+
break if !line
|
26
|
+
cmd += line
|
27
|
+
end
|
28
|
+
break if @stopped
|
29
|
+
cmd.chomp!
|
30
|
+
if !cmd.empty?
|
31
|
+
start_process(cmd)
|
32
|
+
end
|
33
|
+
Fiber.yield
|
34
|
+
end
|
35
|
+
end
|
16
36
|
end
|
17
37
|
|
18
38
|
def stop
|
19
39
|
@stopped = true
|
20
|
-
@queue.
|
40
|
+
@queue.finish
|
21
41
|
end
|
22
42
|
|
23
43
|
def close
|
@@ -43,13 +63,12 @@ module Pwrake
|
|
43
63
|
|
44
64
|
def execute(cmd)
|
45
65
|
return if @stopped
|
46
|
-
@queue.
|
47
|
-
|
66
|
+
@queue.enq(cmd)
|
67
|
+
@start_process_fiber.resume
|
48
68
|
end
|
49
69
|
|
50
|
-
def start_process
|
70
|
+
def start_process(command)
|
51
71
|
return if @thread # running
|
52
|
-
command = @queue.shift
|
53
72
|
return if !command # empty queue
|
54
73
|
@spawn_in, @sh_in = IO.pipe
|
55
74
|
@sh_out, @spawn_out = IO.pipe
|
@@ -71,20 +90,19 @@ module Pwrake
|
|
71
90
|
@spawn_err.close
|
72
91
|
end
|
73
92
|
|
74
|
-
@rd_out = Reader.new(@sh_out
|
75
|
-
@rd_err = Reader.new(@sh_err
|
93
|
+
@rd_out = NBIO::Reader.new(@selector,@sh_out)
|
94
|
+
@rd_err = NBIO::Reader.new(@selector,@sh_err)
|
76
95
|
@rd_list = [@rd_out,@rd_err]
|
77
96
|
|
78
|
-
|
79
|
-
|
97
|
+
Fiber.new{callback(@rd_err,"e")}.resume
|
98
|
+
Fiber.new{callback(@rd_out,"o")}.resume
|
80
99
|
end
|
81
100
|
|
82
|
-
def callback(rd)
|
101
|
+
def callback(rd,mode)
|
83
102
|
while s = rd.gets
|
84
|
-
@out.puts "#{@id}:#{
|
103
|
+
@out.puts "#{@id}:#{mode}:#{s.chomp}"
|
85
104
|
end
|
86
105
|
if rd.eof?
|
87
|
-
@selector.delete_reader(rd.io)
|
88
106
|
@rd_list.delete(rd)
|
89
107
|
if @rd_list.empty? # process_end
|
90
108
|
@thread = @pid = nil
|
@@ -93,7 +111,7 @@ module Pwrake
|
|
93
111
|
@sh_in.close
|
94
112
|
@sh_out.close
|
95
113
|
@sh_err.close
|
96
|
-
|
114
|
+
@start_process_fiber.resume # next process
|
97
115
|
end
|
98
116
|
end
|
99
117
|
rescue => exc
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require "socket"
|
2
|
+
|
1
3
|
module Pwrake
|
2
4
|
|
3
5
|
class Invoker
|
@@ -12,37 +14,65 @@ module Pwrake
|
|
12
14
|
end
|
13
15
|
end
|
14
16
|
|
15
|
-
def
|
16
|
-
|
17
|
-
|
18
|
-
|
17
|
+
def get_io
|
18
|
+
[IO, $stdin, $stdout]
|
19
|
+
end
|
20
|
+
|
21
|
+
def setup_connection
|
22
|
+
ioc, ior, iow = get_io()
|
23
|
+
# read @ncore and @option
|
24
|
+
@ncore,len = ior.read(8).unpack("V2")
|
25
|
+
@option = Marshal.load(ior.read(len))
|
26
|
+
# set pipe to branch-master
|
27
|
+
@selector = NBIO::Selector.new(ioc)
|
28
|
+
@rd = NBIO::Reader.new(@selector,ior)
|
29
|
+
@out = Writer.instance
|
30
|
+
@out.out = iow
|
31
|
+
end
|
32
|
+
|
33
|
+
def initialize
|
34
|
+
setup_connection
|
35
|
+
@dir_class = Pwrake.const_get(@option[:shared_directory])
|
36
|
+
@dir_class.init(@option)
|
19
37
|
@ex_list = {}
|
20
|
-
@out = Writer.instance # firstly replace $stderr
|
21
38
|
@log = LogExecutor.instance
|
22
39
|
@log.init(@option)
|
23
40
|
@log.open(@dir_class)
|
24
41
|
@out.add_logger(@log)
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
42
|
+
send_ipaddr
|
43
|
+
send_ncore
|
44
|
+
# does NOT exit when writing to broken pipe
|
45
|
+
Signal.trap("PIPE", "SIG_IGN")
|
46
|
+
end
|
47
|
+
|
48
|
+
def send_ipaddr
|
49
|
+
# get IP addresses
|
50
|
+
v = Socket.getifaddrs.
|
51
|
+
select{|a| a.addr.ip? && (a.flags & Socket::IFF_MULTICAST != 0)}
|
52
|
+
# write IP addresses
|
53
|
+
v.each do |a|
|
54
|
+
@out.puts "ip:#{a.addr.ip_address}"
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def send_ncore
|
59
|
+
if @ncore.kind_of?(Integer)
|
60
|
+
if @ncore <= 0
|
61
|
+
@ncore += processor_count()
|
30
62
|
end
|
31
63
|
if @ncore <= 0
|
32
|
-
m = "Out of range: ncore=#{ncore.inspect}"
|
64
|
+
m = "Out of range: ncore=#{@ncore.inspect}"
|
33
65
|
@out.puts "ncore:"+m
|
34
66
|
raise ArgumentError,m
|
35
67
|
end
|
36
|
-
elsif ncore.nil?
|
68
|
+
elsif @ncore.nil?
|
37
69
|
@ncore = processor_count()
|
38
70
|
else
|
39
|
-
m = "Invalid argument: ncore=#{ncore.inspect}"
|
71
|
+
m = "Invalid argument: ncore=#{@ncore.inspect}"
|
40
72
|
@out.puts "ncore:"+m
|
41
73
|
raise ArgumentError,m
|
42
74
|
end
|
43
75
|
@out.puts "ncore:#{@ncore}"
|
44
|
-
# does NOT exit when writing to broken pipe
|
45
|
-
Signal.trap("PIPE", "SIG_IGN")
|
46
76
|
end
|
47
77
|
|
48
78
|
def get_line(io)
|
@@ -57,10 +87,10 @@ module Pwrake
|
|
57
87
|
|
58
88
|
def run
|
59
89
|
setup_option
|
60
|
-
setup_loop
|
61
|
-
@
|
62
|
-
|
63
|
-
@selector.
|
90
|
+
Fiber.new{setup_loop}.resume
|
91
|
+
@selector.run
|
92
|
+
Fiber.new{command_callback}.resume
|
93
|
+
@selector.run
|
64
94
|
rescue => exc
|
65
95
|
@log.error(([exc.to_s]+exc.backtrace).join("\n"))
|
66
96
|
ensure
|
@@ -70,15 +100,14 @@ module Pwrake
|
|
70
100
|
def setup_option
|
71
101
|
@log.info @option.inspect
|
72
102
|
@out.heartbeat = @option[:heartbeat]
|
73
|
-
@shell_cmd = @option[:shell_command]
|
74
|
-
@shell_rc = @option[:shell_rc] || []
|
75
103
|
(@option[:pass_env]||{}).each do |k,v|
|
76
104
|
ENV[k] = v
|
77
105
|
end
|
78
106
|
end
|
79
107
|
|
80
108
|
def setup_loop
|
81
|
-
|
109
|
+
loop do
|
110
|
+
line = get_line(@rd)
|
82
111
|
case line
|
83
112
|
when /^(\d+):open$/o
|
84
113
|
$1.split.each do |id|
|
@@ -92,12 +121,16 @@ module Pwrake
|
|
92
121
|
end
|
93
122
|
end
|
94
123
|
end
|
95
|
-
raise RuntimeError,"incomplete setup_loop"
|
96
124
|
end
|
97
125
|
|
98
|
-
def command_callback
|
99
|
-
while line = get_line(rd)
|
126
|
+
def command_callback
|
127
|
+
while line = get_line(@rd)
|
100
128
|
case line
|
129
|
+
when /^(\d+):exit$/o
|
130
|
+
id = $1
|
131
|
+
ex = @ex_list.delete(id)
|
132
|
+
ex.close
|
133
|
+
ex.join
|
101
134
|
when /^(\d+):(.*)$/o
|
102
135
|
id,cmd = $1,$2
|
103
136
|
@ex_list[id].execute(cmd.chomp)
|
@@ -105,7 +138,7 @@ module Pwrake
|
|
105
138
|
break if common_line(line)
|
106
139
|
end
|
107
140
|
end
|
108
|
-
if rd.eof?
|
141
|
+
if @rd.eof?
|
109
142
|
# connection lost
|
110
143
|
raise RuntimeError,"lost connection to master"
|
111
144
|
end
|
@@ -114,7 +147,6 @@ module Pwrake
|
|
114
147
|
def common_line(line)
|
115
148
|
case line
|
116
149
|
when /^exit$/o
|
117
|
-
@selector.delete_reader($stdin)
|
118
150
|
return true
|
119
151
|
#
|
120
152
|
when /^kill:(.*)$/o
|
@@ -129,7 +161,7 @@ module Pwrake
|
|
129
161
|
return false
|
130
162
|
#
|
131
163
|
else
|
132
|
-
msg = "invalid line: #{line}"
|
164
|
+
msg = "invalid line: #{line.inspect}"
|
133
165
|
@log.fatal msg
|
134
166
|
raise RuntimeError,msg
|
135
167
|
end
|
@@ -142,12 +174,7 @@ module Pwrake
|
|
142
174
|
@ex_list.each_value{|ex| ex.close}
|
143
175
|
@ex_list.each_value{|ex| ex.join}
|
144
176
|
@log.info "worker:end:#{@ex_list.keys.inspect}"
|
145
|
-
|
146
|
-
Timeout.timeout(20){@log.close}
|
147
|
-
rescue => e
|
148
|
-
$stdout.puts e
|
149
|
-
$stdout.puts e.backtrace.join("\n")
|
150
|
-
end
|
177
|
+
Timeout.timeout(20){@log.close}
|
151
178
|
ensure
|
152
179
|
@out.puts "exited"
|
153
180
|
end
|