pwrake 2.0.1 → 2.1.0
- checksums.yaml +4 -4
- data/README.md +22 -9
- data/bin/gfwhere-pipe +33 -9
- data/bin/pwrake +5 -2
- data/bin/pwrake_branch +5 -3
- data/lib/pwrake/branch/branch.rb +95 -86
- data/lib/pwrake/branch/branch_application.rb +4 -0
- data/lib/pwrake/branch/communicator.rb +173 -0
- data/lib/pwrake/branch/communicator_set.rb +100 -0
- data/lib/pwrake/branch/fiber_queue.rb +10 -0
- data/lib/pwrake/branch/shell.rb +68 -24
- data/lib/pwrake/branch/shell_profiler.rb +2 -0
- data/lib/pwrake/gfarm/gfarm_postprocess.rb +8 -7
- data/lib/pwrake/logger.rb +5 -0
- data/lib/pwrake/master/master.rb +190 -87
- data/lib/pwrake/master/master_application.rb +8 -0
- data/lib/pwrake/nbio.rb +525 -0
- data/lib/pwrake/option/host_map.rb +36 -4
- data/lib/pwrake/option/option.rb +7 -1
- data/lib/pwrake/option/option_filesystem.rb +13 -3
- data/lib/pwrake/queue/locality_aware_queue.rb +41 -6
- data/lib/pwrake/queue/queue_array.rb +31 -11
- data/lib/pwrake/queue/task_queue.rb +15 -18
- data/lib/pwrake/report/report.rb +2 -0
- data/lib/pwrake/task/task_algorithm.rb +4 -1
- data/lib/pwrake/task/task_manager.rb +2 -0
- data/lib/pwrake/task/task_property.rb +1 -0
- data/lib/pwrake/task/task_wrapper.rb +40 -21
- data/lib/pwrake/version.rb +1 -1
- data/lib/pwrake/worker/invoker.rb +4 -29
- data/pwrake.gemspec +3 -2
- metadata +24 -12
- data/lib/pwrake/branch.rb +0 -22
- data/lib/pwrake/branch/worker_communicator.rb +0 -104
- data/lib/pwrake/iomux/channel.rb +0 -70
- data/lib/pwrake/iomux/handler.rb +0 -124
- data/lib/pwrake/iomux/handler_set.rb +0 -35
- data/lib/pwrake/iomux/runner.rb +0 -62
- data/lib/pwrake/master.rb +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 91439940b04a74b462713ec03d5f5bd4f282965f
+  data.tar.gz: 06a86ba801618144354c31957a73430a3cb00725
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8b9ce53904fa158730c7b0a58e68b70fc18e6df5bd73b2114b04064376bce3cfd02de3b36ac7996a077a969e0790dd6ce5c9006344307950fef9fe42e2255512
+  data.tar.gz: a8e8ba98341678f4a9a4744236a31a7b6769287fc6dc5554a1408d960d7b915c1397e88f8d957e0b39f9331d73df2a6dc50696bd4972c048257c543d214e9cc4
data/README.md
CHANGED
@@ -18,20 +18,31 @@ Parallel Workflow extension for Rake, runs on multicores, clusters, clouds.
 * Pwrake automatically connects to remote hosts using SSH. You do not need to start a daemon.
 * Remote host names and the number of cores to use are provided in a hostfile.
 * [Gfarm file system](http://sourceforge.net/projects/gfarm/) utilizes storage of compute nodes. It provides the high-performance parallel I/O.
-* Parallel I/O access to local
+* Parallel I/O access to local storage of compute nodes enables a scalable increase in I/O performance.
 * Gfarm schedules a compute node to store an output file, to local storage.
 * Pwrake schedules a compute node to execute a task, to a node where input files are stored.
 * Other supports for Gfarm: Automatic mount of the Gfarm file system, etc.
 
 ## Installation
 
-
+Install with RubyGems:
+
+    $ gem install pwrake
+
+Or download the source tgz/zip, expand it, cd to the subdirectory, and install:
 
     $ ruby setup.rb
 
-
+In the latter case, you need to install [Parallel](https://github.com/grosser/parallel) manually; Pwrake requires it for the processor count.
+
+If you use rbenv, your system may fail to find the pwrake command after installation:
+
+    -bash: pwrake: command not found
+
+In this case, rehash the command paths:
+
+    $ rbenv rehash
 
-    $ gem install pwrake
 
 ## Usage
 
@@ -73,7 +84,7 @@ Or, gem install:
 -A, --disable-affinity [Pw] Turn OFF affinity (AFFINITY=off)
 -S, --disable-steal [Pw] Turn OFF task steal
 -d, --debug [Pw] Output Debug messages
---pwrake-conf [FILE] [Pw] Pwrake
+--pwrake-conf [FILE] [Pw] Pwrake configuration file in YAML
 --show-conf, --show-config [Pw] Show Pwrake configuration options
 --report LOGDIR [Pw] Report workflow statistics from LOGDIR to HTML and exit.
 --clear-gfarm2fs [Pw] Clear gfarm2fs mountpoints left after failure.
@@ -106,8 +117,8 @@ Or, gem install:
 SHELL_COMMAND default=$SHELL
 SHELL_RC Run-Command when shell starts
 PASS_ENV (Array) Environment variables passed to SSH
-HEARTBEAT
-RETRY default=
+HEARTBEAT default=240 - Heartbeat interval in seconds
+RETRY default=1 - The number of retries
 FAILED_TARGET rename(default)|delete|leave - Treatment of failed target files
 FAILURE_TERMINATION wait(default)|kill|continue - Behavior of other tasks when a task is failed
 QUEUE_PRIORITY LIHR(default)|FIFO|LIFO|RANK
@@ -186,6 +197,8 @@ Properties (The leftmost item is default):
 
 ## Acknowledgment
 
-This work is supported by
-* JST CREST, research
+This work is supported by:
+* JST CREST, research themes:
+  * ["Statistical Computational Cosmology with Big Astronomical Imaging Data,"](http://www.jst.go.jp/kisoken/crest/en/project/44/14532369.html)
+  * ["System Software for Post Petascale Data Intensive Science,"](http://postpeta.jst.go.jp/en/researchers/tatebe22.html)
 * MEXT Promotion of Research for Next Generation IT Infrastructure "Resources Linkage for e-Science (RENKEI)."
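
The README hunks above document installation and configuration; the workflow itself remains an ordinary Rakefile. As a rough, hypothetical sketch (not taken from the gem), the kind of Rakefile pwrake parallelizes is one whose file tasks are mutually independent, so they can be dispatched to the cores and hosts listed in the hostfile; the `convert` command and the `data/*.in` pattern below are placeholders:

```ruby
# Hypothetical Rakefile: each *.out depends only on its own *.in, so the
# file tasks are independent and can run concurrently under pwrake.
SRCS = FileList["data/*.in"]
OUTS = SRCS.ext("out")

rule ".out" => ".in" do |t|
  sh "convert #{t.source} > #{t.name}"  # placeholder command
end

task :default => OUTS
```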
data/bin/gfwhere-pipe
CHANGED
@@ -65,13 +65,6 @@ module Gfarm
   end
 
 
-  def connection(*args)
-    Connection.set_args(args)
-    Connection.instance
-  end
-  module_function :connection
-
-
   class Connection
     include Singleton
 
@@ -124,10 +117,14 @@ module Gfarm
       FFI.gfs_replica_info_free(ptr)
     end
 
+    def self.set_opts(opts)
+      @@opts = opts
+    end
+
     def initialize(gfarm, path)
       @gfarm = gfarm
       @realpath = @gfarm.realpath_by_gfarm2fs(path)
-      flag =
+      flag = @@opts.flags
       ptr = ::FFI::MemoryPointer.new(:pointer, 1)
       e = FFI.gfs_replica_info_by_name(@realpath, flag, ptr)
       if e != GFARM_ERR_NO_ERROR
@@ -147,13 +144,40 @@ module Gfarm
     end
   end
 
+  class Options
+    INCLUDING_DEAD_HOST = 1
+    INCLUDING_INCOMPLETE_COPY = 2
+    INCLUDING_DEAD_COPY = 4
+
+    def initialize(argv)
+      @args = []
+      @flags = 0
+      argv.each do |x|
+        case x
+        when "-i"
+          @including_incomplete_copy = true
+          @flags |= INCLUDING_INCOMPLETE_COPY
+        else
+          @args << x
+        end
+      end
+    end
+
+    attr_reader :args
+    attr_reader :flags
+    attr_reader :including_incomplete_copy
+  end
+
 end
 
 [:PIPE,:TERM,:INT].each do |sig|
   Signal.trap(sig, "EXIT")
 end
 
-
+opts = Gfarm::Options.new(ARGV)
+Gfarm::ReplicaInfo.set_opts(opts)
+Gfarm::Connection.set_args(opts.args)
+gfarm = Gfarm::Connection.instance
 
 while path=$stdin.gets
   path.chomp!
data/bin/pwrake
CHANGED
@@ -10,8 +10,11 @@ end
 libpath = File.absolute_path(File.dirname(__FILE__))+"/../lib"
 $LOAD_PATH.unshift libpath
 
-require "pwrake/
-require "pwrake/
+require "pwrake/version"
+require "pwrake/master/master_application"
+require "pwrake/task/task_manager"
+require "pwrake/task/task_algorithm"
+require "pwrake/branch/branch_application"
 class Rake::Application
   include Pwrake::BranchApplication
   prepend Pwrake::MasterApplication
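
A note on the mixin change above: Rake::Application receives the branch module via `include` and the master module via `prepend`. The difference matters because a prepended module is inserted ahead of the class in the method lookup chain, so it can wrap Rake's own methods and still call through with `super`. A generic Ruby sketch (the names are made up, not pwrake's modules):

```ruby
# include places the module below the class in the lookup chain;
# prepend places it above, letting it wrap an existing method via super.
module MasterLike
  def run
    puts "master: setup before run"
    super                     # falls through to AppLike#run
  end
end

class AppLike
  def run
    puts "original run"
  end
end

AppLike.prepend MasterLike
AppLike.new.run
# prints:
#   master: setup before run
#   original run
```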
data/bin/pwrake_branch
CHANGED
@@ -10,13 +10,15 @@ end
 libpath = File.absolute_path(File.dirname(__FILE__))+"/../lib"
 $LOAD_PATH.unshift libpath
 
-require "pwrake/branch"
+require "pwrake/branch/branch_application"
 class Rake::Application
   include Pwrake::BranchApplication
 end
 
-
-
+require "pwrake/task/task_algorithm"
+class Rake::Task
+  include Pwrake::TaskAlgorithm
+end
 
 # does NOT exit when writing to broken pipe
 Signal.trap(:PIPE, "IGNORE")
data/lib/pwrake/branch/branch.rb
CHANGED
@@ -1,19 +1,24 @@
+require "pwrake/nbio"
+require "pwrake/branch/communicator_set"
+require "pwrake/branch/fiber_queue"
+require "pwrake/branch/shell"
+require "pwrake/branch/file_utils"
+require "pwrake/option/option"
+
 module Pwrake
 
   class Branch
 
     def initialize(opts,r,w)
-
+      Thread.abort_on_exception = true
       @option = opts
       @task_q = {} # worker_id => FiberQueue.new
       @shells = []
       @ior = r
       @iow = w
-      @
-      @
-      @
-      @wk_comm = {}
-      @wk_hdl_set = HandlerSet.new
+      @selector = NBIO::Selector.new
+      @master_rd = NBIO::Reader.new(@selector,@ior)
+      @master_wt = NBIO::Writer.new(@selector,@iow)
       @shell_start_interval = @option['SHELL_START_INTERVAL']
     end
 
@@ -24,8 +29,8 @@ module Pwrake
       setup_shell
       setup_fiber
       setup_master_channel
-      @
-      Log.debug "
+      @cs.run("task execution")
+      Log.debug "Branch#run end"
     end
 
     attr_reader :logger
@@ -49,83 +54,73 @@ module Pwrake
       else
         @logger.level = Logger::WARN
       end
-    end
-
-    def setup_worker
-      s = @ior.gets
-      if s.chomp != "host_list_begin"
-        raise "Branch#setup_worker: recv=#{s.chomp} expected=host_list_begin"
-      end
 
       if dir = @option['LOG_DIR']
         fn = File.join(dir,@option["COMMAND_CSV_FILE"])
         Shell.profiler.open(fn,@option['GNU_TIME'],@option['PLOT_PARALLELISM'])
       end
+    end
 
-
-
-
-
-
-
-
-
-          comm = WorkerCommunicator.new(id,host,ncore,@runner,@option)
-          comm.setup_connection(worker_code)
-          @wk_comm[id] = comm
-          @wk_hdl_set << comm.handler
-          @task_q[id] = FiberQueue.new
-        else
-          raise "Branch#setup_worker: recv=#{s.chomp} expected=host:id hostname ncore"
-        end
+    def setup_worker
+      @cs = CommunicatorSet.new(@master_rd,@selector,@option.worker_option)
+      @cs.create_communicators
+      worker_code = read_worker_progs(@option.worker_progs)
+      @cs.each_value do |comm|
+        Fiber.new do
+          comm.connect(worker_code)
+        end.resume
       end
+      @cs.run("connect to workers")
+      #
+      Fiber.new do
+        @cs.each_value do |comm|
+          # set WorkerChannel#ncore at Master
+          @master_wt.put_line "ncore:#{comm.id}:#{comm.ncore}"
+        end
+        @master_wt.put_line "ncore:done"
+      end.resume
+      @selector.run
+    end
 
-
-
-
-
-
-
-
-
-
+    def read_worker_progs(worker_progs)
+      code = ""
+      modpath = {}
+      worker_progs.each do |f|
+        m = f.split(/\//).first
+        if !modpath[m]
+          $LOAD_PATH.each do |x|
+            if File.directory?(File.join(x,m))
+              modpath[m] = x
               break
             end
           end
-
-
-
-
-
-
-
-
-
-
-      end
-
-      # ncore
-      @wk_comm.each_value do |comm|
-        # set WorkerChannel#ncore at Master
-        @master_hdl.put_line "ncore:#{comm.id}:#{comm.ncore}"
+          if !modpath[m]
+            raise RuntimeError,"load path for module #{m} not found"
+          end
+        end
+        path = File.join(modpath[m],f)
+        path += '.rb' if /\.rb$/ !~ path
+        if !File.exist?(path)
+          raise RuntimeError,"program file #{path} not found"
+        end
+        code << IO.read(path) + "\n"
       end
-
+      code
     end
 
     def setup_shell
       @shells = []
-
-
-      @wk_comm.each_value do |comm|
+      @cs.each_value do |comm|
+        @task_q[comm.id] = task_q = FiberQueue.new
         comm.ncore.times do
-          chan =
-
-          shell = Shell.new(chan,@task_q[comm.id],@option.worker_option)
-          @shells << shell
+          chan = comm.new_channel
+          shell = Shell.new(chan,comm,task_q,@option.worker_option)
           # wait for remote shell open
           Fiber.new do
-            if
-
+            if shell.open
+              @shells << shell
+            else
+              @master_wt.put_line "retire:#{comm.id}"
             end
             Log.debug "Branch#setup_shells: end of fiber to open shell"
           end.resume
@@ -133,24 +128,20 @@ module Pwrake
         end
       end
 
-      @
-
-      if !errors.empty?
-        raise RuntimeError,"Failed to start workers: #{errors.inspect}"
-      end
+      @cs.run("setup shells")
     end
 
     def setup_fiber
       # start fibers
       @shells.each do |shell|
-        shell.create_fiber(@
+        shell.create_fiber(@master_wt).resume
       end
       Log.debug "all fiber started"
 
-      @
+      @cs.each_value do |comm|
         #comm.start_default_fiber
         Fiber.new do
-          while s = comm.
+          while s = comm.reader.get_line
             break unless comm.common_line(s)
           end
           Log.debug "Branch#setup_fiber: end of fiber for default channel"
@@ -158,34 +149,52 @@ module Pwrake
       end
 
       # setup end
-      @
-      comm.
+      @cs.each_value do |comm|
+        comm.writer.put_line "setup_end"
       end
 
-      @
+      @master_wt.put_line "branch_setup:done"
       Log.debug "Branch#setup_fiber: setup end"
     end
 
     def setup_master_channel
       Fiber.new do
-        while s = @
+        while s = @master_rd.get_line
           # receive command from main pwrake
           Log.debug "Branch:recv #{s.inspect} from master"
           case s
           #
           when /^(\d+):(.+)$/o
             id, tname = $1,$2
-
+            begin
+              task_name = tname.sub(/^\d+:/,"")
+              @task_q[id].enq(tname)
+            rescue => e
+              Log.error Log.bt(e)
+              ret="taskfail:#{id}:#{task_name}"
+              Log.debug "fail to enq task_q[#{id}], ret=#{ret}"
+              @master_wt.put_line(ret)
+            end
           #
           when /^exit$/
-
-
-            @
+            #@task_q.each_value{|q| q.finish}
+            #@cs.drop_all
+            @cs.finish_shells
+
+            #@shells.each{|shell| shell.exit} # just for comfirm
+            #@selector.halt # should halt after exited
             break
           #
+          when /^drop:(.*)$/o
+            id = $1
+            taskq = @task_q.delete(id)
+            Log.debug "drop @task_q[#{id}]=#{taskq.inspect}"
+            @cs.drop(id)
+          #
           when /^kill:(.*)$/o
             sig = $1
             kill(sig)
+          #
           else
             Log.debug "Branch: invalid line from master: #{s}"
           end
@@ -196,16 +205,16 @@ module Pwrake
 
     def kill(sig="INT")
       Log.warn "Branch#kill #{sig}"
-      @
+      @cs.kill(sig)
     end
 
     def finish
       return if @finished
       @finished = true
-      Log.debug "Branch#finish: begin"
-      @
+      #Log.debug "Branch#finish: begin"
+      @cs.exit
       Log.debug "Branch#finish: worker exited"
-      @
+      @master_wt.put_line "exited"
       Log.debug "Branch#finish: sent 'exited' to master"
     end
 
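
The branch.rb changes above replace the old handler/runner classes with fibers multiplexed by the new NBIO selector (pwrake/nbio): each communicator's reader and writer run in fibers that suspend when their IO would block, and @selector.run (or @cs.run) resumes them as descriptors become ready. The following is a self-contained sketch of that pattern using only Ruby's Fiber and IO.select, not pwrake's actual NBIO API:

```ruby
# Minimal fiber-plus-selector sketch (illustrative only, not NBIO's API):
# a fiber reads lines from a pipe and yields whenever no data is buffered;
# the select loop resumes it when the IO becomes readable.
require "fiber"

r, w = IO.pipe
waiting = {}                       # io => fiber blocked on that io

reader = Fiber.new do
  buf = ""
  loop do
    until (i = buf.index("\n"))
      waiting[r] = Fiber.current   # register interest and yield to the loop
      Fiber.yield
      buf << r.read_nonblock(4096)
    end
    line = buf.slice!(0, i + 1).chomp
    break if line == "exit"
    puts "got: #{line}"
  end
end

w.puts "hello"
w.puts "exit"

reader.resume                      # run until the fiber first suspends
until waiting.empty?               # the "selector": resume fibers whose IO is ready
  ready, = IO.select(waiting.keys)
  ready.each { |io| waiting.delete(io).resume }
end
```

The appeal of this design is that each per-worker conversation stays sequential inside its own fiber while the process as a whole remains single-threaded and non-blocking.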