pwrake 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +22 -9
- data/bin/gfwhere-pipe +33 -9
- data/bin/pwrake +5 -2
- data/bin/pwrake_branch +5 -3
- data/lib/pwrake/branch/branch.rb +95 -86
- data/lib/pwrake/branch/branch_application.rb +4 -0
- data/lib/pwrake/branch/communicator.rb +173 -0
- data/lib/pwrake/branch/communicator_set.rb +100 -0
- data/lib/pwrake/branch/fiber_queue.rb +10 -0
- data/lib/pwrake/branch/shell.rb +68 -24
- data/lib/pwrake/branch/shell_profiler.rb +2 -0
- data/lib/pwrake/gfarm/gfarm_postprocess.rb +8 -7
- data/lib/pwrake/logger.rb +5 -0
- data/lib/pwrake/master/master.rb +190 -87
- data/lib/pwrake/master/master_application.rb +8 -0
- data/lib/pwrake/nbio.rb +525 -0
- data/lib/pwrake/option/host_map.rb +36 -4
- data/lib/pwrake/option/option.rb +7 -1
- data/lib/pwrake/option/option_filesystem.rb +13 -3
- data/lib/pwrake/queue/locality_aware_queue.rb +41 -6
- data/lib/pwrake/queue/queue_array.rb +31 -11
- data/lib/pwrake/queue/task_queue.rb +15 -18
- data/lib/pwrake/report/report.rb +2 -0
- data/lib/pwrake/task/task_algorithm.rb +4 -1
- data/lib/pwrake/task/task_manager.rb +2 -0
- data/lib/pwrake/task/task_property.rb +1 -0
- data/lib/pwrake/task/task_wrapper.rb +40 -21
- data/lib/pwrake/version.rb +1 -1
- data/lib/pwrake/worker/invoker.rb +4 -29
- data/pwrake.gemspec +3 -2
- metadata +24 -12
- data/lib/pwrake/branch.rb +0 -22
- data/lib/pwrake/branch/worker_communicator.rb +0 -104
- data/lib/pwrake/iomux/channel.rb +0 -70
- data/lib/pwrake/iomux/handler.rb +0 -124
- data/lib/pwrake/iomux/handler_set.rb +0 -35
- data/lib/pwrake/iomux/runner.rb +0 -62
- data/lib/pwrake/master.rb +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 91439940b04a74b462713ec03d5f5bd4f282965f
|
4
|
+
data.tar.gz: 06a86ba801618144354c31957a73430a3cb00725
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b9ce53904fa158730c7b0a58e68b70fc18e6df5bd73b2114b04064376bce3cfd02de3b36ac7996a077a969e0790dd6ce5c9006344307950fef9fe42e2255512
|
7
|
+
data.tar.gz: a8e8ba98341678f4a9a4744236a31a7b6769287fc6dc5554a1408d960d7b915c1397e88f8d957e0b39f9331d73df2a6dc50696bd4972c048257c543d214e9cc4
|
data/README.md
CHANGED
@@ -18,20 +18,31 @@ Parallel Workflow extension for Rake, runs on multicores, clusters, clouds.
|
|
18
18
|
* Pwrake automatically connects to remote hosts using SSH. You do not need to start a daemon.
|
19
19
|
* Remote host names and the number of cores to use are provided in a hostfile.
|
20
20
|
* [Gfarm file system](http://sourceforge.net/projects/gfarm/) utilizes storage of compute nodes. It provides high-performance parallel I/O.
|
21
|
-
* Parallel I/O access to local
|
21
|
+
* Parallel I/O access to local storage of compute nodes enables a scalable increase in I/O performance.
|
22
22
|
* Gfarm schedules a compute node to store an output file, to local storage.
|
23
23
|
* Pwrake schedules a compute node to execute a task, to a node where input files are stored.
|
24
24
|
* Other supports for Gfarm: Automatic mount of the Gfarm file system, etc.
|
25
25
|
|
26
26
|
## Installation
|
27
27
|
|
28
|
-
|
28
|
+
Install with RubyGems:
|
29
|
+
|
30
|
+
$ gem install pwrake
|
31
|
+
|
32
|
+
Or download the source tgz/zip, expand it, cd to the subdirectory, and install:
|
29
33
|
|
30
34
|
$ ruby setup.rb
|
31
35
|
|
32
|
-
|
36
|
+
In the latter case, you need to install [Parallel](https://github.com/grosser/parallel) manually. It is required by Pwrake to obtain the processor count.
|
37
|
+
|
38
|
+
If you use rbenv, your system may fail to find the pwrake command after installation:
|
39
|
+
|
40
|
+
-bash: pwrake: command not found
|
41
|
+
|
42
|
+
In this case, you need to rehash the command paths:
|
43
|
+
|
44
|
+
$ rbenv rehash
|
33
45
|
|
34
|
-
$ gem install pwrake
|
35
46
|
|
36
47
|
## Usage
|
37
48
|
|
@@ -73,7 +84,7 @@ Or, gem install:
|
|
73
84
|
-A, --disable-affinity [Pw] Turn OFF affinity (AFFINITY=off)
|
74
85
|
-S, --disable-steal [Pw] Turn OFF task steal
|
75
86
|
-d, --debug [Pw] Output Debug messages
|
76
|
-
--pwrake-conf [FILE] [Pw] Pwrake
|
87
|
+
--pwrake-conf [FILE] [Pw] Pwrake configuration file in YAML
|
77
88
|
--show-conf, --show-config [Pw] Show Pwrake configuration options
|
78
89
|
--report LOGDIR [Pw] Report workflow statistics from LOGDIR to HTML and exit.
|
79
90
|
--clear-gfarm2fs [Pw] Clear gfarm2fs mountpoints left after failure.
|
@@ -106,8 +117,8 @@ Or, gem install:
|
|
106
117
|
SHELL_COMMAND default=$SHELL
|
107
118
|
SHELL_RC Run-Command when shell starts
|
108
119
|
PASS_ENV (Array) Environment variables passed to SSH
|
109
|
-
HEARTBEAT
|
110
|
-
RETRY default=
|
120
|
+
HEARTBEAT default=240 - Heartbeat interval in seconds
|
121
|
+
RETRY default=1 - The number of retries
|
111
122
|
FAILED_TARGET rename(default)|delete|leave - Treatment of failed target files
|
112
123
|
FAILURE_TERMINATION wait(default)|kill|continue - Behavior of other tasks when a task fails
|
113
124
|
QUEUE_PRIORITY LIHR(default)|FIFO|LIFO|RANK
|
@@ -186,6 +197,8 @@ Properties (The leftmost item is default):
|
|
186
197
|
|
187
198
|
## Acknowledgment
|
188
199
|
|
189
|
-
This work is supported by
|
190
|
-
* JST CREST, research
|
200
|
+
This work is supported by:
|
201
|
+
* JST CREST, research themes:
|
202
|
+
* ["Statistical Computational Cosmology with Big Astronomical Imaging Data,"](http://www.jst.go.jp/kisoken/crest/en/project/44/14532369.html)
|
203
|
+
* ["System Software for Post Petascale Data Intensive Science,"](http://postpeta.jst.go.jp/en/researchers/tatebe22.html)
|
191
204
|
* MEXT Promotion of Research for Next Generation IT Infrastructure "Resources Linkage for e-Science (RENKEI)."
|
data/bin/gfwhere-pipe
CHANGED
@@ -65,13 +65,6 @@ module Gfarm
|
|
65
65
|
end
|
66
66
|
|
67
67
|
|
68
|
-
def connection(*args)
|
69
|
-
Connection.set_args(args)
|
70
|
-
Connection.instance
|
71
|
-
end
|
72
|
-
module_function :connection
|
73
|
-
|
74
|
-
|
75
68
|
class Connection
|
76
69
|
include Singleton
|
77
70
|
|
@@ -124,10 +117,14 @@ module Gfarm
|
|
124
117
|
FFI.gfs_replica_info_free(ptr)
|
125
118
|
end
|
126
119
|
|
120
|
+
def self.set_opts(opts)
|
121
|
+
@@opts = opts
|
122
|
+
end
|
123
|
+
|
127
124
|
def initialize(gfarm, path)
|
128
125
|
@gfarm = gfarm
|
129
126
|
@realpath = @gfarm.realpath_by_gfarm2fs(path)
|
130
|
-
flag =
|
127
|
+
flag = @@opts.flags
|
131
128
|
ptr = ::FFI::MemoryPointer.new(:pointer, 1)
|
132
129
|
e = FFI.gfs_replica_info_by_name(@realpath, flag, ptr)
|
133
130
|
if e != GFARM_ERR_NO_ERROR
|
@@ -147,13 +144,40 @@ module Gfarm
|
|
147
144
|
end
|
148
145
|
end
|
149
146
|
|
147
|
+
class Options
|
148
|
+
INCLUDING_DEAD_HOST = 1
|
149
|
+
INCLUDING_INCOMPLETE_COPY = 2
|
150
|
+
INCLUDING_DEAD_COPY = 4
|
151
|
+
|
152
|
+
def initialize(argv)
|
153
|
+
@args = []
|
154
|
+
@flags = 0
|
155
|
+
argv.each do |x|
|
156
|
+
case x
|
157
|
+
when "-i"
|
158
|
+
@including_incomplete_copy = true
|
159
|
+
@flags |= INCLUDING_INCOMPLETE_COPY
|
160
|
+
else
|
161
|
+
@args << x
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
attr_reader :args
|
167
|
+
attr_reader :flags
|
168
|
+
attr_reader :including_incomplete_copy
|
169
|
+
end
|
170
|
+
|
150
171
|
end
|
151
172
|
|
152
173
|
[:PIPE,:TERM,:INT].each do |sig|
|
153
174
|
Signal.trap(sig, "EXIT")
|
154
175
|
end
|
155
176
|
|
156
|
-
|
177
|
+
opts = Gfarm::Options.new(ARGV)
|
178
|
+
Gfarm::ReplicaInfo.set_opts(opts)
|
179
|
+
Gfarm::Connection.set_args(opts.args)
|
180
|
+
gfarm = Gfarm::Connection.instance
|
157
181
|
|
158
182
|
while path=$stdin.gets
|
159
183
|
path.chomp!
|
data/bin/pwrake
CHANGED
@@ -10,8 +10,11 @@ end
|
|
10
10
|
libpath = File.absolute_path(File.dirname(__FILE__))+"/../lib"
|
11
11
|
$LOAD_PATH.unshift libpath
|
12
12
|
|
13
|
-
require "pwrake/
|
14
|
-
require "pwrake/
|
13
|
+
require "pwrake/version"
|
14
|
+
require "pwrake/master/master_application"
|
15
|
+
require "pwrake/task/task_manager"
|
16
|
+
require "pwrake/task/task_algorithm"
|
17
|
+
require "pwrake/branch/branch_application"
|
15
18
|
class Rake::Application
|
16
19
|
include Pwrake::BranchApplication
|
17
20
|
prepend Pwrake::MasterApplication
|
data/bin/pwrake_branch
CHANGED
@@ -10,13 +10,15 @@ end
|
|
10
10
|
libpath = File.absolute_path(File.dirname(__FILE__))+"/../lib"
|
11
11
|
$LOAD_PATH.unshift libpath
|
12
12
|
|
13
|
-
require "pwrake/branch"
|
13
|
+
require "pwrake/branch/branch_application"
|
14
14
|
class Rake::Application
|
15
15
|
include Pwrake::BranchApplication
|
16
16
|
end
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
require "pwrake/task/task_algorithm"
|
19
|
+
class Rake::Task
|
20
|
+
include Pwrake::TaskAlgorithm
|
21
|
+
end
|
20
22
|
|
21
23
|
# does NOT exit when writing to broken pipe
|
22
24
|
Signal.trap(:PIPE, "IGNORE")
|
data/lib/pwrake/branch/branch.rb
CHANGED
@@ -1,19 +1,24 @@
|
|
1
|
+
require "pwrake/nbio"
|
2
|
+
require "pwrake/branch/communicator_set"
|
3
|
+
require "pwrake/branch/fiber_queue"
|
4
|
+
require "pwrake/branch/shell"
|
5
|
+
require "pwrake/branch/file_utils"
|
6
|
+
require "pwrake/option/option"
|
7
|
+
|
1
8
|
module Pwrake
|
2
9
|
|
3
10
|
class Branch
|
4
11
|
|
5
12
|
def initialize(opts,r,w)
|
6
|
-
|
13
|
+
Thread.abort_on_exception = true
|
7
14
|
@option = opts
|
8
15
|
@task_q = {} # worker_id => FiberQueue.new
|
9
16
|
@shells = []
|
10
17
|
@ior = r
|
11
18
|
@iow = w
|
12
|
-
@
|
13
|
-
@
|
14
|
-
@
|
15
|
-
@wk_comm = {}
|
16
|
-
@wk_hdl_set = HandlerSet.new
|
19
|
+
@selector = NBIO::Selector.new
|
20
|
+
@master_rd = NBIO::Reader.new(@selector,@ior)
|
21
|
+
@master_wt = NBIO::Writer.new(@selector,@iow)
|
17
22
|
@shell_start_interval = @option['SHELL_START_INTERVAL']
|
18
23
|
end
|
19
24
|
|
@@ -24,8 +29,8 @@ module Pwrake
|
|
24
29
|
setup_shell
|
25
30
|
setup_fiber
|
26
31
|
setup_master_channel
|
27
|
-
@
|
28
|
-
Log.debug "
|
32
|
+
@cs.run("task execution")
|
33
|
+
Log.debug "Branch#run end"
|
29
34
|
end
|
30
35
|
|
31
36
|
attr_reader :logger
|
@@ -49,83 +54,73 @@ module Pwrake
|
|
49
54
|
else
|
50
55
|
@logger.level = Logger::WARN
|
51
56
|
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def setup_worker
|
55
|
-
s = @ior.gets
|
56
|
-
if s.chomp != "host_list_begin"
|
57
|
-
raise "Branch#setup_worker: recv=#{s.chomp} expected=host_list_begin"
|
58
|
-
end
|
59
57
|
|
60
58
|
if dir = @option['LOG_DIR']
|
61
59
|
fn = File.join(dir,@option["COMMAND_CSV_FILE"])
|
62
60
|
Shell.profiler.open(fn,@option['GNU_TIME'],@option['PLOT_PARALLELISM'])
|
63
61
|
end
|
62
|
+
end
|
64
63
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
comm = WorkerCommunicator.new(id,host,ncore,@runner,@option)
|
74
|
-
comm.setup_connection(worker_code)
|
75
|
-
@wk_comm[id] = comm
|
76
|
-
@wk_hdl_set << comm.handler
|
77
|
-
@task_q[id] = FiberQueue.new
|
78
|
-
else
|
79
|
-
raise "Branch#setup_worker: recv=#{s.chomp} expected=host:id hostname ncore"
|
80
|
-
end
|
64
|
+
def setup_worker
|
65
|
+
@cs = CommunicatorSet.new(@master_rd,@selector,@option.worker_option)
|
66
|
+
@cs.create_communicators
|
67
|
+
worker_code = read_worker_progs(@option.worker_progs)
|
68
|
+
@cs.each_value do |comm|
|
69
|
+
Fiber.new do
|
70
|
+
comm.connect(worker_code)
|
71
|
+
end.resume
|
81
72
|
end
|
73
|
+
@cs.run("connect to workers")
|
74
|
+
#
|
75
|
+
Fiber.new do
|
76
|
+
@cs.each_value do |comm|
|
77
|
+
# set WorkerChannel#ncore at Master
|
78
|
+
@master_wt.put_line "ncore:#{comm.id}:#{comm.ncore}"
|
79
|
+
end
|
80
|
+
@master_wt.put_line "ncore:done"
|
81
|
+
end.resume
|
82
|
+
@selector.run
|
83
|
+
end
|
82
84
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
85
|
+
def read_worker_progs(worker_progs)
|
86
|
+
code = ""
|
87
|
+
modpath = {}
|
88
|
+
worker_progs.each do |f|
|
89
|
+
m = f.split(/\//).first
|
90
|
+
if !modpath[m]
|
91
|
+
$LOAD_PATH.each do |x|
|
92
|
+
if File.directory?(File.join(x,m))
|
93
|
+
modpath[m] = x
|
92
94
|
break
|
93
95
|
end
|
94
96
|
end
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
end
|
106
|
-
|
107
|
-
# ncore
|
108
|
-
@wk_comm.each_value do |comm|
|
109
|
-
# set WorkerChannel#ncore at Master
|
110
|
-
@master_hdl.put_line "ncore:#{comm.id}:#{comm.ncore}"
|
97
|
+
if !modpath[m]
|
98
|
+
raise RuntimeError,"load path for module #{m} not found"
|
99
|
+
end
|
100
|
+
end
|
101
|
+
path = File.join(modpath[m],f)
|
102
|
+
path += '.rb' if /\.rb$/ !~ path
|
103
|
+
if !File.exist?(path)
|
104
|
+
raise RuntimeError,"program file #{path} not found"
|
105
|
+
end
|
106
|
+
code << IO.read(path) + "\n"
|
111
107
|
end
|
112
|
-
|
108
|
+
code
|
113
109
|
end
|
114
110
|
|
115
111
|
def setup_shell
|
116
112
|
@shells = []
|
117
|
-
|
118
|
-
|
119
|
-
@wk_comm.each_value do |comm|
|
113
|
+
@cs.each_value do |comm|
|
114
|
+
@task_q[comm.id] = task_q = FiberQueue.new
|
120
115
|
comm.ncore.times do
|
121
|
-
chan =
|
122
|
-
|
123
|
-
shell = Shell.new(chan,@task_q[comm.id],@option.worker_option)
|
124
|
-
@shells << shell
|
116
|
+
chan = comm.new_channel
|
117
|
+
shell = Shell.new(chan,comm,task_q,@option.worker_option)
|
125
118
|
# wait for remote shell open
|
126
119
|
Fiber.new do
|
127
|
-
if
|
128
|
-
|
120
|
+
if shell.open
|
121
|
+
@shells << shell
|
122
|
+
else
|
123
|
+
@master_wt.put_line "retire:#{comm.id}"
|
129
124
|
end
|
130
125
|
Log.debug "Branch#setup_shells: end of fiber to open shell"
|
131
126
|
end.resume
|
@@ -133,24 +128,20 @@ module Pwrake
|
|
133
128
|
end
|
134
129
|
end
|
135
130
|
|
136
|
-
@
|
137
|
-
|
138
|
-
if !errors.empty?
|
139
|
-
raise RuntimeError,"Failed to start workers: #{errors.inspect}"
|
140
|
-
end
|
131
|
+
@cs.run("setup shells")
|
141
132
|
end
|
142
133
|
|
143
134
|
def setup_fiber
|
144
135
|
# start fibers
|
145
136
|
@shells.each do |shell|
|
146
|
-
shell.create_fiber(@
|
137
|
+
shell.create_fiber(@master_wt).resume
|
147
138
|
end
|
148
139
|
Log.debug "all fiber started"
|
149
140
|
|
150
|
-
@
|
141
|
+
@cs.each_value do |comm|
|
151
142
|
#comm.start_default_fiber
|
152
143
|
Fiber.new do
|
153
|
-
while s = comm.
|
144
|
+
while s = comm.reader.get_line
|
154
145
|
break unless comm.common_line(s)
|
155
146
|
end
|
156
147
|
Log.debug "Branch#setup_fiber: end of fiber for default channel"
|
@@ -158,34 +149,52 @@ module Pwrake
|
|
158
149
|
end
|
159
150
|
|
160
151
|
# setup end
|
161
|
-
@
|
162
|
-
comm.
|
152
|
+
@cs.each_value do |comm|
|
153
|
+
comm.writer.put_line "setup_end"
|
163
154
|
end
|
164
155
|
|
165
|
-
@
|
156
|
+
@master_wt.put_line "branch_setup:done"
|
166
157
|
Log.debug "Branch#setup_fiber: setup end"
|
167
158
|
end
|
168
159
|
|
169
160
|
def setup_master_channel
|
170
161
|
Fiber.new do
|
171
|
-
while s = @
|
162
|
+
while s = @master_rd.get_line
|
172
163
|
# receive command from main pwrake
|
173
164
|
Log.debug "Branch:recv #{s.inspect} from master"
|
174
165
|
case s
|
175
166
|
#
|
176
167
|
when /^(\d+):(.+)$/o
|
177
168
|
id, tname = $1,$2
|
178
|
-
|
169
|
+
begin
|
170
|
+
task_name = tname.sub(/^\d+:/,"")
|
171
|
+
@task_q[id].enq(tname)
|
172
|
+
rescue => e
|
173
|
+
Log.error Log.bt(e)
|
174
|
+
ret="taskfail:#{id}:#{task_name}"
|
175
|
+
Log.debug "fail to enq task_q[#{id}], ret=#{ret}"
|
176
|
+
@master_wt.put_line(ret)
|
177
|
+
end
|
179
178
|
#
|
180
179
|
when /^exit$/
|
181
|
-
|
182
|
-
|
183
|
-
@
|
180
|
+
#@task_q.each_value{|q| q.finish}
|
181
|
+
#@cs.drop_all
|
182
|
+
@cs.finish_shells
|
183
|
+
|
184
|
+
#@shells.each{|shell| shell.exit} # just for comfirm
|
185
|
+
#@selector.halt # should halt after exited
|
184
186
|
break
|
185
187
|
#
|
188
|
+
when /^drop:(.*)$/o
|
189
|
+
id = $1
|
190
|
+
taskq = @task_q.delete(id)
|
191
|
+
Log.debug "drop @task_q[#{id}]=#{taskq.inspect}"
|
192
|
+
@cs.drop(id)
|
193
|
+
#
|
186
194
|
when /^kill:(.*)$/o
|
187
195
|
sig = $1
|
188
196
|
kill(sig)
|
197
|
+
#
|
189
198
|
else
|
190
199
|
Log.debug "Branch: invalid line from master: #{s}"
|
191
200
|
end
|
@@ -196,16 +205,16 @@ module Pwrake
|
|
196
205
|
|
197
206
|
def kill(sig="INT")
|
198
207
|
Log.warn "Branch#kill #{sig}"
|
199
|
-
@
|
208
|
+
@cs.kill(sig)
|
200
209
|
end
|
201
210
|
|
202
211
|
def finish
|
203
212
|
return if @finished
|
204
213
|
@finished = true
|
205
|
-
Log.debug "Branch#finish: begin"
|
206
|
-
@
|
214
|
+
#Log.debug "Branch#finish: begin"
|
215
|
+
@cs.exit
|
207
216
|
Log.debug "Branch#finish: worker exited"
|
208
|
-
@
|
217
|
+
@master_wt.put_line "exited"
|
209
218
|
Log.debug "Branch#finish: sent 'exited' to master"
|
210
219
|
end
|
211
220
|
|