ood_core 0.5.1 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/lib/ood_core/batch_connect/template.rb +17 -6
- data/lib/ood_core/batch_connect/templates/vnc.rb +2 -2
- data/lib/ood_core/job/adapters/drmaa.rb +1002 -0
- data/lib/ood_core/job/adapters/helper.rb +18 -0
- data/lib/ood_core/job/adapters/lsf/batch.rb +4 -3
- data/lib/ood_core/job/adapters/lsf.rb +4 -2
- data/lib/ood_core/job/adapters/pbspro.rb +19 -8
- data/lib/ood_core/job/adapters/sge/batch.rb +203 -0
- data/lib/ood_core/job/adapters/sge/helper.rb +65 -0
- data/lib/ood_core/job/adapters/sge/qstat_xml_j_r_listener.rb +116 -0
- data/lib/ood_core/job/adapters/sge/qstat_xml_r_listener.rb +138 -0
- data/lib/ood_core/job/adapters/sge.rb +163 -0
- data/lib/ood_core/job/adapters/slurm.rb +16 -5
- data/lib/ood_core/job/adapters/torque/attributes.rb +109 -0
- data/lib/ood_core/job/adapters/torque/batch.rb +470 -0
- data/lib/ood_core/job/adapters/torque/error.rb +403 -0
- data/lib/ood_core/job/adapters/torque/ffi.rb +430 -0
- data/lib/ood_core/job/adapters/torque.rb +23 -18
- data/lib/ood_core/job/status.rb +3 -13
- data/lib/ood_core/refinements/drmaa_extensions.rb +21 -0
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +3 -3
- metadata +23 -9
@@ -0,0 +1,163 @@
|
|
1
|
+
require "ood_core/refinements/array_extensions"
|
2
|
+
require "ood_core/refinements/hash_extensions"
|
3
|
+
|
4
|
+
module OodCore
|
5
|
+
module Job
|
6
|
+
class Factory
|
7
|
+
using Refinements::HashExtensions
|
8
|
+
|
9
|
+
# Build the Sun Grid Engine adapter from a configuration
|
10
|
+
# @param config [#to_h] the configuration for job adapter
|
11
|
+
# @option config [Object] :cluster (nil) The cluster to communicate with
|
12
|
+
# @option config [Object] :conf (nil) Path to the SGE conf
|
13
|
+
# @option config [Object] :bin (nil) Path to SGE client binaries
|
14
|
+
# @option config [Object] :sge_root (nil) Path to SGE root, note that
|
15
|
+
# this may be nil, but must be set to use the DRMAA API, and there is a
|
16
|
+
# severe performance penalty calling Sge#info without using DRMAA.
|
17
|
+
# @option config [#to_h] :bin_overrides ({}) Optional overrides to SGE client executables
|
18
|
+
def self.build_sge(config)
|
19
|
+
batch = Adapters::Sge::Batch.new(config.to_h.symbolize_keys)
|
20
|
+
Adapters::Sge.new(batch: batch)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
module Adapters
|
25
|
+
class Sge < Adapter
|
26
|
+
using Refinements::HashExtensions
|
27
|
+
using Refinements::ArrayExtensions
|
28
|
+
|
29
|
+
require "ood_core/job/adapters/sge/batch"
|
30
|
+
require "ood_core/job/adapters/sge/helper"
|
31
|
+
|
32
|
+
# The cluster of the Sun Grid Engine batch server
|
33
|
+
# @example UCLA's hoffman2 cluster
|
34
|
+
# my_batch.cluster #=> "hoffman2"
|
35
|
+
# @return [String, nil] the cluster name
|
36
|
+
attr_reader :cluster
|
37
|
+
|
38
|
+
# The path to the Sun Grid Engine configuration file
|
39
|
+
# @example For Sun Grid Engine 8.0.1
|
40
|
+
# my_batch.conf.to_s #=> "/u/systems/UGE8.0.1vm/h2.conf"
|
41
|
+
# @return [Pathname, nil] path to gridengine conf
|
42
|
+
attr_reader :conf
|
43
|
+
|
44
|
+
# The path to the Sun Grid Engine client installation binaries
|
45
|
+
# @example For Sun Grid Engine 8.0.1
|
46
|
+
# my_batch.bin.to_s #=> "/u/systems/UGE8.0.1vm/bin/lx-amd64/"
|
47
|
+
# @return [Pathname] path to SGE binaries
|
48
|
+
attr_reader :bin
|
49
|
+
|
50
|
+
# The root exception class that all Sun Grid Engine-specific exceptions inherit
|
51
|
+
# from
|
52
|
+
class Error < StandardError; end
|
53
|
+
|
54
|
+
# @param batch [Adapters::Sge::Batch]
|
55
|
+
def initialize(batch:)
|
56
|
+
@batch = batch
|
57
|
+
@helper = Sge::Helper.new
|
58
|
+
end
|
59
|
+
|
60
|
+
# Submit a job with the attributes defined in the job template instance
|
61
|
+
# @example Submit job template to cluster
|
62
|
+
# solver_id = job_adapter.submit(solver_script)
|
63
|
+
# #=> "1234.server"
|
64
|
+
# @example Submit job that depends on previous job
|
65
|
+
# post_id = job_adapter.submit(
|
66
|
+
# post_script,
|
67
|
+
# afterok: solver_id
|
68
|
+
# )
|
69
|
+
# #=> "1235.server"
|
70
|
+
# @param script [Script] script object that describes the
|
71
|
+
# script and attributes for the submitted job
|
72
|
+
# @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
|
73
|
+
# at any point after dependent jobs have started execution
|
74
|
+
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
|
75
|
+
# execution only after dependent jobs have terminated with no errors
|
76
|
+
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
|
77
|
+
# execution only after dependent jobs have terminated with errors
|
78
|
+
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
|
79
|
+
# execution after dependent jobs have terminated
|
80
|
+
# @raise [JobAdapterError] if something goes wrong submitting a job
|
81
|
+
# @return [String] the job id returned after successfully submitting a job
|
82
|
+
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
|
83
|
+
# SGE supports job dependencies on job completion
|
84
|
+
args = @helper.batch_submit_args(script, after: after, afterok: afterok, afternotok: afternotok, afterany: afterany)
|
85
|
+
|
86
|
+
@batch.submit(script.content, args)
|
87
|
+
rescue Batch::Error => e
|
88
|
+
raise JobAdapterError, e.message
|
89
|
+
end
|
90
|
+
|
91
|
+
# Retrieve info for all jobs from the resource manager
|
92
|
+
# @return [Array<Info>] information describing submitted jobs
|
93
|
+
def info_all
|
94
|
+
@batch.get_all
|
95
|
+
rescue Batch::Error => e
|
96
|
+
raise JobAdapterError, e.message
|
97
|
+
end
|
98
|
+
|
99
|
+
# Retrieve info for all jobs for a given owner or owners from the
|
100
|
+
# resource manager
|
101
|
+
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
|
102
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
103
|
+
# @return [Array<Info>] information describing submitted jobs
|
104
|
+
def info_where_owner(owner)
|
105
|
+
owner = Array.wrap(owner).map(&:to_s).join(',')
|
106
|
+
@batch.get_all(owner: owner)
|
107
|
+
rescue Batch::Error => e
|
108
|
+
raise JobAdapterError, e.message
|
109
|
+
end
|
110
|
+
|
111
|
+
# Retrieve job info from the resource manager
|
112
|
+
# @param id [#to_s] the id of the job
|
113
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
114
|
+
# @return [Info] information describing submitted job
|
115
|
+
def info(id)
|
116
|
+
@batch.get_info_enqueued_job(id)
|
117
|
+
rescue Batch::Error => e
|
118
|
+
raise JobAdapterError, e.message
|
119
|
+
end
|
120
|
+
|
121
|
+
# Retrieve job status from resource manager
|
122
|
+
# @param id [#to_s] the id of the job
|
123
|
+
# @raise [JobAdapterError] if something goes wrong getting the status of a job
|
124
|
+
# @return [Status] status of job
|
125
|
+
def status(id)
|
126
|
+
info(id).status
|
127
|
+
rescue Batch::Error => e
|
128
|
+
raise JobAdapterError, e.message
|
129
|
+
end
|
130
|
+
|
131
|
+
# Put the submitted job on hold
|
132
|
+
# @param id [#to_s] the id of the job
|
133
|
+
# @raise [JobAdapterError] if something goes wrong holding a job
|
134
|
+
# @return [void]
|
135
|
+
def hold(id)
|
136
|
+
@batch.hold(id.to_s)
|
137
|
+
rescue Batch::Error => e
|
138
|
+
raise JobAdapterError, e.message
|
139
|
+
end
|
140
|
+
|
141
|
+
# Release the job that is on hold
|
142
|
+
# @param id [#to_s] the id of the job
|
143
|
+
# @raise [JobAdapterError] if something goes wrong releasing a job
|
144
|
+
# @return [void]
|
145
|
+
def release(id)
|
146
|
+
@batch.release(id.to_s)
|
147
|
+
rescue Batch::Error => e
|
148
|
+
raise JobAdapterError, e.message
|
149
|
+
end
|
150
|
+
|
151
|
+
# Delete the submitted job
|
152
|
+
# @param id [#to_s] the id of the job
|
153
|
+
# @raise [JobAdapterError] if something goes wrong deleting a job
|
154
|
+
# @return [void]
|
155
|
+
def delete(id)
|
156
|
+
@batch.delete(id.to_s)
|
157
|
+
rescue Batch::Error => e
|
158
|
+
raise JobAdapterError, e.message
|
159
|
+
end
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require "time"
|
2
2
|
require "ood_core/refinements/hash_extensions"
|
3
|
+
require "ood_core/job/adapters/helper"
|
3
4
|
|
4
5
|
module OodCore
|
5
6
|
module Job
|
@@ -11,12 +12,14 @@ module OodCore
|
|
11
12
|
# @option config [Object] :cluster (nil) The cluster to communicate with
|
12
13
|
# @option config [Object] :conf (nil) Path to the slurm conf
|
13
14
|
# @option config [Object] :bin (nil) Path to slurm client binaries
|
15
|
+
# @option config [#to_h] :bin_overrides ({}) Optional overrides to Slurm client executables
|
14
16
|
def self.build_slurm(config)
|
15
17
|
c = config.to_h.symbolize_keys
|
16
18
|
cluster = c.fetch(:cluster, nil)
|
17
19
|
conf = c.fetch(:conf, nil)
|
18
20
|
bin = c.fetch(:bin, nil)
|
19
|
-
|
21
|
+
bin_overrides = c.fetch(:bin_overrides, {})
|
22
|
+
slurm = Adapters::Slurm::Batch.new(cluster: cluster, conf: conf, bin: bin, bin_overrides: bin_overrides)
|
20
23
|
Adapters::Slurm.new(slurm: slurm)
|
21
24
|
end
|
22
25
|
end
|
@@ -48,6 +51,12 @@ module OodCore
|
|
48
51
|
# @return [Pathname] path to slurm binaries
|
49
52
|
attr_reader :bin
|
50
53
|
|
54
|
+
# Optional overrides for Slurm client executables
|
55
|
+
# @example
|
56
|
+
# {'sbatch' => '/usr/local/bin/sbatch'}
|
57
|
+
# @return [Hash<String, String>]
|
58
|
+
attr_reader :bin_overrides
|
59
|
+
|
51
60
|
# The root exception class that all Slurm-specific exceptions inherit
|
52
61
|
# from
|
53
62
|
class Error < StandardError; end
|
@@ -55,10 +64,11 @@ module OodCore
|
|
55
64
|
# @param cluster [#to_s, nil] the cluster name
|
56
65
|
# @param conf [#to_s, nil] path to the slurm conf
|
57
66
|
# @param bin [#to_s] path to slurm installation binaries
|
58
|
-
def initialize(cluster: nil, bin: nil, conf: nil)
|
67
|
+
def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {})
|
59
68
|
@cluster = cluster && cluster.to_s
|
60
69
|
@conf = conf && Pathname.new(conf.to_s)
|
61
70
|
@bin = Pathname.new(bin.to_s)
|
71
|
+
@bin_overrides = bin_overrides
|
62
72
|
end
|
63
73
|
|
64
74
|
# Get a list of hashes detailing each of the jobs on the batch server
|
@@ -140,7 +150,7 @@ module OodCore
|
|
140
150
|
private
|
141
151
|
# Call a forked Slurm command for a given cluster
|
142
152
|
def call(cmd, *args, env: {}, stdin: "")
|
143
|
-
cmd =
|
153
|
+
cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
|
144
154
|
args = args.map(&:to_s)
|
145
155
|
args += ["-M", cluster] if cluster
|
146
156
|
env = env.to_h
|
@@ -150,11 +160,11 @@ module OodCore
|
|
150
160
|
end
|
151
161
|
|
152
162
|
# Fields requested from a formatted `squeue` call
|
163
|
+
# Note that the order of these fields is important
|
153
164
|
def fields
|
154
165
|
{
|
155
166
|
account: "%a",
|
156
167
|
job_id: "%A",
|
157
|
-
gres: "%b",
|
158
168
|
exec_host: "%B",
|
159
169
|
min_cpus: "%c",
|
160
170
|
cpus: "%C",
|
@@ -200,7 +210,8 @@ module OodCore
|
|
200
210
|
nice: "%y",
|
201
211
|
scheduled_nodes: "%Y",
|
202
212
|
sockets_cores_threads: "%z",
|
203
|
-
work_dir: "%Z"
|
213
|
+
work_dir: "%Z",
|
214
|
+
gres: "%b", # must come at the end to fix a bug with Slurm 18
|
204
215
|
}
|
205
216
|
end
|
206
217
|
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
class OodCore::Job::Adapters::Torque
|
2
|
+
# Maintains a constant Hash of defined PBS attribute types
|
3
|
+
# Includes:
|
4
|
+
# Attribute names used by user commands
|
5
|
+
# Additional job and general attribute names
|
6
|
+
# Additional queue attribute names
|
7
|
+
# Additional server attribute names
|
8
|
+
# Additional node attribute names
|
9
|
+
ATTR = {
|
10
|
+
# Attribute names used by user commands
|
11
|
+
a: :Execution_Time,
|
12
|
+
c: :Checkpoint,
|
13
|
+
e: :Error_Path,
|
14
|
+
f: :fault_tolerant,
|
15
|
+
g: :group_list,
|
16
|
+
h: :Hold_Types,
|
17
|
+
j: :Join_Path,
|
18
|
+
k: :Keep_Files,
|
19
|
+
l: :Resource_List,
|
20
|
+
m: :Mail_Points,
|
21
|
+
o: :Output_Path,
|
22
|
+
p: :Priority,
|
23
|
+
q: :destination,
|
24
|
+
r: :Rerunable,
|
25
|
+
t: :job_array_request,
|
26
|
+
array_id: :job_array_id,
|
27
|
+
u: :User_List,
|
28
|
+
v: :Variable_List,
|
29
|
+
A: :Account_Name,
|
30
|
+
args: :job_arguments,
|
31
|
+
reservation_id: :reservation_id,
|
32
|
+
login_node_id: :login_node_id,
|
33
|
+
login_prop: :login_property,
|
34
|
+
external_nodes: :external_nodes,
|
35
|
+
multi_req_alps: :multi_req_alps,
|
36
|
+
M: :Mail_Users,
|
37
|
+
N: :Job_Name,
|
38
|
+
S: :Shell_Path_List,
|
39
|
+
depend: :depend,
|
40
|
+
inter: :interactive,
|
41
|
+
stagein: :stagein,
|
42
|
+
stageout: :stageout,
|
43
|
+
jobtype: :jobtype,
|
44
|
+
submit_host: :submit_host,
|
45
|
+
init_work_dir: :init_work_dir,
|
46
|
+
|
47
|
+
# Additional job and general attribute names
|
48
|
+
ctime: :ctime,
|
49
|
+
exechost: :exec_host,
|
50
|
+
execport: :exec_port,
|
51
|
+
mtime: :mtime,
|
52
|
+
qtime: :qtime,
|
53
|
+
session: :session_id,
|
54
|
+
euser: :euser,
|
55
|
+
egroup: :egroup,
|
56
|
+
hashname: :hashname,
|
57
|
+
hopcount: :hop_count,
|
58
|
+
security: :security,
|
59
|
+
sched_hint: :sched_hint,
|
60
|
+
substate: :substate,
|
61
|
+
name: :Job_Name,
|
62
|
+
owner: :Job_Owner,
|
63
|
+
used: :resources_used,
|
64
|
+
state: :job_state,
|
65
|
+
queue: :queue,
|
66
|
+
server: :server,
|
67
|
+
maxrun: :max_running,
|
68
|
+
maxreport: :max_report,
|
69
|
+
total: :total_jobs,
|
70
|
+
comment: :comment,
|
71
|
+
cookie: :cookie,
|
72
|
+
qrank: :queue_rank,
|
73
|
+
altid: :alt_id,
|
74
|
+
etime: :etime,
|
75
|
+
exitstat: :exit_status,
|
76
|
+
forwardx11: :forward_x11,
|
77
|
+
submit_args: :submit_args,
|
78
|
+
tokens: :tokens,
|
79
|
+
netcounter: :net_counter,
|
80
|
+
umask: :umask,
|
81
|
+
start_time: :start_time,
|
82
|
+
start_count: :start_count,
|
83
|
+
checkpoint_dir: :checkpoint_dir,
|
84
|
+
checkpoint_name: :checkpoint_name,
|
85
|
+
checkpoint_time: :checkpoint_time,
|
86
|
+
checkpoint_restart_status: :checkpoint_restart_status,
|
87
|
+
restart_name: :restart_name,
|
88
|
+
comp_time: :comp_time,
|
89
|
+
reported: :reported,
|
90
|
+
intcmd: :inter_cmd,
|
91
|
+
job_radix: :job_radix,
|
92
|
+
sister_list: :sister_list,
|
93
|
+
total_runtime: :total_runtime,
|
94
|
+
P: :proxy_user,
|
95
|
+
node_exclusive: :node_exclusive,
|
96
|
+
exec_gpus: :exec_gpus,
|
97
|
+
exec_mics: :exec_mics,
|
98
|
+
J: :job_id,
|
99
|
+
pagg: :pagg_id,
|
100
|
+
system_start_time: :system_start_time,
|
101
|
+
gpu_flags: :gpu_flags,
|
102
|
+
|
103
|
+
# Additional queue attribute names
|
104
|
+
|
105
|
+
# Additional server attribute names
|
106
|
+
|
107
|
+
# Additional node attribute names
|
108
|
+
}
|
109
|
+
end
|