ood_core 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/lib/ood_core/batch_connect/template.rb +17 -6
- data/lib/ood_core/batch_connect/templates/vnc.rb +2 -2
- data/lib/ood_core/job/adapters/drmaa.rb +1002 -0
- data/lib/ood_core/job/adapters/helper.rb +18 -0
- data/lib/ood_core/job/adapters/lsf/batch.rb +4 -3
- data/lib/ood_core/job/adapters/lsf.rb +4 -2
- data/lib/ood_core/job/adapters/pbspro.rb +19 -8
- data/lib/ood_core/job/adapters/sge/batch.rb +203 -0
- data/lib/ood_core/job/adapters/sge/helper.rb +65 -0
- data/lib/ood_core/job/adapters/sge/qstat_xml_j_r_listener.rb +116 -0
- data/lib/ood_core/job/adapters/sge/qstat_xml_r_listener.rb +138 -0
- data/lib/ood_core/job/adapters/sge.rb +163 -0
- data/lib/ood_core/job/adapters/slurm.rb +16 -5
- data/lib/ood_core/job/adapters/torque/attributes.rb +109 -0
- data/lib/ood_core/job/adapters/torque/batch.rb +470 -0
- data/lib/ood_core/job/adapters/torque/error.rb +403 -0
- data/lib/ood_core/job/adapters/torque/ffi.rb +430 -0
- data/lib/ood_core/job/adapters/torque.rb +23 -18
- data/lib/ood_core/job/status.rb +3 -13
- data/lib/ood_core/refinements/drmaa_extensions.rb +21 -0
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +3 -3
- metadata +23 -9
data/lib/ood_core/job/adapters/sge.rb (new file):

```diff
@@ -0,0 +1,163 @@
+require "ood_core/refinements/array_extensions"
+require "ood_core/refinements/hash_extensions"
+
+module OodCore
+  module Job
+    class Factory
+      using Refinements::HashExtensions
+
+      # Build the Sun Grid Engine adapter from a configuration
+      # @param config [#to_h] the configuration for job adapter
+      # @option config [Object] :cluster (nil) The cluster to communicate with
+      # @option config [Object] :conf (nil) Path to the SGE conf
+      # @option config [Object] :bin (nil) Path to SGE client binaries
+      # @option config [Object] :sge_root (nil) Path to SGE root, note that
+      #   this may be nil, but must be set to use the DRMAA API, and there is a
+      #   severe performance penalty calling Sge#info without using DRMAA.
+      # @option config [#to_h] :bin_overrides ({}) Optional overrides to SGE client executables
+      def self.build_sge(config)
+        batch = Adapters::Sge::Batch.new(config.to_h.symbolize_keys)
+        Adapters::Sge.new(batch: batch)
+      end
+    end
+
+    module Adapters
+      class Sge < Adapter
+        using Refinements::HashExtensions
+        using Refinements::ArrayExtensions
+
+        require "ood_core/job/adapters/sge/batch"
+        require "ood_core/job/adapters/sge/helper"
+
+        # The cluster of the Sun Grid Engine batch server
+        # @example UCLA's hoffman2 cluster
+        #   my_batch.cluster #=> "hoffman2"
+        # @return [String, nil] the cluster name
+        attr_reader :cluster
+
+        # The path to the Sun Grid Engine configuration file
+        # @example For Sun Grid Engine 8.0.1
+        #   my_batch.conf.to_s #=> "/u/systems/UGE8.0.1vm/h2.conf"
+        # @return [Pathname, nil] path to gridengine conf
+        attr_reader :conf
+
+        # The path to the Sun Grid Engine client installation binaries
+        # @example For Sun Grid Engine 8.0.1
+        #   my_batch.bin.to_s #=> "/u/systems/UGE8.0.1vm/bin/lx-amd64/"
+        # @return [Pathname] path to SGE binaries
+        attr_reader :bin
+
+        # The root exception class that all Sun Grid Engine-specific
+        # exceptions inherit from
+        class Error < StandardError; end
+
+        # @param batch [Adapters::Sge::Batch]
+        def initialize(batch:)
+          @batch = batch
+          @helper = Sge::Helper.new
+        end
+
+        # Submit a job with the attributes defined in the job template instance
+        # @example Submit job template to cluster
+        #   solver_id = job_adapter.submit(solver_script)
+        #   #=> "1234.server"
+        # @example Submit job that depends on previous job
+        #   post_id = job_adapter.submit(
+        #     post_script,
+        #     afterok: solver_id
+        #   )
+        #   #=> "1235.server"
+        # @param script [Script] script object that describes the
+        #   script and attributes for the submitted job
+        # @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
+        #   at any point after dependent jobs have started execution
+        # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution only after dependent jobs have terminated with no errors
+        # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution only after dependent jobs have terminated with errors
+        # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution after dependent jobs have terminated
+        # @raise [JobAdapterError] if something goes wrong submitting a job
+        # @return [String] the job id returned after successfully submitting a job
+        def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+          # SGE supports job dependencies on job completion
+          args = @helper.batch_submit_args(script, after: after, afterok: afterok, afternotok: afternotok, afterany: afterany)
+
+          @batch.submit(script.content, args)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve info for all jobs from the resource manager
+        # @return [Array<Info>] information describing submitted jobs
+        def info_all
+          @batch.get_all
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve info for all jobs for a given owner or owners from the
+        # resource manager
+        # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+        # @raise [JobAdapterError] if something goes wrong getting job info
+        # @return [Array<Info>] information describing submitted jobs
+        def info_where_owner(owner)
+          owner = Array.wrap(owner).map(&:to_s).join(',')
+          @batch.get_all(owner: owner)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve job info from the resource manager
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong getting job info
+        # @return [Info] information describing submitted job
+        def info(id)
+          @batch.get_info_enqueued_job(id)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve job status from resource manager
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong getting the status of a job
+        # @return [Status] status of job
+        def status(id)
+          info(id).status
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Put the submitted job on hold
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong holding a job
+        # @return [void]
+        def hold(id)
+          @batch.hold(id.to_s)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Release the job that is on hold
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong releasing a job
+        # @return [void]
+        def release(id)
+          @batch.release(id.to_s)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Delete the submitted job
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong deleting a job
+        # @return [void]
+        def delete(id)
+          @batch.delete(id.to_s)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+      end
+    end
+  end
+end
```
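The new adapter is built through `Factory.build_sge` and then driven entirely through the generic adapter interface shown above. A minimal usage sketch, assuming a plain Hash configuration (the cluster name and installation paths are illustrative placeholders, not defaults shipped with the gem):

```ruby
require "ood_core"

# Hypothetical SGE configuration; all paths below are placeholders
sge = OodCore::Job::Factory.build_sge(
  cluster:  "hoffman2",
  bin:      "/u/systems/UGE8.0.1vm/bin/lx-amd64",
  conf:     "/u/systems/UGE8.0.1vm/h2.conf",
  sge_root: "/u/systems/UGE8.0.1vm"  # omit to skip the DRMAA code path (slower Sge#info)
)

# Submit a script, then chain a post-processing job on successful completion
solver    = OodCore::Job::Script.new(content: File.read("solver.sh"), queue_name: "all.q")
solver_id = sge.submit(solver)

post    = OodCore::Job::Script.new(content: File.read("post.sh"))
post_id = sge.submit(post, afterok: solver_id)

sge.status(post_id)   # => an OodCore::Job::Status for the job
sge.delete(post_id)   # remove the job if it is no longer wanted
```

Every call funnels through `Sge::Batch` and re-raises `Batch::Error` as `JobAdapterError`, so callers only need to rescue the latter.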
data/lib/ood_core/job/adapters/slurm.rb:

```diff
@@ -1,5 +1,6 @@
 require "time"
 require "ood_core/refinements/hash_extensions"
+require "ood_core/job/adapters/helper"
 
 module OodCore
   module Job
@@ -11,12 +12,14 @@ module OodCore
       # @option config [Object] :cluster (nil) The cluster to communicate with
       # @option config [Object] :conf (nil) Path to the slurm conf
       # @option config [Object] :bin (nil) Path to slurm client binaries
+      # @option config [#to_h] :bin_overrides ({}) Optional overrides to Slurm client executables
       def self.build_slurm(config)
         c = config.to_h.symbolize_keys
         cluster = c.fetch(:cluster, nil)
         conf    = c.fetch(:conf, nil)
         bin     = c.fetch(:bin, nil)
-        slurm = Adapters::Slurm::Batch.new(cluster: cluster, conf: conf, bin: bin)
+        bin_overrides = c.fetch(:bin_overrides, {})
+        slurm = Adapters::Slurm::Batch.new(cluster: cluster, conf: conf, bin: bin, bin_overrides: bin_overrides)
         Adapters::Slurm.new(slurm: slurm)
       end
     end
@@ -48,6 +51,12 @@ module OodCore
         # @return [Pathname] path to slurm binaries
         attr_reader :bin
 
+        # Optional overrides for Slurm client executables
+        # @example
+        #   {'sbatch' => '/usr/local/bin/sbatch'}
+        # @return Hash<String, String>
+        attr_reader :bin_overrides
+
         # The root exception class that all Slurm-specific exceptions inherit
         # from
         class Error < StandardError; end
@@ -55,10 +64,11 @@ module OodCore
         # @param cluster [#to_s, nil] the cluster name
         # @param conf [#to_s, nil] path to the slurm conf
         # @param bin [#to_s] path to slurm installation binaries
-        def initialize(cluster: nil, bin: nil, conf: nil)
+        def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {})
           @cluster = cluster && cluster.to_s
           @conf    = conf && Pathname.new(conf.to_s)
           @bin     = Pathname.new(bin.to_s)
+          @bin_overrides = bin_overrides
         end
 
         # Get a list of hashes detailing each of the jobs on the batch server
@@ -140,7 +150,7 @@ module OodCore
         private
           # Call a forked Slurm command for a given cluster
           def call(cmd, *args, env: {}, stdin: "")
-            cmd = bin.join(cmd.to_s).to_s
+            cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
            args = args.map(&:to_s)
             args += ["-M", cluster] if cluster
             env = env.to_h
@@ -150,11 +160,11 @@ module OodCore
           end
 
           # Fields requested from a formatted `squeue` call
+          # Note that the order of these fields is important
          def fields
             {
               account: "%a",
               job_id: "%A",
-              gres: "%b",
               exec_host: "%B",
               min_cpus: "%c",
               cpus: "%C",
@@ -200,7 +210,8 @@ module OodCore
               nice: "%y",
               scheduled_nodes: "%Y",
               sockets_cores_threads: "%z",
-              work_dir: "%Z"
+              work_dir: "%Z",
+              gres: "%b", # must come at the end to fix a bug with Slurm 18
             }
           end
         end
```
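The practical effect of `:bin_overrides` is that individual Slurm client commands can be redirected (for example through a site wrapper script) while everything else still resolves against `:bin`, with `Helper.bin_path(cmd, bin, bin_overrides)` doing the per-command lookup inside `call`. A sketch of a configuration that might use it; the cluster name and wrapper path are placeholders:

```ruby
require "ood_core"

slurm = OodCore::Job::Factory.build_slurm(
  cluster: "my_cluster",
  bin: "/usr/bin",
  bin_overrides: { "sbatch" => "/opt/site/bin/sbatch-wrapper" }
)

# squeue, scancel, etc. still run from /usr/bin, but submissions go through
# the wrapper because Batch#call now resolves each command with
# Helper.bin_path(cmd, bin, bin_overrides) before forking it.
job_id = slurm.submit(OodCore::Job::Script.new(content: "#!/bin/bash\nsrun hostname"))
info   = slurm.info(job_id)
```

Note that `fields` builds the format string for the formatted `squeue` call, which is why the new comment stresses ordering; moving `gres: "%b"` to the end changes only the position of that column in the format string, not the keys of the parsed records.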
data/lib/ood_core/job/adapters/torque/attributes.rb (new file):

```diff
@@ -0,0 +1,109 @@
+class OodCore::Job::Adapters::Torque
+  # Maintains a constant Hash of defined PBS attribute types
+  # Includes:
+  #   Attribute names used by user commands
+  #   Additional job and general attribute names
+  #   Additional queue attribute names
+  #   Additional server attribute names
+  #   Additional node attribute names
+  ATTR = {
+    # Attribute names used by user commands
+    a: :Execution_Time,
+    c: :Checkpoint,
+    e: :Error_Path,
+    f: :fault_tolerant,
+    g: :group_list,
+    h: :Hold_Types,
+    j: :Join_Path,
+    k: :Keep_Files,
+    l: :Resource_List,
+    m: :Mail_Points,
+    o: :Output_Path,
+    p: :Priority,
+    q: :destination,
+    r: :Rerunable,
+    t: :job_array_request,
+    array_id: :job_array_id,
+    u: :User_List,
+    v: :Variable_List,
+    A: :Account_Name,
+    args: :job_arguments,
+    reservation_id: :reservation_id,
+    login_node_id: :login_node_id,
+    login_prop: :login_property,
+    external_nodes: :external_nodes,
+    multi_req_alps: :multi_req_alps,
+    M: :Mail_Users,
+    N: :Job_Name,
+    S: :Shell_Path_List,
+    depend: :depend,
+    inter: :interactive,
+    stagein: :stagein,
+    stageout: :stageout,
+    jobtype: :jobtype,
+    submit_host: :submit_host,
+    init_work_dir: :init_work_dir,
+
+    # Additional job and general attribute names
+    ctime: :ctime,
+    exechost: :exec_host,
+    execport: :exec_port,
+    mtime: :mtime,
+    qtime: :qtime,
+    session: :session_id,
+    euser: :euser,
+    egroup: :egroup,
+    hashname: :hashname,
+    hopcount: :hop_count,
+    security: :security,
+    sched_hint: :sched_hint,
+    substate: :substate,
+    name: :Job_Name,
+    owner: :Job_Owner,
+    used: :resources_used,
+    state: :job_state,
+    queue: :queue,
+    server: :server,
+    maxrun: :max_running,
+    maxreport: :max_report,
+    total: :total_jobs,
+    comment: :comment,
+    cookie: :cookie,
+    qrank: :queue_rank,
+    altid: :alt_id,
+    etime: :etime,
+    exitstat: :exit_status,
+    forwardx11: :forward_x11,
+    submit_args: :submit_args,
+    tokens: :tokens,
+    netcounter: :net_counter,
+    umask: :umask,
+    start_time: :start_time,
+    start_count: :start_count,
+    checkpoint_dir: :checkpoint_dir,
+    checkpoint_name: :checkpoint_name,
+    checkpoint_time: :checkpoint_time,
+    checkpoint_restart_status: :checkpoint_restart_status,
+    restart_name: :restart_name,
+    comp_time: :comp_time,
+    reported: :reported,
+    intcmd: :inter_cmd,
+    job_radix: :job_radix,
+    sister_list: :sister_list,
+    total_runtime: :total_runtime,
+    P: :proxy_user,
+    node_exclusive: :node_exclusive,
+    exec_gpus: :exec_gpus,
+    exec_mics: :exec_mics,
+    J: :job_id,
+    pagg: :pagg_id,
+    system_start_time: :system_start_time,
+    gpu_flags: :gpu_flags,
+
+    # Additional queue attribute names
+
+    # Additional server attribute names
+
+    # Additional node attribute names
+  }
+end
```
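The new `ATTR` constant is a lookup table from short, qsub-style keys to the full PBS attribute names used by the Torque C library, presumably consumed by the new `Torque::Batch` and FFI layers when building attribute lists. A small illustration of the mapping (hypothetical usage, not code from the gem):

```ruby
Torque = OodCore::Job::Adapters::Torque

Torque::ATTR[:N]  #=> :Job_Name
Torque::ATTR[:l]  #=> :Resource_List
Torque::ATTR[:o]  #=> :Output_Path

# Translating a handful of qsub-style flags into PBS attribute names:
{ N: "solver", o: "/tmp/solver.out" }.map { |flag, value| [Torque::ATTR[flag], value] }.to_h
#=> {:Job_Name=>"solver", :Output_Path=>"/tmp/solver.out"}
```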