ood_core 0.27.1 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ood_core/batch_connect/template.rb +3 -2
- data/lib/ood_core/batch_connect/templates/vnc.rb +1 -1
- data/lib/ood_core/batch_connect/templates/vnc_container.rb +1 -1
- data/lib/ood_core/job/adapter.rb +12 -0
- data/lib/ood_core/job/adapters/coder/batch.rb +170 -0
- data/lib/ood_core/job/adapters/coder/coder_job_info.rb +8 -0
- data/lib/ood_core/job/adapters/coder.rb +120 -0
- data/lib/ood_core/job/adapters/htcondor.rb +549 -0
- data/lib/ood_core/job/adapters/psij/delete.py +18 -0
- data/lib/ood_core/job/adapters/psij/get_info.py +55 -0
- data/lib/ood_core/job/adapters/psij/hold.py +18 -0
- data/lib/ood_core/job/adapters/psij/release.py +18 -0
- data/lib/ood_core/job/adapters/psij/submit.py +28 -0
- data/lib/ood_core/job/adapters/psij.rb +410 -0
- data/lib/ood_core/job/adapters/slurm.rb +133 -3
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +1 -1
- metadata +14 -4
data/lib/ood_core/job/adapters/htcondor.rb
@@ -0,0 +1,549 @@
+require "time"
+require 'etc'
+require 'tempfile'
+require "ood_core/refinements/hash_extensions"
+require "ood_core/refinements/array_extensions"
+require "ood_core/job/adapters/helper"
+
+module OodCore
+  module Job
+    class Factory
+      using Refinements::HashExtensions
+
+      # Build the HTCondor adapter from a configuration
+      # @param config [#to_h] the configuration for job adapter
+      # @option config [Object] :bin (nil) Path to HTCondor client binaries
+      # @option config [Object] :submit_host ("") Submit job on login node via ssh
+      # @option config [Object] :strict_host_checking (true) Whether to use strict host checking when ssh to submit_host
+      def self.build_htcondor(config)
+        c = config.to_h.symbolize_keys
+        bin = c.fetch(:bin, nil)
+        bin_overrides = c.fetch(:bin_overrides, {})
+        submit_host = c.fetch(:submit_host, "")
+        strict_host_checking = c.fetch(:strict_host_checking, true)
+        default_universe = c.fetch(:default_universe, "vanilla")
+        default_docker_image = c.fetch(:default_docker_image, "ubuntu:latest")
+        user_group_map = c.fetch(:user_group_map, nil)
+        cluster = c.fetch(:cluster, "")
+        additional_attributes = c.fetch(:additional_attributes, {})
+        htcondor = Adapters::HTCondor::Batch.new(bin: bin, bin_overrides: bin_overrides,
+          submit_host: submit_host, strict_host_checking: strict_host_checking,
+          default_universe: default_universe,
+          default_docker_image: default_docker_image,
+          user_group_map: user_group_map,
+          cluster: cluster,
+          additional_attributes: additional_attributes,
+        )
+        Adapters::HTCondor.new(htcondor: htcondor)
+      end
+    end
+
+    module Adapters
+      # An adapter object that describes the communication with an HTCondor
+      # resource manager for job management.
+      class HTCondor < Adapter
+        using Refinements::HashExtensions
+        using Refinements::ArrayExtensions
+
+        # Object used for simplified communication with an HTCondor batch server
+        # @api private
+        class Batch
+          # The path to the HTCondor client installation binaries
+          # @return [Pathname] path to HTCondor binaries
+          attr_reader :bin
+
+          # The path to the HTCondor client installation binaries that override
+          # the default binaries
+          # @return [Pathname] path to HTCondor binaries overrides
+          attr_reader :bin_overrides
+
+          # The login node where the job is submitted via ssh
+          # @return [String] The login node
+          attr_reader :submit_host
+
+          # Whether to use strict host checking when ssh to submit_host
+          # @return [Bool]; true if empty
+          attr_reader :strict_host_checking
+
+          # Default universe for jobs submitted to HTCondor
+          # @return [String] the default universe for jobs
+          attr_reader :default_universe
+
+          # Default docker image for jobs submitted to HTCondor
+          # @return [String] the default docker image for jobs
+          attr_reader :default_docker_image
+
+          # A path to the user/group map for HTCondor jobs
+          # The format in the file should adhere to the format used by [AssignAccountingGroup](https://htcondor.readthedocs.io/en/latest/admin-manual/introduction-to-configuration.html#FEATURE:ASSIGNACCOUNTINGGROUP)
+          # @return [String,nil] the path to the user/group map file
+          attr_reader :user_group_map
+
+          # The cluster name for this HTCondor instance
+          # @return [String] the cluster name
+          attr_reader :cluster
+
+          # Additional attributes to be added to the job submission
+          # @return [Hash{#to_s => #to_s}] additional attributes to be added to the job submission
+          attr_reader :additional_attributes
+
+          # The version of HTCondor on the submit_host
+          # @return [Gem::Version] the version of HTCondor
+          attr_reader :version
+
+          # The root exception class that all HTCondor-specific exceptions inherit
+          # from
+          class Error < StandardError; end
+
+          # @param bin [#to_s] path to HTCondor installation binaries
+          # @param submit_host [#to_s] Submits the job on a login node via ssh
+          # @param strict_host_checking [Bool] Whether to use strict host checking when ssh to submit_host
+          def initialize(bin: nil, bin_overrides: {}, submit_host: "", strict_host_checking: false, default_universe: "vanilla", default_docker_image: "ubuntu:latest", user_group_map: nil, cluster: "", additional_attributes: {})
+            @bin = Pathname.new(bin.to_s)
+            @bin_overrides = bin_overrides
+            @submit_host = submit_host.to_s
+            @strict_host_checking = strict_host_checking
+            @default_universe = default_universe.to_s
+            @default_docker_image = default_docker_image.to_s
+            @user_group_map = user_group_map.to_s unless user_group_map.nil?
+            @cluster = cluster.to_s
+            @additional_attributes = additional_attributes
+            @version = get_htcondor_version
+          end
+
+          # Submit a script to the batch server
+          # @param args [Array<#to_s>] arguments passed to `condor_submit` command
+          # @param env [Hash{#to_s => #to_s}] environment variables set
+          # @param script [String] the script to submit
+          # @raise [Error] if `condor_submit` command exited unsuccessfully
+          # @return [String] the id of the job that was created
+          def submit_string(args: [], script_args: [], env: {}, script: "")
+            args = args.map(&:to_s)
+            script_args = script_args.map(&:to_s).map { |s| s.to_s.gsub('"', "'") } # cannot do double
+            env = env.to_h.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
+
+            path = "#{Dir.tmpdir}/htcondor_submit_#{SecureRandom.uuid}"
+
+            call("bash", "-c", "cat > #{path}", stdin: script)
+            output = call("condor_submit", *args, env: env, stdin: "arguments=#{path.split("/").last} #{script_args.join(" ")}\ntransfer_input_files=#{path}").strip
+
+            match = output.match(/(cluster )?(\d+)/)
+            raise Error, "Failed to parse job ID from output: #{output}" unless match
+            match[2]
+
+          end
+
+          # Run the `condor_rm` command to remove a job
+          # @param id [#to_s] the id of the job to remove
+          # @raise [Error] if `condor_rm` command exited unsuccessfully
+          def remove_job(id)
+            call("condor_rm", id.to_s)
+          rescue Error => e
+            raise Error, "Failed to remove job #{id}: #{e.message}"
+          end
+
+          # Place a job on hold using `condor_hold`
+          # @param id [#to_s] the id of the job to hold
+          # @raise [Error] if `condor_hold` command exited unsuccessfully
+          def hold_job(id)
+            id = id.to_s
+            call("condor_hold", id)
+          rescue Error => e
+            raise Error, "Failed to hold job #{id}: #{e.message}"
+          end
+
+          # Release a job from hold using `condor_release`
+          # @param id [#to_s] the id of the job to release
+          # @raise [Error] if `condor_release` command exited unsuccessfully
+          def release_job(id)
+            id = id.to_s
+            call("condor_release", id)
+          rescue Error => e
+            raise Error, "Failed to release job #{id}: #{e.message}"
+          end
+
+          def condor_q_attrs
+            {
+              id: "ClusterId",
+              sub_id: "ProcId",
+              status: "JobStatus",
+              owner: "Owner",
+              acct_group: "AcctGroup",
+              job_name: "JobBatchName",
+              procs: "CpusProvisioned",
+              gpus: "GpusProvisioned",
+              submission_time: "QDate",
+              dispatch_time: "JobCurrentStartDate",
+              sys_cpu_time: "RemoteSysCpu",
+              user_cpu_time: "RemoteUserCpu",
+              wallclock_time: "RemoteWallClockTime"
+            }
+          end
+
+          # Retrieve job information using `condor_q`
+          # @param id [#to_s] the id of the job
+          # @param owner [String] the owner(s) of the job
+          # @raise [Error] if `condor_q` command exited unsuccessfully
+          # @return [Array<Hash>] list of details for jobs
+          def get_jobs(id: "", owner: nil)
+            args = []
+            unless id.to_s.empty?
+              if id.to_s.include?(".") # if id is a job array, we need to use the ClusterId and ProcId
+                cluster_id, proc_id = id.to_s.split(".")
+                args.concat ["-constraint", "\"ClusterId == #{cluster_id} && ProcId == #{proc_id}\""]
+              else # if id is a single job, we can just use the ClusterId
+                args.concat ["-constraint", "\"ClusterId == #{id}\""]
+              end
+            end
+            args.concat ["-constraint", "\"Owner == #{owner}\""] unless owner.to_s.empty?
+            args.concat ["-af", *condor_q_attrs.values]
+
+            output = call("condor_q", *args)
+            parse_condor_q_output(output)
+          end
+
+          # Retrieve slot information using `condor_status`
+          # @param owner [String] the owner(s) of the slots
+          # @raise [Error] if `condor_status` command exited unsuccessfully
+          # @return [Array<Hash>] list of details for slots
+          def get_slots
+            args = ["-af", "Machine", "TotalSlotCPUs", "TotalSlotGPUs", "TotalSlotMemory", "CPUs", "GPUs", "Memory", "NumDynamicSlots"]
+            args.concat ["-constraint", "\"DynamicSlot is undefined\""]
+
+            output = call("condor_status", *args)
+            parse_condor_status_output(output)
+          end
+
+
+          # Retrieve accounts using user_group_map on @submit_host
+          # @return [Hash{String => Array<String>}] mapping of usernames to their groups
+          def get_accounts
+            raise Error, "user_group_map is not defined" if user_group_map.nil? || user_group_map.empty?
+
+            # Retrieve accounts, use local file, if exists. Otherwise use from submit_host
+            if File.exist?(user_group_map) && File.readable?(user_group_map)
+              output = File.read(user_group_map)
+            else
+              output = call("cat", user_group_map)
+            end
+            accounts = {}
+            output.each_line do |line|
+              next if line.strip.empty? || line.start_with?("#") # Skip empty lines and comments
+              _, username, groups = line.strip.split(/\s+/, 3)
+              accounts[username] = groups.split(",") if username && groups
+            end
+
+            accounts
+          rescue Error => e
+            raise Error, "Failed to retrieve accounts: #{e.message}"
+          end
+
+          private
+
+          # Parse the output of `condor_q` into a list of job hashes
+          def parse_condor_q_output(output)
+            jobs = []
+            fields = condor_q_attrs
+            output.each_line do |line|
+              # Parse each line into a hash
+              job_data = line.split
+              job = Hash[fields.keys.zip(job_data)]
+              job[:submit_host] = @submit_host # Add submit host to job data
+              job[:native] = job_data # Add native attributes to job data
+              jobs << job
+            end
+            jobs
+          end
+
+          # Parse the output of `condor_status` into a list of slot hashes
+          def parse_condor_status_output(output)
+            slots = []
+            output.each_line do |line|
+              # Parse each line into a hash (custom parsing logic for HTCondor slots)
+              slot_data = line.split
+              slots << { machine: slot_data[0], total_cpus: slot_data[1].to_i, total_gpus: slot_data[2].to_i, total_memory: slot_data[3].to_i,
+                cpus: slot_data[4].to_i, gpus: slot_data[5].to_i, memory: slot_data[6].to_i,
+                num_dynamic_slots: slot_data[7].to_i }
+            end
+            slots
+          end
+
+          # Call a forked HTCondor command
+          def call(cmd, *args, env: {}, stdin: "")
+            cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
+            args = args.map(&:to_s)
+
+            cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
+            o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
+            s.success? ? o : raise(Error, e)
+          end
+
+          def get_htcondor_version
+            output = call("condor_version")
+            match = output.match(/CondorVersion: (\d+\.\d+\.\d+)/)
+            raise Error, "Failed to parse HTCondor version from output: #{output}" unless match
+            Gem::Version.new(match[1])
+          end
+        end
+
+        # Map HTCondor job statuses to symbols
+        STATUS_MAP = {
+          "1" => :queued,
+          "2" => :running,
+          "3" => :running,
+          "4" => :completed,
+          "5" => :queued_held,
+          "6" => :running,
+          "7" => :suspended
+        }.freeze
+
+        # @api private
+        # @param opts [#to_h] the options defining this adapter
+        # @option opts [Batch] :htcondor The HTCondor batch object
+        # @see Factory.build_htcondor
+        def initialize(opts = {})
+          o = opts.to_h.symbolize_keys
+
+          @htcondor = o.fetch(:htcondor) { raise ArgumentError, "No HTCondor object specified. Missing argument: htcondor" }
+        end
+
+        # Submit a job with the attributes defined in the job template instance
+        # @param script [Script] script object that describes the script and
+        # attributes for the submitted job
+        # @raise [JobAdapterError] if something goes wrong submitting a job
+        # @return [String] the job id returned after successfully submitting a
+        # job
+        def submit(script)
+          args = []
+          args.concat ["-batch-name", "#{script.job_name}"] unless script.job_name.nil?
+          args.concat ["-name", "#{script.queue_name}"] unless script.queue_name.nil?
+          args.concat ["-a", "priority=#{script.priority}"] unless script.priority.nil?
+          args.concat ["-a", "accounting_group=#{script.accounting_id}"] unless script.accounting_id.nil?
+
+          args.concat ["-a", "submit_as_hold=#{script.hold}"] unless script.submit_as_hold.nil?
+          args.concat ["-a", "max_retries=0"] unless !script.rerunnable.nil? && script.rerunnable
+
+          args.concat ["-a", "allowed_execute_duration=#{script.wall_time}"] unless script.wall_time.nil?
+          args.concat ["-a", "periodic_remove='HoldReasonCode == 47'"] unless script.wall_time.nil?
+          args.concat ["-a", "deferral_time=#{script.start_time.tv_sec}"] unless script.start_time.nil?
+
+          args.concat ["-a", "request_cpus=#{script.cores}"] unless script.cores.nil?
+          # requesting 1GB of memory per core seems reasonable
+          args.concat ["-a", "request_memory=#{script.cores * 1024}"] unless script.native.include?(:request_memory) && !script.native[:request_memory].nil?
+          args.concat ["-a", "request_gpus=#{script.gpus_per_node}"] unless script.gpus_per_node.nil?
+
+          universe = script.native[:universe] || @htcondor.default_universe
+          args.concat ["-a", "universe=#{universe}"]
+          container_image = script.native[:docker_image] || @htcondor.default_docker_image
+          if universe == "docker" then
+            args.concat ["-a", "docker_image=#{@htcondor.default_docker_image}"] unless script.native.include?(:docker_image) && !script.native[:docker_image].nil?
+          elsif universe == "container" then
+            script.native.delete(:docker_image) unless !script.native.include?(:docker_image)
+            script.native[:container_image] = container_image
+          end
+
+          args.concat ["-a", "input=#{script.input_path}"] unless script.input_path.nil?
+          if script.output_path.nil? then args.concat ["-a", "output=output.txt"] else args.concat ["-a", "output=#{script.output_path}"] end
+          if script.error_path.nil? then args.concat ["-a", "error=error.txt"] else args.concat ["-a", "error=#{script.error_path}"] end
+          if script.workdir.nil? then args.concat ["-a", "log=job.log"] else args.concat ["-a", "log=#{script.workdir}/job.log"] end
+
+          args.concat ["-a", "initialdir=#{script.workdir}"] unless script.workdir.nil?
+          args.concat ["-a", "\"environment=\\\"#{script.job_environment.to_a.map { |k, v| "#{k}='#{v.gsub("'", "''").gsub('"', "\\\"\\\"")}'" }.join(' ')}\\\"\""] unless script.job_environment.nil? || script.job_environment.empty?
+          args.concat ["-a", "getenv=#{script.copy_environment}"] unless script.copy_environment.nil?
+
+          args.concat ["-a", "should_transfer_files=true"]
+          args.concat ["-a", "+OpenOnDemand=true"]
+
+          # send email when started / terminated
+          if script.email_on_started && script.email_on_terminated then
+            raise JobAdapterError, "Cannot handle both email_on_started and email_on_terminated set to true" if script.email_on_started && script.email_on_terminated
+            # args.concat ["-a", "notification=Always"] # might be supported in the future?
+          elsif script.email_on_started then
+            if @htcondor.version >= Gem::Version.new("24.10.0") then
+              args.concat ["-a", "notification=Start"]
+            else
+              raise JobAdapterError, "Email notification on job start is not supported by this HTCondor version. Please upgrade to 24.10.0 or later."
+            end
+          elsif script.email_on_terminated then
+            args.concat ["-a", "notification=Complete"]
+          else
+            args.concat ["-a", "notification=Never"]
+          end
+          args.concat ["-a", "notify_user=#{script.email}"] unless script.email.nil?
+
+          args.concat @htcondor.additional_attributes.to_a.map { |k, v| "-a #{k}=#{v}" } unless @htcondor.additional_attributes.nil? || @htcondor.additional_attributes.empty?
+          args.concat script.native.to_a.map { |k, v| "-a #{k}=#{v}" } unless script.native.nil? || script.native.empty?
+
+          content = script.content
+
+          # Set executable to some shell to execute the script
+          if script.shell_path.nil?
+            args.concat ["-a", "executable=/bin/bash"]
+          else
+            args.concat ["-a", "executable=#{script.shell_path}"]
+          end
+
+          # terse to shut up the output, - to get the script arguments from stdin.
+          args.concat ["-terse", "-"]
+
+          if script.job_array_request.nil?
+            # If no job array request is specified, we submit a single job
+            args.concat ["-queue", "1"]
+          else
+            # If a job array request is specified, we submit a job array
+            # The job array request is expected to be a string like "1-10" or "1,2,3"
+            # we must convert 1-3 to 1,2,3.
+            if script.job_array_request.include?("-")
+              start, finish = script.job_array_request.split("-").map(&:to_i)
+              job_ids = (start..finish).to_a.join(",")
+            else
+              job_ids = script.job_array_request
+            end
+            # Generate multiple jobs in the job array by setting OODArrayId to the requested array ids
+            # While -queue 10 would generate 10 jobs, the ProcId would always be 0-9, not 1-10 - or whatever the request is.
+            # So we set the OODArrayId to the requested job ids.
+            args.concat ["-queue", "1", "+OODArrayId", "in", job_ids.to_s]
+          end
+
+          script_args = script.args || []
+
+          @htcondor.submit_string(args: args, script_args: script_args, script: content)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve job info from the resource manager
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong getting job info
+        # @return [Info] information describing submitted job
+        def info(id)
+          id = id.to_s
+          jobs = @htcondor.get_jobs(id: id)
+          jobs.empty? ? Info.new(id: id, status: :completed) : parse_job_info(jobs.first)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve information for all jobs
+        # @raise [JobAdapterError] if something goes wrong retrieving job info
+        # @return [Array<Info>] list of information describing submitted jobs
+        def info_all(attrs: nil)
+          jobs = @htcondor.get_jobs
+          jobs.map { |job| parse_job_info(job) }
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve the status of a job
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong retrieving the job status
+        # @return [Symbol] the status of the job
+        def status(id)
+          id = id.to_s
+          jobs = @htcondor.get_jobs(id: id)
+          jobs.empty? ? :completed : get_state(jobs.first[:status])
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve cluster status information
+        # @raise [JobAdapterError] if something goes wrong retrieving cluster status
+        # @return [Hash] summary of cluster status including active and total nodes, processors, GPUs, etc.
+        def cluster_info
+          slots = @htcondor.get_slots
+          active_nodes = slots.count { |slot| slot[:num_dynamic_slots] > 0 }
+          total_nodes = slots.map { |slot| slot[:machine] }.uniq.count
+          active_processors = slots.sum { |slot| slot[:total_cpus] - slot[:cpus] }
+          total_processors = slots.sum { |slot| slot[:total_cpus] }
+          active_gpus = slots.sum { |slot| slot[:total_gpus] - slot[:gpus] }
+          total_gpus = slots.sum { |slot| slot[:total_gpus] }
+
+          ClusterInfo.new({
+            active_nodes: active_nodes,
+            total_nodes: total_nodes,
+            active_processors: active_processors,
+            total_processors: total_processors,
+            active_gpus: active_gpus,
+            total_gpus: total_gpus
+          })
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Indicate that the job adapter supports job arrays
+        def supports_job_arrays?
+          true
+        end
+
+        # Place a job on hold
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong placing the job on hold
+        def hold(id)
+          @htcondor.hold_job(id)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Release a job from hold
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong releasing the job
+        def release(id)
+          @htcondor.release_job(id)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+        # Delete a job
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong deleting the job
+        def delete(id)
+          @htcondor.remove_job(id)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve the relevant groups for the current user
+        # @return [Array<AccountInfo>] list of groups for the current user
+        def accounts
+          username = Etc.getlogin
+          groups = @htcondor.get_accounts[username]
+          parse_group_into_account_info(groups)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        private
+
+        def get_state(st)
+          STATUS_MAP.fetch(st.to_s, :undetermined)
+        end
+
+        # Parse hash describing HTCondor job status
+        def parse_job_info(job)
+          Info.new(
+            id: job[:id].to_s + (job[:sub_id].to_s.empty? ? "" : ".#{job[:sub_id]}"),
+            status: get_state(job[:status]),
+            job_name: job[:job_name],
+            job_owner: job[:owner],
+            accounting_id: job[:acct_group],
+            submit_host: job[:submit_host],
+            procs: job[:procs].to_i,
+            gpus: job[:gpus].to_i,
+            submission_time: Time.at(job[:submission_time].to_i),
+            dispatch_time: Time.at(job[:dispatch_time].to_i),
+            cpu_time: job[:sys_cpu_time].to_i + job[:user_cpu_time].to_i,
+            wallclock_time: job[:wallclock_time].to_i,
+            native: job[:native],
+
+          )
+        end
+
+        # Parse group information into AccountInfo objects
+        # @param groups [Array<String>] list of group names
+        # @return [Array<AccountInfo>] list of AccountInfo objects
+        def parse_group_into_account_info(groups)
+          groups.map { |group| AccountInfo.new(name: group, cluster: @htcondor.cluster) }
+        end
+
+      end
+    end
+  end
+end
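For orientation, a minimal usage sketch (not part of the released diff above): it builds the new HTCondor adapter through the Factory.build_htcondor shown above and submits a trivial script through it. The config keys mirror the c.fetch(...) calls in build_htcondor; the binary path, cluster name, and native attributes are illustrative placeholders, not values shipped with the gem.

  require "ood_core"

  # Keys correspond to the c.fetch(...) calls in Factory.build_htcondor above.
  adapter = OodCore::Job::Factory.build_htcondor({
    bin:                  "/usr/bin",         # placeholder path to the condor_* binaries
    submit_host:          "",                 # empty string runs commands locally rather than over ssh
    default_universe:     "vanilla",
    default_docker_image: "ubuntu:latest",
    cluster:              "htcondor_cluster"  # placeholder cluster name
  })

  # Script attributes are translated into condor_submit arguments by HTCondor#submit.
  script = OodCore::Job::Script.new(
    content:   "#!/bin/bash\nhostname\n",
    job_name:  "example",
    wall_time: 3600,
    native:    { universe: "vanilla" }        # :universe / :docker_image are read in #submit
  )

  job_id = adapter.submit(script)
  puts adapter.status(job_id)                 # => :queued, :running, :completed, ...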
data/lib/ood_core/job/adapters/psij/delete.py
@@ -0,0 +1,18 @@
+import argparse
+
+parser = argparse.ArgumentParser(description="Process job parameters")
+parser.add_argument("--id", type=str, required=True, help="Path to the job script")
+parser.add_argument("--executor", type=str, required=True, help="Executor to be used")
+
+args = parser.parse_args()
+
+from psij import Job, JobExecutor
+
+ex = JobExecutor.get_instance(args.executor)
+job = Job()
+job._native_id = args.id
+# catch exception
+try:
+    ex.cancel(job)
+except Exception as e:
+    print(f"Invalid job id specified")
data/lib/ood_core/job/adapters/psij/get_info.py
@@ -0,0 +1,55 @@
+import argparse
+import json
+from datetime import datetime, timedelta
+import time
+
+parser = argparse.ArgumentParser(description="Process job parameters")
+parser.add_argument("--id", type=str, help="Path to the job script")
+parser.add_argument("--owner", type=str, help="the name of job owner")
+parser.add_argument("--executor", type=str, required=True, help="Executor to be used")
+
+args = parser.parse_args()
+
+from psij import Job, JobExecutor
+from psij.serialize import JSONSerializer
+
+ex = JobExecutor.get_instance(args.executor)
+if args.id:
+    job = Job()
+    job._native_id = args.id
+    job_data = ex.info([job])
+elif args.owner:
+    job_data = ex.info(owner=args.owner)
+else:
+    job_data = ex.info()
+
+s = JSONSerializer()
+# create dict for each job.
+# [ {'native_id': native_id, ... }, {'native_id': native_id, ...}, ...]
+data = []
+for job in job_data:
+    d = {}
+    d["native_id"] = job.native_id
+    d["current_state"] = job._status.state.name
+    d.update(job.current_info.__dict__)
+    d.update(s._from_spec(job.spec))
+    # the attributes and resources are nested in the job data.
+    # we need to flatten them.
+    attr = d["attributes"]
+    del d["attributes"]
+    d.update(attr)
+    # convert deltatime or string to integer
+    d["duration"] = job.spec.attributes.duration.total_seconds()
+    d["wall_time"] = int(d["wall_time"])
+    resources = d["resources"]
+    del d["resources"]
+    d.update(resources)
+    d["submission_time"] = d["submission_time"].strftime("%Y-%m-%d %H:%M:%S")
+    if d["dispatch_time"] is not None:
+        d["dispatch_time"] = d["dispatch_time"].strftime("%Y-%m-%d %H:%M:%S")
+    else:
+        d["dispatch_time"] = None
+
+    data.append(d)
+
+print(json.dumps(data))
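get_info.py emits a JSON array with one object per job; the keys are the dict entries built in the loop above (native_id, current_state, the flattened attributes/resources, duration, wall_time, submission_time, dispatch_time). A hedged sketch of the consuming side, assuming the Ruby adapter in psij.rb (not reproduced in this diff) captures the script's stdout into a variable named output:

  require "json"

  jobs = JSON.parse(output)   # output = stdout captured from get_info.py
  jobs.each do |job|
    # Keys below are the ones written by the script above.
    puts "#{job['native_id']} #{job['current_state']} #{job['submission_time']}"
  end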
data/lib/ood_core/job/adapters/psij/hold.py
@@ -0,0 +1,18 @@
+import argparse
+
+parser = argparse.ArgumentParser(description="Process job parameters")
+parser.add_argument("--id", type=str, required=True, help="Path to the job script")
+parser.add_argument("--executor", type=str, required=True, help="Executor to be used")
+
+args = parser.parse_args()
+
+from psij import Job, JobExecutor
+
+ex = JobExecutor.get_instance(args.executor)
+job = Job()
+job._native_id = args.id
+# catch exception
+try:
+    ex.hold(job)
+except Exception as e:
+    print(f"Invalid job id specified")
data/lib/ood_core/job/adapters/psij/release.py
@@ -0,0 +1,18 @@
+import argparse
+
+parser = argparse.ArgumentParser(description="Process job parameters")
+parser.add_argument("--id", type=str, required=True, help="Path to the job script")
+parser.add_argument("--executor", type=str, required=True, help="Executor to be used")
+
+args = parser.parse_args()
+
+from psij import Job, JobExecutor
+
+ex = JobExecutor.get_instance(args.executor)
+job = Job()
+job._native_id = args.id
+# catch exception
+try:
+    ex.release(job)
+except Exception as e:
+    print(f"Invalid job id specified")
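delete.py, hold.py, and release.py share the same command-line interface (--id and --executor, both required) and each wraps a single JobExecutor call (cancel, hold, release) on a Job rebuilt from its native id. A hedged sketch of how the Ruby side presumably shells out to them; the actual invocation lives in psij.rb, which this diff does not reproduce, and the script path, job id, and executor name below are placeholders:

  require "open3"

  # Cancel job 12345 through the PSI/J "slurm" executor by calling delete.py.
  out, err, status = Open3.capture3(
    "python3", "delete.py", "--id", "12345", "--executor", "slurm"
  )
  raise err unless status.success?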