ood_core 0.28.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ood_core/job/adapters/htcondor.rb +549 -0
- data/lib/ood_core/job/adapters/psij/delete.py +18 -0
- data/lib/ood_core/job/adapters/psij/get_info.py +55 -0
- data/lib/ood_core/job/adapters/psij/hold.py +18 -0
- data/lib/ood_core/job/adapters/psij/release.py +18 -0
- data/lib/ood_core/job/adapters/psij/submit.py +28 -0
- data/lib/ood_core/job/adapters/psij.rb +410 -0
- data/lib/ood_core/version.rb +1 -1
- metadata +9 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 53043c13d393367627c85321c8c7ef9d69d7a6cbab687ea32a82ab2077484024
+  data.tar.gz: e6c1f60a01e714e5ac02090a9c01ce7b6e96bde96d1568b31c60ab630cecc3c5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a5ee3699f5737abf3158e00341fd7e236b6320893f9a3640f7e5b61c854796512e03277c699db100ebeef402f735a58e82596b50dce152a381b01f43cb93ce3b
+  data.tar.gz: 5ff0fc3ed64f3154394aeedc5caed25fd0779304963dbbecf0b6b00083185459abfa40a3787da43f0da21eb475c30bb726c26514e7395ece23b193081e4f2cb1
data/lib/ood_core/job/adapters/htcondor.rb
ADDED
@@ -0,0 +1,549 @@
require "time"
require 'etc'
require 'tempfile'
require "ood_core/refinements/hash_extensions"
require "ood_core/refinements/array_extensions"
require "ood_core/job/adapters/helper"

module OodCore
  module Job
    class Factory
      using Refinements::HashExtensions

      # Build the HTCondor adapter from a configuration
      # @param config [#to_h] the configuration for job adapter
      # @option config [Object] :bin (nil) Path to HTCondor client binaries
      # @option config [Object] :submit_host ("") Submit job on login node via ssh
      # @option config [Object] :strict_host_checking (true) Whether to use strict host checking when ssh to submit_host
      def self.build_htcondor(config)
        c = config.to_h.symbolize_keys
        bin = c.fetch(:bin, nil)
        bin_overrides = c.fetch(:bin_overrides, {})
        submit_host = c.fetch(:submit_host, "")
        strict_host_checking = c.fetch(:strict_host_checking, true)
        default_universe = c.fetch(:default_universe, "vanilla")
        default_docker_image = c.fetch(:default_docker_image, "ubuntu:latest")
        user_group_map = c.fetch(:user_group_map, nil)
        cluster = c.fetch(:cluster, "")
        additional_attributes = c.fetch(:additional_attributes, {})
        htcondor = Adapters::HTCondor::Batch.new(bin: bin, bin_overrides: bin_overrides,
                                                 submit_host: submit_host, strict_host_checking: strict_host_checking,
                                                 default_universe: default_universe,
                                                 default_docker_image: default_docker_image,
                                                 user_group_map: user_group_map,
                                                 cluster: cluster,
                                                 additional_attributes: additional_attributes,
                                                )
        Adapters::HTCondor.new(htcondor: htcondor)
      end
    end

    module Adapters
      # An adapter object that describes the communication with an HTCondor
      # resource manager for job management.
      class HTCondor < Adapter
        using Refinements::HashExtensions
        using Refinements::ArrayExtensions

        # Object used for simplified communication with an HTCondor batch server
        # @api private
        class Batch
          # The path to the HTCondor client installation binaries
          # @return [Pathname] path to HTCondor binaries
          attr_reader :bin

          # The path to the HTCondor client installation binaries that override
          # the default binaries
          # @return [Pathname] path to HTCondor binaries overrides
          attr_reader :bin_overrides

          # The login node where the job is submitted via ssh
          # @return [String] The login node
          attr_reader :submit_host

          # Whether to use strict host checking when ssh to submit_host
          # @return [Bool]; true if empty
          attr_reader :strict_host_checking

          # Default universe for jobs submitted to HTCondor
          # @return [String] the default universe for jobs
          attr_reader :default_universe

          # Default docker image for jobs submitted to HTCondor
          # @return [String] the default docker image for jobs
          attr_reader :default_docker_image

          # A path to the user/group map for HTCondor jobs
          # The format in the file should adhere to the format used by [AssignAccountingGroup](https://htcondor.readthedocs.io/en/latest/admin-manual/introduction-to-configuration.html#FEATURE:ASSIGNACCOUNTINGGROUP)
          # @return [String,nil] the path to the user/group map file
          attr_reader :user_group_map

          # The cluster name for this HTCondor instance
          # @return [String] the cluster name
          attr_reader :cluster

          # Additional attributes to be added to the job submission
          # @return [Hash{#to_s => #to_s}] additional attributes to be added to the job submission
          attr_reader :additional_attributes

          # The version of HTCondor on the submit_host
          # @return [Gem::Version] the version of HTCondor
          attr_reader :version

          # The root exception class that all HTCondor-specific exceptions inherit
          # from
          class Error < StandardError; end

          # @param bin [#to_s] path to HTCondor installation binaries
          # @param submit_host [#to_s] Submits the job on a login node via ssh
          # @param strict_host_checking [Bool] Whether to use strict host checking when ssh to submit_host
          def initialize(bin: nil, bin_overrides: {}, submit_host: "", strict_host_checking: false, default_universe: "vanilla", default_docker_image: "ubuntu:latest", user_group_map: nil, cluster: "", additional_attributes: {})
            @bin = Pathname.new(bin.to_s)
            @bin_overrides = bin_overrides
            @submit_host = submit_host.to_s
            @strict_host_checking = strict_host_checking
            @default_universe = default_universe.to_s
            @default_docker_image = default_docker_image.to_s
            @user_group_map = user_group_map.to_s unless user_group_map.nil?
            @cluster = cluster.to_s
            @additional_attributes = additional_attributes
            @version = get_htcondor_version
          end

          # Submit a script to the batch server
          # @param args [Array<#to_s>] arguments passed to `condor_submit` command
          # @param env [Hash{#to_s => #to_s}] environment variables set
          # @param script [String] the script to submit
          # @raise [Error] if `condor_submit` command exited unsuccessfully
          # @return [String] the id of the job that was created
          def submit_string(args: [], script_args: [], env: {}, script: "")
            args = args.map(&:to_s)
            script_args = script_args.map(&:to_s).map { |s| s.to_s.gsub('"', "'") } # cannot do double
            env = env.to_h.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }

            path = "#{Dir.tmpdir}/htcondor_submit_#{SecureRandom.uuid}"

            call("bash", "-c", "cat > #{path}", stdin: script)
            output = call("condor_submit", *args, env: env, stdin: "arguments=#{path.split("/").last} #{script_args.join(" ")}\ntransfer_input_files=#{path}").strip

            match = output.match(/(cluster )?(\d+)/)
            raise Error, "Failed to parse job ID from output: #{output}" unless match
            match[2]
          end

          # Run the `condor_rm` command to remove a job
          # @param id [#to_s] the id of the job to remove
          # @raise [Error] if `condor_rm` command exited unsuccessfully
          def remove_job(id)
            call("condor_rm", id.to_s)
          rescue Error => e
            raise Error, "Failed to remove job #{id}: #{e.message}"
          end

          # Place a job on hold using `condor_hold`
          # @param id [#to_s] the id of the job to hold
          # @raise [Error] if `condor_hold` command exited unsuccessfully
          def hold_job(id)
            id = id.to_s
            call("condor_hold", id)
          rescue Error => e
            raise Error, "Failed to hold job #{id}: #{e.message}"
          end

          # Release a job from hold using `condor_release`
          # @param id [#to_s] the id of the job to release
          # @raise [Error] if `condor_release` command exited unsuccessfully
          def release_job(id)
            id = id.to_s
            call("condor_release", id)
          rescue Error => e
            raise Error, "Failed to release job #{id}: #{e.message}"
          end

          def condor_q_attrs
            {
              id: "ClusterId",
              sub_id: "ProcId",
              status: "JobStatus",
              owner: "Owner",
              acct_group: "AcctGroup",
              job_name: "JobBatchName",
              procs: "CpusProvisioned",
              gpus: "GpusProvisioned",
              submission_time: "QDate",
              dispatch_time: "JobCurrentStartDate",
              sys_cpu_time: "RemoteSysCpu",
              user_cpu_time: "RemoteUserCpu",
              wallclock_time: "RemoteWallClockTime"
            }
          end

          # Retrieve job information using `condor_q`
          # @param id [#to_s] the id of the job
          # @param owner [String] the owner(s) of the job
          # @raise [Error] if `condor_q` command exited unsuccessfully
          # @return [Array<Hash>] list of details for jobs
          def get_jobs(id: "", owner: nil)
            args = []
            unless id.to_s.empty?
              if id.to_s.include?(".") # if id is a job array, we need to use the ClusterId and ProcId
                cluster_id, proc_id = id.to_s.split(".")
                args.concat ["-constraint", "\"ClusterId == #{cluster_id} && ProcId == #{proc_id}\""]
              else # if id is a single job, we can just use the ClusterId
                args.concat ["-constraint", "\"ClusterId == #{id}\""]
              end
            end
            args.concat ["-constraint", "\"Owner == #{owner}\""] unless owner.to_s.empty?
            args.concat ["-af", *condor_q_attrs.values]

            output = call("condor_q", *args)
            parse_condor_q_output(output)
          end

          # Retrieve slot information using `condor_status`
          # @param owner [String] the owner(s) of the slots
          # @raise [Error] if `condor_status` command exited unsuccessfully
          # @return [Array<Hash>] list of details for slots
          def get_slots
            args = ["-af", "Machine", "TotalSlotCPUs", "TotalSlotGPUs", "TotalSlotMemory", "CPUs", "GPUs", "Memory", "NumDynamicSlots"]
            args.concat ["-constraint", "\"DynamicSlot is undefined\""]

            output = call("condor_status", *args)
            parse_condor_status_output(output)
          end

          # Retrieve accounts using user_group_map on @submit_host
          # @return [Hash{String => Array<String>}] mapping of usernames to their groups
          def get_accounts
            raise Error, "user_group_map is not defined" if user_group_map.nil? || user_group_map.empty?

            # Retrieve accounts, use local file, if exists. Otherwise use from submit_host
            if File.exist?(user_group_map) && File.readable?(user_group_map)
              output = File.read(user_group_map)
            else
              output = call("cat", user_group_map)
            end
            accounts = {}
            output.each_line do |line|
              next if line.strip.empty? || line.start_with?("#") # Skip empty lines and comments
              _, username, groups = line.strip.split(/\s+/, 3)
              accounts[username] = groups.split(",") if username && groups
            end

            accounts
          rescue Error => e
            raise Error, "Failed to retrieve accounts: #{e.message}"
          end

          private

          # Parse the output of `condor_q` into a list of job hashes
          def parse_condor_q_output(output)
            jobs = []
            fields = condor_q_attrs
            output.each_line do |line|
              # Parse each line into a hash
              job_data = line.split
              job = Hash[fields.keys.zip(job_data)]
              job[:submit_host] = @submit_host # Add submit host to job data
              job[:native] = job_data # Add native attributes to job data
              jobs << job
            end
            jobs
          end

          # Parse the output of `condor_status` into a list of slot hashes
          def parse_condor_status_output(output)
            slots = []
            output.each_line do |line|
              # Parse each line into a hash (custom parsing logic for HTCondor slots)
              slot_data = line.split
              slots << { machine: slot_data[0], total_cpus: slot_data[1].to_i, total_gpus: slot_data[2].to_i, total_memory: slot_data[3].to_i,
                         cpus: slot_data[4].to_i, gpus: slot_data[5].to_i, memory: slot_data[6].to_i,
                         num_dynamic_slots: slot_data[7].to_i }
            end
            slots
          end

          # Call a forked HTCondor command
          def call(cmd, *args, env: {}, stdin: "")
            cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
            args = args.map(&:to_s)

            cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
            o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
            s.success? ? o : raise(Error, e)
          end

          def get_htcondor_version
            output = call("condor_version")
            match = output.match(/CondorVersion: (\d+\.\d+\.\d+)/)
            raise Error, "Failed to parse HTCondor version from output: #{output}" unless match
            Gem::Version.new(match[1])
          end
        end

        # Map HTCondor job statuses to symbols
        STATUS_MAP = {
          "1" => :queued,
          "2" => :running,
          "3" => :running,
          "4" => :completed,
          "5" => :queued_held,
          "6" => :running,
          "7" => :suspended
        }.freeze

        # @api private
        # @param opts [#to_h] the options defining this adapter
        # @option opts [Batch] :htcondor The HTCondor batch object
        # @see Factory.build_htcondor
        def initialize(opts = {})
          o = opts.to_h.symbolize_keys

          @htcondor = o.fetch(:htcondor) { raise ArgumentError, "No HTCondor object specified. Missing argument: htcondor" }
        end

        # Submit a job with the attributes defined in the job template instance
        # @param script [Script] script object that describes the script and
        #   attributes for the submitted job
        # @raise [JobAdapterError] if something goes wrong submitting a job
        # @return [String] the job id returned after successfully submitting a
        #   job
        def submit(script)
          args = []
          args.concat ["-batch-name", "#{script.job_name}"] unless script.job_name.nil?
          args.concat ["-name", "#{script.queue_name}"] unless script.queue_name.nil?
          args.concat ["-a", "priority=#{script.priority}"] unless script.priority.nil?
          args.concat ["-a", "accounting_group=#{script.accounting_id}"] unless script.accounting_id.nil?

          args.concat ["-a", "submit_as_hold=#{script.hold}"] unless script.submit_as_hold.nil?
          args.concat ["-a", "max_retries=0"] unless !script.rerunnable.nil? && script.rerunnable

          args.concat ["-a", "allowed_execute_duration=#{script.wall_time}"] unless script.wall_time.nil?
          args.concat ["-a", "periodic_remove='HoldReasonCode == 47'"] unless script.wall_time.nil?
          args.concat ["-a", "deferral_time=#{script.start_time.tv_sec}"] unless script.start_time.nil?

          args.concat ["-a", "request_cpus=#{script.cores}"] unless script.cores.nil?
          # requesting 1GB of memory per core seems reasonable
          args.concat ["-a", "request_memory=#{script.cores * 1024}"] unless script.native.include?(:request_memory) && !script.native[:request_memory].nil?
          args.concat ["-a", "request_gpus=#{script.gpus_per_node}"] unless script.gpus_per_node.nil?

          universe = script.native[:universe] || @htcondor.default_universe
          args.concat ["-a", "universe=#{universe}"]
          container_image = script.native[:docker_image] || @htcondor.default_docker_image
          if universe == "docker" then
            args.concat ["-a", "docker_image=#{@htcondor.default_docker_image}"] unless script.native.include?(:docker_image) && !script.native[:docker_image].nil?
          elsif universe == "container" then
            script.native.delete(:docker_image) unless !script.native.include?(:docker_image)
            script.native[:container_image] = container_image
          end

          args.concat ["-a", "input=#{script.input_path}"] unless script.input_path.nil?
          if script.output_path.nil? then args.concat ["-a", "output=output.txt"] else args.concat ["-a", "output=#{script.output_path}"] end
          if script.error_path.nil? then args.concat ["-a", "error=error.txt"] else args.concat ["-a", "error=#{script.error_path}"] end
          if script.workdir.nil? then args.concat ["-a", "log=job.log"] else args.concat ["-a", "log=#{script.workdir}/job.log"] end

          args.concat ["-a", "initialdir=#{script.workdir}"] unless script.workdir.nil?
          args.concat ["-a", "\"environment=\\\"#{script.job_environment.to_a.map { |k, v| "#{k}='#{v.gsub("'", "''").gsub('"', "\\\"\\\"")}'" }.join(' ')}\\\"\""] unless script.job_environment.nil? || script.job_environment.empty?
          args.concat ["-a", "getenv=#{script.copy_environment}"] unless script.copy_environment.nil?

          args.concat ["-a", "should_transfer_files=true"]
          args.concat ["-a", "+OpenOnDemand=true"]

          # send email when started / terminated
          if script.email_on_started && script.email_on_terminated then
            raise JobAdapterError, "Cannot handle both email_on_started and email_on_terminated set to true" if script.email_on_started && script.email_on_terminated
            # args.concat ["-a", "notification=Always"] # might be supported in the future?
          elsif script.email_on_started then
            if @htcondor.version >= Gem::Version.new("24.10.0") then
              args.concat ["-a", "notification=Start"]
            else
              raise JobAdapterError, "Email notification on job start is not supported by this HTCondor version. Please upgrade to 24.10.0 or later."
            end
          elsif script.email_on_terminated then
            args.concat ["-a", "notification=Complete"]
          else
            args.concat ["-a", "notification=Never"]
          end
          args.concat ["-a", "notify_user=#{script.email}"] unless script.email.nil?

          args.concat @htcondor.additional_attributes.to_a.map { |k, v| "-a #{k}=#{v}" } unless @htcondor.additional_attributes.nil? || @htcondor.additional_attributes.empty?
          args.concat script.native.to_a.map { |k, v| "-a #{k}=#{v}" } unless script.native.nil? || script.native.empty?

          content = script.content

          # Set executable to some shell to execute the script
          if script.shell_path.nil?
            args.concat ["-a", "executable=/bin/bash"]
          else
            args.concat ["-a", "executable=#{script.shell_path}"]
          end

          # terse to shut up the output, - to get the script arguments from stdin.
          args.concat ["-terse", "-"]

          if script.job_array_request.nil?
            # If no job array request is specified, we submit a single job
            args.concat ["-queue", "1"]
          else
            # If a job array request is specified, we submit a job array
            # The job array request is expected to be a string like "1-10" or "1,2,3"
            # we must convert 1-3 to 1,2,3.
            if script.job_array_request.include?("-")
              start, finish = script.job_array_request.split("-").map(&:to_i)
              job_ids = (start..finish).to_a.join(",")
            else
              job_ids = script.job_array_request
            end
            # Generate multiple jobs in the job array by setting OODArrayId to the requested array ids
            # While -queue 10 would generate 10 jobs, the ProcId would always be 0-9, not 1-10 - or whatever the request is.
            # So we set the OODArrayId to the requested job ids.
            args.concat ["-queue", "1", "+OODArrayId", "in", job_ids.to_s]
          end

          script_args = script.args || []

          @htcondor.submit_string(args: args, script_args: script_args, script: content)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve job info from the resource manager
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong getting job info
        # @return [Info] information describing submitted job
        def info(id)
          id = id.to_s
          jobs = @htcondor.get_jobs(id: id)
          jobs.empty? ? Info.new(id: id, status: :completed) : parse_job_info(jobs.first)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve information for all jobs
        # @raise [JobAdapterError] if something goes wrong retrieving job info
        # @return [Array<Info>] list of information describing submitted jobs
        def info_all(attrs: nil)
          jobs = @htcondor.get_jobs
          jobs.map { |job| parse_job_info(job) }
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve the status of a job
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong retrieving the job status
        # @return [Symbol] the status of the job
        def status(id)
          id = id.to_s
          jobs = @htcondor.get_jobs(id: id)
          jobs.empty? ? :completed : get_state(jobs.first[:status])
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve cluster status information
        # @raise [JobAdapterError] if something goes wrong retrieving cluster status
        # @return [Hash] summary of cluster status including active and total nodes, processors, GPUs, etc.
        def cluster_info
          slots = @htcondor.get_slots
          active_nodes = slots.count { |slot| slot[:num_dynamic_slots] > 0 }
          total_nodes = slots.map { |slot| slot[:machine] }.uniq.count
          active_processors = slots.sum { |slot| slot[:total_cpus] - slot[:cpus] }
          total_processors = slots.sum { |slot| slot[:total_cpus] }
          active_gpus = slots.sum { |slot| slot[:total_gpus] - slot[:gpus] }
          total_gpus = slots.sum { |slot| slot[:total_gpus] }

          ClusterInfo.new({
            active_nodes: active_nodes,
            total_nodes: total_nodes,
            active_processors: active_processors,
            total_processors: total_processors,
            active_gpus: active_gpus,
            total_gpus: total_gpus
          })
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Indicate that the job adapter supports job arrays
        def supports_job_arrays?
          true
        end

        # Place a job on hold
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong placing the job on hold
        def hold(id)
          @htcondor.hold_job(id)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Release a job from hold
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong releasing the job
        def release(id)
          @htcondor.release_job(id)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Delete a job
        # @param id [#to_s] the id of the job
        # @raise [JobAdapterError] if something goes wrong deleting the job
        def delete(id)
          @htcondor.remove_job(id)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve the relevant groups for the current user
        # @return [Array<AccountInfo>] list of groups for the current user
        def accounts
          username = Etc.getlogin
          groups = @htcondor.get_accounts[username]
          parse_group_into_account_info(groups)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        private

        def get_state(st)
          STATUS_MAP.fetch(st.to_s, :undetermined)
        end

        # Parse hash describing HTCondor job status
        def parse_job_info(job)
          Info.new(
            id: job[:id].to_s + (job[:sub_id].to_s.empty? ? "" : ".#{job[:sub_id]}"),
            status: get_state(job[:status]),
            job_name: job[:job_name],
            job_owner: job[:owner],
            accounting_id: job[:acct_group],
            submit_host: job[:submit_host],
            procs: job[:procs].to_i,
            gpus: job[:gpus].to_i,
            submission_time: Time.at(job[:submission_time].to_i),
            dispatch_time: Time.at(job[:dispatch_time].to_i),
            cpu_time: job[:sys_cpu_time].to_i + job[:user_cpu_time].to_i,
            wallclock_time: job[:wallclock_time].to_i,
            native: job[:native],
          )
        end

        # Parse group information into AccountInfo objects
        # @param groups [Array<String>] list of group names
        # @return [Array<AccountInfo>] list of AccountInfo objects
        def parse_group_into_account_info(groups)
          groups.map { |group| AccountInfo.new(name: group, cluster: @htcondor.cluster) }
        end

      end
    end
  end
end
data/lib/ood_core/job/adapters/psij/delete.py
ADDED
@@ -0,0 +1,18 @@
import argparse

parser = argparse.ArgumentParser(description="Process job parameters")
parser.add_argument("--id", type=str, required=True, help="Path to the job script")
parser.add_argument("--executor", type=str, required=True, help="Executor to be used")

args = parser.parse_args()

from psij import Job, JobExecutor

ex = JobExecutor.get_instance(args.executor)
job = Job()
job._native_id = args.id
# catch exception
try:
    ex.cancel(job)
except Exception as e:
    print(f"Invalid job id specified")
data/lib/ood_core/job/adapters/psij/get_info.py
ADDED
@@ -0,0 +1,55 @@
import argparse
import json
from datetime import datetime, timedelta
import time

parser = argparse.ArgumentParser(description="Process job parameters")
parser.add_argument("--id", type=str, help="Path to the job script")
parser.add_argument("--owner", type=str, help="the name of job owner")
parser.add_argument("--executor", type=str, required=True, help="Executor to be used")

args = parser.parse_args()

from psij import Job, JobExecutor
from psij.serialize import JSONSerializer

ex = JobExecutor.get_instance(args.executor)
if args.id:
    job = Job()
    job._native_id = args.id
    job_data = ex.info([job])
elif args.owner:
    job_data = ex.info(owner=args.owner)
else:
    job_data = ex.info()

s = JSONSerializer()
# create dict for each job.
# [ {'native_id': native_id, ... }, {'native_id': native_id, ...}, ...]
data = []
for job in job_data:
    d = {}
    d["native_id"] = job.native_id
    d["current_state"] = job._status.state.name
    d.update(job.current_info.__dict__)
    d.update(s._from_spec(job.spec))
    # the attributes and resources are nested in the job data.
    # we need to flatten them.
    attr = d["attributes"]
    del d["attributes"]
    d.update(attr)
    # convert timedelta or string to integer
    d["duration"] = job.spec.attributes.duration.total_seconds()
    d["wall_time"] = int(d["wall_time"])
    resources = d["resources"]
    del d["resources"]
    d.update(resources)
    d["submission_time"] = d["submission_time"].strftime("%Y-%m-%d %H:%M:%S")
    if d["dispatch_time"] is not None:
        d["dispatch_time"] = d["dispatch_time"].strftime("%Y-%m-%d %H:%M:%S")
    else:
        d["dispatch_time"] = None

    data.append(d)

print(json.dumps(data))
data/lib/ood_core/job/adapters/psij/hold.py
ADDED
@@ -0,0 +1,18 @@
import argparse

parser = argparse.ArgumentParser(description="Process job parameters")
parser.add_argument("--id", type=str, required=True, help="Path to the job script")
parser.add_argument("--executor", type=str, required=True, help="Executor to be used")

args = parser.parse_args()

from psij import Job, JobExecutor

ex = JobExecutor.get_instance(args.executor)
job = Job()
job._native_id = args.id
# catch exception
try:
    ex.hold(job)
except Exception as e:
    print(f"Invalid job id specified")
data/lib/ood_core/job/adapters/psij/release.py
ADDED
@@ -0,0 +1,18 @@
import argparse

parser = argparse.ArgumentParser(description="Process job parameters")
parser.add_argument("--id", type=str, required=True, help="Path to the job script")
parser.add_argument("--executor", type=str, required=True, help="Executor to be used")

args = parser.parse_args()

from psij import Job, JobExecutor

ex = JobExecutor.get_instance(args.executor)
job = Job()
job._native_id = args.id
# catch exception
try:
    ex.release(job)
except Exception as e:
    print(f"Invalid job id specified")
data/lib/ood_core/job/adapters/psij/submit.py
ADDED
@@ -0,0 +1,28 @@
import sys
from psij import Job, JobExecutor
from psij.serialize import JSONSerializer
from pathlib import Path
import json
import os

# create executor instance.
ex = JobExecutor.get_instance(sys.argv[1])

# deserialize json data to job spec.
deserialize = JSONSerializer()
d = sys.stdin.read()
j = json.loads(d)
spec = deserialize._to_spec(j)

# add executor string to each key of custom attributes.
if sys.argv[1] != "local" and spec.attributes.custom_attributes is not None:
    h = {}
    for k in spec.attributes.custom_attributes.keys():
        h[f"{ex.name}.{k}"] = spec.attributes.custom_attributes[k]
    spec.attributes.custom_attributes = h

spec.executable = os.path.expanduser(spec.executable)
job = Job(spec)

ex.submit(job)
print(job.native_id)
data/lib/ood_core/job/adapters/psij.rb
ADDED
@@ -0,0 +1,410 @@
require "time"
require 'etc'
require "ood_core/refinements/hash_extensions"
require "ood_core/refinements/array_extensions"
require "ood_core/job/adapters/helper"

require 'json'
require 'pathname'

module OodCore
  module Job
    class Factory

      using Refinements::HashExtensions
      # Build the PSIJ adapter from a configuration
      # @param config [#to_h] the configuration for job adapter
      # @option config [Object] :bin (nil) Path to PSIJ binaries
      # @option config [#to_h] :bin_overrides ({}) Optional overrides to PSIJ executables
      def self.build_psij(config)
        c = config.to_h.symbolize_keys
        cluster = c.fetch(:cluster, nil)
        conf = c.fetch(:conf, nil)
        bin = c.fetch(:bin, nil)
        bin_overrides = c.fetch(:bin_overrides, {})
        submit_host = c.fetch(:submit_host, "")
        strict_host_checking = c.fetch(:strict_host_checking, true)
        executor = c.fetch(:executor, nil)
        queue_name = c.fetch(:queue_name, nil)
        psij = Adapters::PSIJ::Batch.new(cluster: cluster, conf: conf, bin: bin, bin_overrides: bin_overrides, submit_host: submit_host, strict_host_checking: strict_host_checking, executor: executor, queue_name: queue_name)
        Adapters::PSIJ.new(psij: psij)
      end
    end

    module Adapters
      class PSIJ < Adapter
        using Refinements::HashExtensions
        using Refinements::ArrayExtensions
        class Batch

          attr_reader :cluster
          attr_reader :conf
          attr_reader :bin
          attr_reader :bin_overrides
          attr_reader :submit_host
          attr_reader :strict_host_checking
          attr_reader :executor
          attr_reader :queue_name

          class Error < StandardError; end

          def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {}, submit_host: "", strict_host_checking: true, executor: nil, queue_name: nil)
            @cluster = cluster && cluster.to_s
            @conf = conf && Pathname.new(conf.to_s)
            @bin = Pathname.new(bin.to_s)
            @bin_overrides = bin_overrides
            @submit_host = submit_host.to_s
            @strict_host_checking = strict_host_checking
            @executor = executor
            @queue_name = queue_name
          end

          def get_jobs(id: "", owner: nil)
            id = id.to_s.strip()
            params = {
              id: id,
              executor: executor,
            }
            args = params.map { |k, v| "--#{k}=#{v}" }
            get_info_path = Pathname.new(__FILE__).dirname.expand_path.join("psij/get_info.py").to_s
            jobs_data = call("python3", get_info_path, *args)
            jobs_data = JSON.parse(jobs_data, symbolize_names: true)
            jobs_data
          end

          def submit_job_path(args: [], chdir: nil, stdin: nil)
            submit_path = Pathname.new(__FILE__).dirname.expand_path.join("psij/submit.py").to_s
            call("python3", submit_path, *args, chdir: chdir, stdin: stdin)
          end

          def delete_job(args: [])
            delete_path = Pathname.new(__FILE__).dirname.expand_path.join("psij/delete.py").to_s
            call("python3", delete_path, *args)
          rescue => e
            raise JobAdapterError, e
          end

          def hold_job(args: [])
            hold_path = Pathname.new(__FILE__).dirname.expand_path.join("psij/hold.py").to_s
            call("python3", hold_path, *args)
          end

          def release_job(args: [])
            release_path = Pathname.new(__FILE__).dirname.expand_path.join("psij/release.py").to_s
            call("python3", release_path, *args)
          end

          def seconds_to_duration(time)
            "%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
          end

          private
          # Call a forked psij script for a given cluster
          def call(cmd, *args, env: {}, stdin: "", chdir: nil)
            cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
            cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
            chdir ||= "."
            o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin, chdir: chdir.to_s)
            s.success? ? o : raise(Error, e)
          end

        end


        STATE_MAP = {
          'NEW' => :undetermined,
          'QUEUED' => :queued,
          'HELD' => :queued_held,
          'ACTIVE' => :running,
          'COMPLETED' => :completed,
        }

        def initialize(opts = {})
          o = opts.to_h.symbolize_keys

          @psij = o.fetch(:psij) { raise ArgumentError, "No psij object specified. Missing argument: psij" }
        end


        # The `submit` method saves a job script as a file and prepares a command to submit the job.
        # Each optional argument specifies job dependencies (after, afterok, afternotok, afterany).
        def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
          # convert OOD interfaces to PSI/J interfaces.
          # Converted variables are shown as follows:
          # OOD                 | PSI/J(JobSpec)
          # --------------------+----------------------------------------------------
          # submit_as_hold      | X (not supported)
          # rerunnable          | X
          # email_on_started    | X
          # email_on_terminated | X
          # args                | JobAttributes.custom_attributes
          # job_environment     | environment
          # workdir             | directory
          # email               | X
          # job_name            | name
          # shell_path          | #!<shell_path>
          # input_path          | stdin_path
          # output_path         | stdout_path
          # error_path          | stderr_path
          # reservation_id      | JobAttributes.reservation_id
          # queue_name          | JobAttributes.queue_name
          # priority            | X
          # start_time          | X
          # wall_time           | JobAttributes.duration
          # accounting_id       | JobAttributes.account or project_name(duplicated)
          # job_array_request   | X
          # qos                 | X
          # gpus_per_node       | ResourceSpec.gpu_cores_per_process
          # native              | executable (join script.content)
          # copy_environment    | inherit_environment
          # cores               | ResourceSpec.cpu_cores_per_process
          # after               | X
          # afterok             | X
          # afternotok          | X
          # afterany            | X
          # OOD does not have following PSI/J's interfaces.
          # JobSpec class:
          #   pre_launch, post_launch, launcher
          # ResourceSpec class:
          #   node_count, process_count, processes_per_node, exclusive_node_use

          content = if script.shell_path.nil?
                      script.content
                    else
                      "#!#{script.shell_path}\n#{script.content}"
                    end

          if ! script.native.nil?
            native = script.native.join("\n") unless script.native.nil?
            script.content.concat(native)
          end

          relative_path = "~/ood_tmp/run.sh"
          full_path = File.expand_path("~/ood_tmp/run.sh")
          FileUtils.mkdir_p(File.dirname(full_path))
          File.open(full_path, "w") do |file|
            file.write(content)
          end

          File.chmod(0755, full_path)

          # convert OOD interfaces to PSI/J interfaces.
          params = {
            environment: script.job_environment,
            directory: script.workdir,
            name: script.job_name,
            executable: relative_path,
            stdin_path: script.input_path,
            stdout_path: script.output_path,
            stderr_path: script.error_path,
            inherit_environment: script.copy_environment,
            attributes: {queue_name: script.queue_name,
                         reservation_id: script.reservation_id,
                         account: script.accounting_id,
                         duration: script.wall_time,
                         custom_attributes: script.args},
            resources: {__version: 1,
                        gpu_cores_per_process: script.gpus_per_node,
                        cpu_cores_per_process: script.cores}
          }

          if params[:attributes][:queue_name].nil?
            params[:attributes][:queue_name] = @psij.queue_name
          end
          if params[:stdout_path].nil?
            params[:stdout_path] = File.join(Dir.pwd, "stdout.txt")
          end
          if params[:stderr_path].nil?
            params[:stderr_path] = File.join(Dir.pwd, "stderr.txt")
          end

          # add script.native to params[:attributes][:custom_attributes] of PSI/J.
          if script.native && !script.native.empty?
            if params[:attributes][:custom_attributes].nil?
              params[:attributes][:custom_attributes] = script.native
            else
              params[:attributes][:custom_attributes].concat(script.native)
            end
          end
          # Add script.native to params[:attributes][:custom_attributes] of PSI/J.
          # Convert script.native array to hash.
          # ['--<name>', 'value'] -> {name: value}
          # ['--<name1>', '--<name2>'] -> {name1: "", name2: ""}
          if ! params[:attributes][:custom_attributes].nil?
            hash = {}
            skip = false
            len = params[:attributes][:custom_attributes].length()-1
            for index in 0..len do
              if skip
                skip = false
                next
              end
              v = params[:attributes][:custom_attributes][index]
              has_hyphen = false
              if v.start_with?("--")
                name = v[2..-1]
                has_hyphen = true
              elsif v.start_with?("-")
                name = v[1..-1]
                has_hyphen = true
              else
                name = v
              end
              if index == len || !has_hyphen || params[:attributes][:custom_attributes][index+1].start_with?("-")
                # if the next value does not exist or starts with "-", set an empty string
                hash[name] = ""
              else
                # if the next value exists and does not start with "-", set the value
                hash[name] = params[:attributes][:custom_attributes][index+1]
                skip = true
              end
            end
            params[:attributes][:custom_attributes] = hash
          end

          # reject key which has nil value.
          params[:attributes] = params[:attributes].reject {|_, value |value.nil?}
          params[:resources] = params[:resources].reject {|_, value |value.nil?}
          data = params.reject {|_, value |value.nil?}

          # serialize params to JSON
          args = []
          args[0] = @psij.executor

          @psij.submit_job_path(args: args, chdir: script.workdir, stdin: JSON.generate(data))
        rescue Batch::Error => e
          raise JobAdapterError, e
        end

        def cluster_info
        end

        def accounts
        end

        def delete(id)
          id = id.to_s.strip()
          params = {
            id: id,
            executor: @psij.executor,
          }
          args = params.map { |k, v| "--#{k}=#{v}" }
          @psij.delete_job(args: args)
        rescue Batch::Error => e
          raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
        end

        def hold(id)
          id = id.to_s.strip()
          params = {
            id: id,
            executor: @psij.executor,
          }
          args = params.map { |k, v| "--#{k}=#{v}" }
          @psij.hold_job(args: args)
        rescue Batch::Error => e
          raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
        end

        def release(id)
          id = id.to_s.strip()
          params = {
            id: id,
            executor: @psij.executor,
          }
          args = params.map { |k, v| "--#{k}=#{v}" }
          @psij.release_job(args: args)
        rescue Batch::Error => e
          raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
        end


        def info(id)
          id = id.to_s

          job_infos = @psij.get_jobs(id: id).map do |v|
            parse_job_info(v)
          end

          if job_infos.empty?
            Info.new(id: id, status: :completed)
          else
            job_infos.first
          end
        rescue Batch::Error => e
          # set completed status if can't find job id
          if /Invalid job id specified/ =~ e.message
            Info.new(
              id: id,
              status: :completed
            )
          else
            raise JobAdapterError, e.message
          end
        end

        def info_all(attrs: nil)
          @psij.get_jobs.map do |v|
            parse_job_info(v)
          end
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        def info_where_owner(owner, attrs: nil)
          owner = Array.wrap(owner).map(&:to_s).join(',')
          @psij.get_jobs(owner: owner).map do |v|
            parse_job_info(v)
          end
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        def status(id)
          info(id.to_s).status
        end

        def directive_prefix
        end

        private
        def get_state(st)
          STATE_MAP.fetch(st, :undetermined)
        end

        def parse_job_info(v)
          # parse input hash to Info object
          # if v doesn't have :resourcelist, set empty array
          if v[:resourcelist].nil? || v[:resourcelist].empty?
            allocated_nodes = [ { name: "" } ]
          else
            allocated_nodes = v[:resourcelist]
          end
          if v[:cpu_time].nil?
            cpu_time = nil
          else
            cpu_time = v[:cpu_time].to_i
          end
          Info.new(
            id: v[:native_id],
            status: get_state(v[:current_state]),
            allocated_nodes: allocated_nodes,
            submit_host: v[:submit_host],
            job_name: v[:name],
            job_owner: v[:owner],
            accounting_id: v[:account],
            procs: v[:process_count] ? v[:process_count].to_i : 0,
            queue_name: v[:queue_name],
            wallclock_time: v[:wall_time],
            wallclock_limit: v[:duration],
            cpu_time: cpu_time,
            submission_time: v[:submission_time] ? Time.parse(v[:submission_time]): nil,
            dispatch_time: v[:dispatch_time] ? Time.parse(v[:dispatch_time]): nil,
            native: v
          )
        end

      end
    end
  end
end
data/lib/ood_core/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ood_core
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.29.0
 platform: ruby
 authors:
 - Eric Franz
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-08-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ood_support
@@ -210,6 +210,7 @@ files:
 - lib/ood_core/job/adapters/drmaa.rb
 - lib/ood_core/job/adapters/fujitsu_tcs.rb
 - lib/ood_core/job/adapters/helper.rb
+- lib/ood_core/job/adapters/htcondor.rb
 - lib/ood_core/job/adapters/kubernetes.rb
 - lib/ood_core/job/adapters/kubernetes/batch.rb
 - lib/ood_core/job/adapters/kubernetes/helper.rb
@@ -224,6 +225,12 @@ files:
 - lib/ood_core/job/adapters/lsf/batch.rb
 - lib/ood_core/job/adapters/lsf/helper.rb
 - lib/ood_core/job/adapters/pbspro.rb
+- lib/ood_core/job/adapters/psij.rb
+- lib/ood_core/job/adapters/psij/delete.py
+- lib/ood_core/job/adapters/psij/get_info.py
+- lib/ood_core/job/adapters/psij/hold.py
+- lib/ood_core/job/adapters/psij/release.py
+- lib/ood_core/job/adapters/psij/submit.py
 - lib/ood_core/job/adapters/sge.rb
 - lib/ood_core/job/adapters/sge/batch.rb
 - lib/ood_core/job/adapters/sge/helper.rb