ood_core 0.28.0 → 0.30.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ood_core/cluster.rb +11 -1
- data/lib/ood_core/job/account_info.rb +0 -9
- data/lib/ood_core/job/adapter.rb +2 -1
- data/lib/ood_core/job/adapters/coder/batch.rb +46 -25
- data/lib/ood_core/job/adapters/coder/credentials.rb +16 -0
- data/lib/ood_core/job/adapters/coder/openstack_credentials.rb +118 -0
- data/lib/ood_core/job/adapters/coder.rb +10 -2
- data/lib/ood_core/job/adapters/htcondor.rb +549 -0
- data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh +1 -1
- data/lib/ood_core/job/adapters/psij/delete.py +18 -0
- data/lib/ood_core/job/adapters/psij/get_info.py +55 -0
- data/lib/ood_core/job/adapters/psij/hold.py +18 -0
- data/lib/ood_core/job/adapters/psij/release.py +18 -0
- data/lib/ood_core/job/adapters/psij/submit.py +28 -0
- data/lib/ood_core/job/adapters/psij.rb +410 -0
- data/lib/ood_core/job/adapters/sge/batch.rb +13 -10
- data/lib/ood_core/job/adapters/slurm.rb +39 -11
- data/lib/ood_core/job/adapters/systemd/templates/script_wrapper.erb.sh +1 -1
- data/lib/ood_core/job/queue_info.rb +8 -2
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +1 -0
- metadata +25 -2
|
@@ -0,0 +1,28 @@
|
|
|
"""Submit a job through PSI/J.

Usage: submit.py <executor>   (the serialized JobSpec JSON is read from stdin)

Reads a JSON-serialized PSI/J JobSpec from stdin, submits it via the named
executor backend, and prints the scheduler-native job id to stdout (which the
Ruby adapter captures as the job id).
"""
import json
import os
import sys

from psij import Job, JobExecutor
from psij.serialize import JSONSerializer


def main() -> None:
    # Guard against a missing executor argument instead of an IndexError.
    if len(sys.argv) < 2:
        print("usage: submit.py <executor>", file=sys.stderr)
        sys.exit(2)
    executor_name = sys.argv[1]

    # Create the executor instance for the requested backend (e.g. slurm, local).
    ex = JobExecutor.get_instance(executor_name)

    # Deserialize the JSON job description from stdin into a JobSpec.
    # NOTE(review): _to_spec is a private psij API -- confirm it remains
    # available before upgrading the psij dependency.
    serializer = JSONSerializer()
    spec = serializer._to_spec(json.loads(sys.stdin.read()))

    # Batch executors expect custom attributes namespaced by the executor name
    # (e.g. "slurm.constraint"); the local executor takes them as-is.
    if executor_name != "local" and spec.attributes.custom_attributes is not None:
        spec.attributes.custom_attributes = {
            f"{ex.name}.{key}": value
            for key, value in spec.attributes.custom_attributes.items()
        }

    # The wrapper script lives under the user's home directory, so the path
    # may contain "~" that must be expanded before submission.
    spec.executable = os.path.expanduser(spec.executable)

    job = Job(spec)
    ex.submit(job)
    print(job.native_id)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
require "time"
|
|
2
|
+
require 'etc'
|
|
3
|
+
require "ood_core/refinements/hash_extensions"
|
|
4
|
+
require "ood_core/refinements/array_extensions"
|
|
5
|
+
require "ood_core/job/adapters/helper"
|
|
6
|
+
|
|
7
|
+
require 'json'
|
|
8
|
+
require 'pathname'
|
|
9
|
+
|
|
10
|
+
module OodCore
|
|
11
|
+
module Job
|
|
12
|
+
class Factory
|
|
13
|
+
|
|
14
|
+
using Refinements::HashExtensions
|
|
15
|
+
# Build the PSIJ adapter from a configuration
|
|
16
|
+
# @param config [#to_h] the configuration for job adapter
|
|
17
|
+
# @option config [Object] :bin (nil) Path to PSIJ binaries
|
|
18
|
+
# @option config [#to_h] :bin_overrides ({}) Optional overrides to PSIJ executables
|
|
19
|
+
# Build the PSIJ adapter from a configuration.
# @param config [#to_h] the configuration for job adapter
# @option config [Object] :bin (nil) Path to PSIJ binaries
# @option config [#to_h] :bin_overrides ({}) Optional overrides to PSIJ executables
# @option config [#to_s] :submit_host ("") Host to ssh to before invoking commands
# @option config [Bool] :strict_host_checking (true) Strict host key checking for ssh
def self.build_psij(config)
  opts = config.to_h.symbolize_keys
  batch = Adapters::PSIJ::Batch.new(
    cluster:              opts.fetch(:cluster, nil),
    conf:                 opts.fetch(:conf, nil),
    bin:                  opts.fetch(:bin, nil),
    bin_overrides:        opts.fetch(:bin_overrides, {}),
    submit_host:          opts.fetch(:submit_host, ""),
    strict_host_checking: opts.fetch(:strict_host_checking, true),
    executor:             opts.fetch(:executor, nil),
    queue_name:           opts.fetch(:queue_name, nil)
  )
  Adapters::PSIJ.new(psij: batch)
end
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
module Adapters
|
|
35
|
+
class PSIJ < Adapter
|
|
36
|
+
using Refinements::HashExtensions
|
|
37
|
+
using Refinements::ArrayExtensions
|
|
38
|
+
# Object used for simplified communication with the PSI/J helper scripts
# bundled in the psij/ directory next to this file.
class Batch

  # The cluster this batch object operates on, if any
  attr_reader :cluster
  # Path to an optional configuration file
  attr_reader :conf
  # Path prefix for binaries (forwarded to Helper.bin_path)
  attr_reader :bin
  # Optional overrides for executable paths
  attr_reader :bin_overrides
  # Host to ssh to before running commands ("" disables ssh wrapping)
  attr_reader :submit_host
  # Whether ssh uses strict host key checking
  attr_reader :strict_host_checking
  # Name of the PSI/J executor backend (e.g. "slurm", "local")
  attr_reader :executor
  # Default queue to submit to when a script does not name one
  attr_reader :queue_name

  # The root exception class raised when a helper script fails.
  # The enclosing adapter rescues this class (and inspects its message)
  # to decide how a failure should be surfaced.
  class Error < StandardError; end

  def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {}, submit_host: "", strict_host_checking: true, executor: nil, queue_name: nil)
    @cluster              = cluster && cluster.to_s
    @conf                 = conf && Pathname.new(conf.to_s)
    @bin                  = Pathname.new(bin.to_s)
    @bin_overrides        = bin_overrides
    @submit_host          = submit_host.to_s
    @strict_host_checking = strict_host_checking
    @executor             = executor
    @queue_name           = queue_name
  end

  # Fetch job information through psij/get_info.py.
  # @param id [#to_s] a job id, or "" for all jobs
  # @param owner [#to_s, nil] currently unused
  # @return [Array<Hash>] parsed JSON job records with symbolized keys
  def get_jobs(id: "", owner: nil)
    # NOTE(review): `owner` is accepted but never forwarded to the helper;
    # confirm whether get_info.py supports an owner filter before wiring it up.
    params = { id: id.to_s.strip, executor: executor }
    args = params.map { |k, v| "--#{k}=#{v}" }
    jobs_data = call("python3", script_path("get_info.py"), *args)
    JSON.parse(jobs_data, symbolize_names: true)
  end

  # Submit a job through psij/submit.py; the JSON-encoded spec is passed on stdin.
  def submit_job_path(args: [], chdir: nil, stdin: nil)
    call("python3", script_path("submit.py"), *args, chdir: chdir, stdin: stdin)
  end

  # Cancel a job through psij/delete.py.
  # Raises Batch::Error on failure, like the other helpers, so the adapter
  # layer's `rescue Batch::Error` can suppress "Invalid job id specified".
  # (Previously this method wrapped failures in JobAdapterError, which made
  # that adapter-level handling unreachable.)
  def delete_job(args: [])
    call("python3", script_path("delete.py"), *args)
  end

  # Hold a job through psij/hold.py.
  def hold_job(args: [])
    call("python3", script_path("hold.py"), *args)
  end

  # Release a held job through psij/release.py.
  def release_job(args: [])
    call("python3", script_path("release.py"), *args)
  end

  # Format a number of seconds as "HH:MM:SS" (hours may exceed two digits).
  def seconds_to_duration(time)
    "%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
  end

  private
    # Absolute path of a helper script in the psij/ directory next to this file.
    def script_path(name)
      Pathname.new(__FILE__).dirname.expand_path.join("psij", name).to_s
    end

    # Call a forked psij script for a given cluster.
    # @raise [Error] when the command exits non-zero (message is stderr)
    def call(cmd, *args, env: {}, stdin: "", chdir: nil)
      cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
      cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
      chdir ||= "."
      o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin, chdir: chdir.to_s)
      s.success? ? o : raise(Error, e)
    end

end
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# Mapping of PSI/J job-state names (as reported by get_info.py) onto
# OodCore job status symbols. States missing from this map resolve to
# :undetermined (see #get_state). Frozen because constants should not be
# mutated at runtime.
STATE_MAP = {
  'NEW'       => :undetermined,
  'QUEUED'    => :queued,
  'HELD'      => :queued_held,
  'ACTIVE'    => :running,
  'COMPLETED' => :completed,
}.freeze
|
|
121
|
+
|
|
122
|
+
# @param opts [#to_h] the options defining this adapter
# @option opts [Batch] :psij (required) the Batch object used to talk to PSI/J
# @raise [ArgumentError] if no :psij object is provided
def initialize(opts = {})
  options = opts.to_h.symbolize_keys

  @psij = options.fetch(:psij) do
    raise ArgumentError, "No psij object specified. Missing argument: psij"
  end
end
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# Submit a job with the attributes defined in the job Script instance.
# The script content is written to ~/ood_tmp/run.sh, and the job spec is
# serialized to JSON and piped to the bundled psij/submit.py helper.
# Dependency arguments (after, afterok, afternotok, afterany) are accepted
# for interface compatibility but are NOT supported by this adapter.
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  # Convert OOD interfaces to PSI/J interfaces.
  # Converted variables are shown as follows:
  # OOD                 | PSI/J(JobSpec)
  # --------------------+----------------------------------------------------
  # submit_as_hold      | X (not supported)
  # rerunnable          | X
  # email_on_started    | X
  # email_on_terminated | X
  # args                | JobAttributes.custom_attributes
  # job_environment     | environment
  # workdir             | directory
  # email               | X
  # job_name            | name
  # shell_path          | #!<shell_path>
  # input_path          | stdin_path
  # output_path         | stdout_path
  # error_path          | stderr_path
  # reservation_id      | JobAttributes.reservation_id
  # queue_name          | JobAttributes.queue_name
  # priority            | X
  # start_time          | X
  # wall_time           | JobAttributes.duration
  # accounting_id       | JobAttributes.account or project_name(duplicated)
  # job_array_request   | X
  # qos                 | X
  # gpus_per_node       | ResourceSpec.gpu_cores_per_process
  # native              | executable (join script.content)
  # copy_environment    | inherit_environment
  # cores               | ResourceSpec.cpu_cores_per_process
  # after/afterok/afternotok/afterany | X
  # OOD does not have the following PSI/J interfaces:
  # JobSpec class:      pre_launch, post_launch, launcher
  # ResourceSpec class: node_count, process_count, processes_per_node, exclusive_node_use

  # Prepend a shebang line only when a shell path was requested; otherwise
  # `content` is the same String object as script.content (aliasing matters
  # for the mutation below).
  content = if script.shell_path.nil?
    script.content
  else
    "#!#{script.shell_path}\n#{script.content}"
  end

  # NOTE(review): this mutates script.content in place. When shell_path is
  # nil, `content` aliases script.content, so the joined native lines end up
  # in the script written below; when shell_path is set, `content` is a new
  # string and this concat has NO effect on the written file. script.native
  # is ALSO merged into custom_attributes further down, so native directives
  # may be applied twice -- confirm which path is intended.
  if ! script.native.nil?
    native = script.native.join("\n") unless script.native.nil?
    script.content.concat(native)
  end

  # Persist the job script under the user's home; submit.py expands the "~".
  relative_path = "~/ood_tmp/run.sh"
  full_path = File.expand_path("~/ood_tmp/run.sh")
  FileUtils.mkdir_p(File.dirname(full_path))
  File.open(full_path, "w") do |file|
    file.write(content)
  end

  # The wrapper must be executable by the scheduler.
  File.chmod(0755, full_path)

  # Build the PSI/J JobSpec-shaped hash (serialized to JSON below).
  params = {
    environment: script.job_environment,
    directory: script.workdir,
    name: script.job_name,
    executable: relative_path,
    stdin_path: script.input_path,
    stdout_path: script.output_path,
    stderr_path: script.error_path,
    inherit_environment: script.copy_environment,
    attributes: {queue_name: script.queue_name,
                 reservation_id: script.reservation_id,
                 account: script.accounting_id,
                 duration: script.wall_time,
                 custom_attributes: script.args},
    resources: {__version: 1,
                gpu_cores_per_process: script.gpus_per_node,
                cpu_cores_per_process: script.cores}
  }

  # Fall back to the adapter-configured default queue.
  if params[:attributes][:queue_name].nil?
    params[:attributes][:queue_name] = @psij.queue_name
  end
  # PSI/J needs concrete output paths; default to the current directory.
  if params[:stdout_path].nil?
    params[:stdout_path] = File.join(Dir.pwd, "stdout.txt")
  end
  if params[:stderr_path].nil?
    params[:stderr_path] = File.join(Dir.pwd, "stderr.txt")
  end

  # Append script.native entries to params[:attributes][:custom_attributes].
  if script.native && !script.native.empty?
    if params[:attributes][:custom_attributes].nil?
      params[:attributes][:custom_attributes] = script.native
    else
      params[:attributes][:custom_attributes].concat(script.native)
    end
  end

  # Convert the custom_attributes array into a hash of flag => value pairs:
  #   ['--<name>', 'value']   -> {name: value}
  #   ['--<name1>', '--<name2>'] -> {name1: "", name2: ""}
  if ! params[:attributes][:custom_attributes].nil?
    hash = {}
    skip = false            # true when the next element was consumed as a value
    len = params[:attributes][:custom_attributes].length()-1
    for index in 0..len do
      if skip
        skip = false
        next
      end
      v = params[:attributes][:custom_attributes][index]
      has_hyphen = false
      if v.start_with?("--")
        name = v[2..-1]
        has_hyphen = true
      elsif v.start_with?("-")
        name = v[1..-1]
        has_hyphen = true
      else
        name = v
      end
      if index == len || !has_hyphen || params[:attributes][:custom_attributes][index+1].start_with?("-")
        # if the next value does not exist or starts with "-", set empty string
        hash[name] = ""
      else
        # if the next value exists and does not start with "-", use it as the value
        hash[name] = params[:attributes][:custom_attributes][index+1]
        skip = true
      end
    end
    params[:attributes][:custom_attributes] = hash
  end

  # Reject keys that have nil values so they are omitted from the JSON spec.
  params[:attributes] = params[:attributes].reject {|_, value |value.nil?}
  params[:resources] = params[:resources].reject {|_, value |value.nil?}
  data = params.reject {|_, value |value.nil?}

  # submit.py's single CLI argument is the executor backend name; the spec
  # itself is streamed over stdin as JSON.
  args = []
  args[0] = @psij.executor

  @psij.submit_job_path(args: args, chdir: script.workdir, stdin: JSON.generate(data))
rescue Batch::Error => e
  raise JobAdapterError, e
end
|
|
278
|
+
|
|
279
|
+
# Retrieve cluster-wide information. Not implemented for PSI/J; returns nil.
def cluster_info
end
|
|
281
|
+
|
|
282
|
+
# Retrieve the accounts available to the current user.
# Not implemented for PSI/J; returns nil.
def accounts
end
|
|
284
|
+
|
|
285
|
+
# Delete the submitted job.
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong deleting the job
# @return [void]
def delete(id)
  cli_args = ["--id=#{id.to_s.strip}", "--executor=#{@psij.executor}"]
  @psij.delete_job(args: cli_args)
rescue Batch::Error => e
  # An unknown job id means the job is already gone; treat that as success.
  raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
end
|
|
296
|
+
|
|
297
|
+
# Put the submitted job on hold.
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong holding the job
# @return [void]
def hold(id)
  cli_args = ["--id=#{id.to_s.strip}", "--executor=#{@psij.executor}"]
  @psij.hold_job(args: cli_args)
rescue Batch::Error => e
  # A job the scheduler no longer knows about cannot be held; ignore it.
  raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
end
|
|
308
|
+
|
|
309
|
+
# Release the job that is on hold.
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong releasing the job
# @return [void]
def release(id)
  cli_args = ["--id=#{id.to_s.strip}", "--executor=#{@psij.executor}"]
  @psij.release_job(args: cli_args)
rescue Batch::Error => e
  # A job the scheduler no longer knows about cannot be released; ignore it.
  raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
end
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
# Retrieve job info from the resource manager.
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Info] information describing the submitted job
def info(id)
  id = id.to_s

  matches = @psij.get_jobs(id: id).map { |job_data| parse_job_info(job_data) }

  # A job the scheduler no longer knows about is reported as completed.
  return Info.new(id: id, status: :completed) if matches.empty?

  matches.first
rescue Batch::Error => e
  raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message

  # Invalid job id -> the job has left the queue; report it completed.
  Info.new(id: id, status: :completed)
end
|
|
345
|
+
|
|
346
|
+
# Retrieve info for all jobs from the resource manager.
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
def info_all(attrs: nil)
  @psij.get_jobs.map { |job_data| parse_job_info(job_data) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
|
353
|
+
|
|
354
|
+
# Retrieve info for all jobs for a given owner or owners.
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
def info_where_owner(owner, attrs: nil)
  owner_list = Array.wrap(owner).map(&:to_s).join(',')
  @psij.get_jobs(owner: owner_list).map { |job_data| parse_job_info(job_data) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
|
362
|
+
|
|
363
|
+
# Retrieve job status from the resource manager.
# @param id [#to_s] the id of the job
# @return [Status] the status of the job
def status(id)
  job_info = info(id.to_s)
  job_info.status
end
|
|
366
|
+
|
|
367
|
+
# The directive prefix (e.g. "#SBATCH") used in job scripts.
# PSI/J has no directive syntax of its own; returns nil.
def directive_prefix
end
|
|
369
|
+
|
|
370
|
+
private
|
|
371
|
+
# Map a PSI/J state name onto an OodCore status symbol.
# @param st [String] state name as reported by get_info.py
# @return [Symbol] a STATE_MAP value, or :undetermined for unknown states
def get_state(st)
  STATE_MAP.fetch(st, :undetermined)
end
|
|
374
|
+
|
|
375
|
+
# Convert a parsed job-data hash (one record from get_info.py) into an
# Info object.
# @param v [Hash] symbolized job attributes
# @return [Info]
def parse_job_info(v)
  # If v has no :resourcelist, fall back to a single unnamed node so that
  # Info#allocated_nodes is never empty.
  if v[:resourcelist].nil? || v[:resourcelist].empty?
    allocated_nodes = [ { name: "" } ]
  else
    allocated_nodes = v[:resourcelist]
  end
  # cpu_time stays nil when unreported; otherwise coerce to integer seconds.
  if v[:cpu_time].nil?
    cpu_time = nil
  else
    cpu_time = v[:cpu_time].to_i
  end
  Info.new(
    id: v[:native_id],
    status: get_state(v[:current_state]),
    allocated_nodes: allocated_nodes,
    submit_host: v[:submit_host],
    job_name: v[:name],
    job_owner: v[:owner],
    accounting_id: v[:account],
    # missing process_count is reported as 0 procs
    procs: v[:process_count] ? v[:process_count].to_i : 0,
    queue_name: v[:queue_name],
    wallclock_time: v[:wall_time],
    wallclock_limit: v[:duration],
    cpu_time: cpu_time,
    # timestamps arrive as strings; parse to Time, or nil when absent
    submission_time: v[:submission_time] ? Time.parse(v[:submission_time]): nil,
    dispatch_time: v[:dispatch_time] ? Time.parse(v[:dispatch_time]): nil,
    # keep the raw hash for callers that need scheduler-specific fields
    native: v
  )
end
|
|
406
|
+
|
|
407
|
+
end
|
|
408
|
+
end
|
|
409
|
+
end
|
|
410
|
+
end
|
|
@@ -62,16 +62,19 @@ class OodCore::Job::Adapters::Sge::Batch
|
|
|
62
62
|
# @param owner [#to_s] the owner or owner list
|
|
63
63
|
# @return [Array<OodCore::Job::Info>]
|
|
64
64
|
def get_all(owner: nil)
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|job_hash|
|
|
72
|
-
**post_process_qstat_job_hash(job_hash)
|
|
73
|
-
|
|
74
|
-
|
|
65
|
+
begin
|
|
66
|
+
listener = QstatXmlRListener.new
|
|
67
|
+
argv = ['qstat', '-r', '-xml']
|
|
68
|
+
argv.concat ['-u', owner] unless owner.nil?
|
|
69
|
+
REXML::Parsers::StreamParser.new(call(*argv), listener).parse
|
|
70
|
+
|
|
71
|
+
listener.parsed_jobs.map do |job_hash|
|
|
72
|
+
OodCore::Job::Info.new(**post_process_qstat_job_hash(job_hash))
|
|
73
|
+
end
|
|
74
|
+
rescue REXML::ParseException => e
|
|
75
|
+
warn("Error parsing response: #{e}")
|
|
76
|
+
[]
|
|
77
|
+
end
|
|
75
78
|
end
|
|
76
79
|
|
|
77
80
|
# Get OodCore::Job::Info for a job_id that may still be in the queue
|
|
@@ -25,7 +25,13 @@ module OodCore
|
|
|
25
25
|
bin_overrides = c.fetch(:bin_overrides, {})
|
|
26
26
|
submit_host = c.fetch(:submit_host, "")
|
|
27
27
|
strict_host_checking = c.fetch(:strict_host_checking, true)
|
|
28
|
-
|
|
28
|
+
id = c.fetch(:id, 'unknown')
|
|
29
|
+
|
|
30
|
+
slurm = Adapters::Slurm::Batch.new(
|
|
31
|
+
cluster: cluster, conf: conf, bin: bin, bin_overrides: bin_overrides,
|
|
32
|
+
submit_host: submit_host, strict_host_checking: strict_host_checking,
|
|
33
|
+
id: id
|
|
34
|
+
)
|
|
29
35
|
Adapters::Slurm.new(slurm: slurm)
|
|
30
36
|
end
|
|
31
37
|
end
|
|
@@ -84,6 +90,11 @@ module OodCore
|
|
|
84
90
|
# @return [Bool]; true if empty
|
|
85
91
|
attr_reader :strict_host_checking
|
|
86
92
|
|
|
93
|
+
# The ID of the cluster.
|
|
94
|
+
# @example oakley
|
|
95
|
+
# @return [String]; The ID of the cluster.
|
|
96
|
+
attr_reader :id
|
|
97
|
+
|
|
87
98
|
# The root exception class that all Slurm-specific exceptions inherit
|
|
88
99
|
# from
|
|
89
100
|
class Error < StandardError; end
|
|
@@ -97,13 +108,14 @@ module OodCore
|
|
|
97
108
|
# @param bin_overrides [#to_h] a hash of bin ovverides to be used in job
|
|
98
109
|
# @param submit_host [#to_s] Submits the job on a login node via ssh
|
|
99
110
|
# @param strict_host_checking [Bool] Whether to use strict host checking when ssh to submit_host
|
|
100
|
-
def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {}, submit_host: "", strict_host_checking: true)
|
|
111
|
+
def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {}, submit_host: "", strict_host_checking: true, id: 'unknown')
|
|
101
112
|
@cluster = cluster && cluster.to_s
|
|
102
113
|
@conf = conf && Pathname.new(conf.to_s)
|
|
103
114
|
@bin = Pathname.new(bin.to_s)
|
|
104
115
|
@bin_overrides = bin_overrides
|
|
105
116
|
@submit_host = submit_host.to_s
|
|
106
117
|
@strict_host_checking = strict_host_checking
|
|
118
|
+
@id = id.to_s
|
|
107
119
|
end
|
|
108
120
|
|
|
109
121
|
# Get a ClusterInfo object containing information about the given cluster
|
|
@@ -182,22 +194,29 @@ module OodCore
|
|
|
182
194
|
|
|
183
195
|
def accounts
|
|
184
196
|
user = Etc.getlogin
|
|
185
|
-
args = [
|
|
197
|
+
args = [
|
|
198
|
+
'-nP', 'show', 'users', 'withassoc', 'format=account,qos',
|
|
199
|
+
'where', "user=#{user}", "cluster=#{id}"
|
|
200
|
+
]
|
|
186
201
|
|
|
187
|
-
[].tap do |
|
|
202
|
+
[].tap do |associations|
|
|
188
203
|
call('sacctmgr', *args).each_line do |line|
|
|
189
|
-
acct,
|
|
204
|
+
acct, qos = line.split('|')
|
|
190
205
|
next if acct.nil? || acct.chomp.empty?
|
|
191
206
|
|
|
192
|
-
|
|
207
|
+
associations << {
|
|
193
208
|
name: acct,
|
|
194
209
|
qos: qos.to_s.chomp.split(','),
|
|
195
|
-
cluster: cluster,
|
|
196
|
-
queue: queue.to_s.empty? ? nil : queue
|
|
197
210
|
}
|
|
198
|
-
info = OodCore::Job::AccountInfo.new(**args) unless acct.nil?
|
|
199
|
-
accts << info unless acct.nil?
|
|
200
211
|
end
|
|
212
|
+
end.group_by do |x|
|
|
213
|
+
[x[:name], x[:cluster]]
|
|
214
|
+
end.map do |(name, cluster), assocs|
|
|
215
|
+
OodCore::Job::AccountInfo.new(
|
|
216
|
+
name: name,
|
|
217
|
+
cluster: cluster,
|
|
218
|
+
qos: (assocs.flat_map { |x| x[:qos] }).uniq,
|
|
219
|
+
)
|
|
201
220
|
end
|
|
202
221
|
end
|
|
203
222
|
|
|
@@ -430,7 +449,6 @@ module OodCore
|
|
|
430
449
|
end.to_h.symbolize_keys
|
|
431
450
|
|
|
432
451
|
hsh[:name] = hsh[:PartitionName]
|
|
433
|
-
hsh[:qos] = hsh[:QoS].to_s == 'N/A' ? [] : hsh[:QoS].to_s.split(',')
|
|
434
452
|
hsh[:allow_accounts] = if hsh[:AllowAccounts].nil? || hsh[:AllowAccounts].to_s == 'ALL'
|
|
435
453
|
nil
|
|
436
454
|
else
|
|
@@ -443,6 +461,16 @@ module OodCore
|
|
|
443
461
|
hsh[:DenyAccounts].nil? ? [] : hsh[:DenyAccounts].to_s.split(',')
|
|
444
462
|
end
|
|
445
463
|
|
|
464
|
+
hsh[:allow_qos] = if hsh[:AllowQos].nil? || hsh[:AllowQos].to_s == 'ALL'
|
|
465
|
+
[]
|
|
466
|
+
else
|
|
467
|
+
hsh[:AllowQos].to_s.split(',')
|
|
468
|
+
end
|
|
469
|
+
hsh[:deny_qos] = if !hsh[:allow_qos].empty?
|
|
470
|
+
[] # manpage says that AllowQos negates DenyQos
|
|
471
|
+
else
|
|
472
|
+
hsh[:DenyQos].nil? ? [] : hsh[:DenyQos].to_s.split(',')
|
|
473
|
+
end
|
|
446
474
|
hsh[:tres] = case hsh[:TRES]
|
|
447
475
|
when nil, '(null)', ''
|
|
448
476
|
{}
|
|
@@ -10,7 +10,8 @@ class OodCore::Job::QueueInfo
|
|
|
10
10
|
alias to_s name
|
|
11
11
|
|
|
12
12
|
# The QoSes associated with this queue
|
|
13
|
-
attr_reader :
|
|
13
|
+
attr_reader :allow_qos
|
|
14
|
+
attr_reader :deny_qos
|
|
14
15
|
|
|
15
16
|
# The accounts that are allowed to use this queue.
|
|
16
17
|
#
|
|
@@ -25,7 +26,8 @@ class OodCore::Job::QueueInfo
|
|
|
25
26
|
|
|
26
27
|
def initialize(**opts)
|
|
27
28
|
@name = opts.fetch(:name, 'unknown')
|
|
28
|
-
@
|
|
29
|
+
@allow_qos = opts.fetch(:allow_qos, [])
|
|
30
|
+
@deny_qos = opts.fetch(:deny_qos, [])
|
|
29
31
|
@tres = opts.fetch(:tres, {})
|
|
30
32
|
|
|
31
33
|
allow_accounts = opts.fetch(:allow_accounts, nil)
|
|
@@ -50,4 +52,8 @@ class OodCore::Job::QueueInfo
|
|
|
50
52
|
def gpu?
|
|
51
53
|
tres.keys.any? { |name| name.to_s.match?(%r{^gres/gpu($|:)}i) }
|
|
52
54
|
end
|
|
55
|
+
|
|
56
|
+
def allow_all_qos?
|
|
57
|
+
allow_qos.empty? && deny_qos.empty?
|
|
58
|
+
end
|
|
53
59
|
end
|
data/lib/ood_core/version.rb
CHANGED
data/ood_core.gemspec
CHANGED
|
@@ -24,6 +24,7 @@ Gem::Specification.new do |spec|
|
|
|
24
24
|
|
|
25
25
|
spec.add_runtime_dependency "ood_support", "~> 0.0.2"
|
|
26
26
|
spec.add_runtime_dependency "ffi", "~> 1.16.3"
|
|
27
|
+
spec.add_runtime_dependency "fog-openstack", "~> 1.1.5"
|
|
27
28
|
spec.add_runtime_dependency "rexml", "~> 3.2"
|
|
28
29
|
spec.add_development_dependency "bundler", "~> 2.1"
|
|
29
30
|
spec.add_development_dependency "rake", "~> 13.3.0"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ood_core
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.30.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Franz
|
|
@@ -10,7 +10,7 @@ authors:
|
|
|
10
10
|
autorequire:
|
|
11
11
|
bindir: exe
|
|
12
12
|
cert_chain: []
|
|
13
|
-
date: 2025-
|
|
13
|
+
date: 2025-11-25 00:00:00.000000000 Z
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
16
16
|
name: ood_support
|
|
@@ -40,6 +40,20 @@ dependencies:
|
|
|
40
40
|
- - "~>"
|
|
41
41
|
- !ruby/object:Gem::Version
|
|
42
42
|
version: 1.16.3
|
|
43
|
+
- !ruby/object:Gem::Dependency
|
|
44
|
+
name: fog-openstack
|
|
45
|
+
requirement: !ruby/object:Gem::Requirement
|
|
46
|
+
requirements:
|
|
47
|
+
- - "~>"
|
|
48
|
+
- !ruby/object:Gem::Version
|
|
49
|
+
version: 1.1.5
|
|
50
|
+
type: :runtime
|
|
51
|
+
prerelease: false
|
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
53
|
+
requirements:
|
|
54
|
+
- - "~>"
|
|
55
|
+
- !ruby/object:Gem::Version
|
|
56
|
+
version: 1.1.5
|
|
43
57
|
- !ruby/object:Gem::Dependency
|
|
44
58
|
name: rexml
|
|
45
59
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -207,9 +221,12 @@ files:
|
|
|
207
221
|
- lib/ood_core/job/adapters/coder.rb
|
|
208
222
|
- lib/ood_core/job/adapters/coder/batch.rb
|
|
209
223
|
- lib/ood_core/job/adapters/coder/coder_job_info.rb
|
|
224
|
+
- lib/ood_core/job/adapters/coder/credentials.rb
|
|
225
|
+
- lib/ood_core/job/adapters/coder/openstack_credentials.rb
|
|
210
226
|
- lib/ood_core/job/adapters/drmaa.rb
|
|
211
227
|
- lib/ood_core/job/adapters/fujitsu_tcs.rb
|
|
212
228
|
- lib/ood_core/job/adapters/helper.rb
|
|
229
|
+
- lib/ood_core/job/adapters/htcondor.rb
|
|
213
230
|
- lib/ood_core/job/adapters/kubernetes.rb
|
|
214
231
|
- lib/ood_core/job/adapters/kubernetes/batch.rb
|
|
215
232
|
- lib/ood_core/job/adapters/kubernetes/helper.rb
|
|
@@ -224,6 +241,12 @@ files:
|
|
|
224
241
|
- lib/ood_core/job/adapters/lsf/batch.rb
|
|
225
242
|
- lib/ood_core/job/adapters/lsf/helper.rb
|
|
226
243
|
- lib/ood_core/job/adapters/pbspro.rb
|
|
244
|
+
- lib/ood_core/job/adapters/psij.rb
|
|
245
|
+
- lib/ood_core/job/adapters/psij/delete.py
|
|
246
|
+
- lib/ood_core/job/adapters/psij/get_info.py
|
|
247
|
+
- lib/ood_core/job/adapters/psij/hold.py
|
|
248
|
+
- lib/ood_core/job/adapters/psij/release.py
|
|
249
|
+
- lib/ood_core/job/adapters/psij/submit.py
|
|
227
250
|
- lib/ood_core/job/adapters/sge.rb
|
|
228
251
|
- lib/ood_core/job/adapters/sge/batch.rb
|
|
229
252
|
- lib/ood_core/job/adapters/sge/helper.rb
|