ood_core 0.27.1 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ood_core/batch_connect/template.rb +3 -2
- data/lib/ood_core/batch_connect/templates/vnc.rb +1 -1
- data/lib/ood_core/batch_connect/templates/vnc_container.rb +1 -1
- data/lib/ood_core/job/adapter.rb +12 -0
- data/lib/ood_core/job/adapters/coder/batch.rb +170 -0
- data/lib/ood_core/job/adapters/coder/coder_job_info.rb +8 -0
- data/lib/ood_core/job/adapters/coder.rb +120 -0
- data/lib/ood_core/job/adapters/htcondor.rb +549 -0
- data/lib/ood_core/job/adapters/psij/delete.py +18 -0
- data/lib/ood_core/job/adapters/psij/get_info.py +55 -0
- data/lib/ood_core/job/adapters/psij/hold.py +18 -0
- data/lib/ood_core/job/adapters/psij/release.py +18 -0
- data/lib/ood_core/job/adapters/psij/submit.py +28 -0
- data/lib/ood_core/job/adapters/psij.rb +410 -0
- data/lib/ood_core/job/adapters/slurm.rb +133 -3
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +1 -1
- metadata +14 -4
@@ -0,0 +1,28 @@
|
|
1
|
+
import sys
|
2
|
+
from psij import Job, JobExecutor
|
3
|
+
from psij.serialize import JSONSerializer
|
4
|
+
from pathlib import Path
|
5
|
+
import json
|
6
|
+
import os
|
7
|
+
|
8
|
+
# Look up the PSI/J executor named by the first command-line argument.
executor_name = sys.argv[1]
ex = JobExecutor.get_instance(executor_name)

# Turn the JSON job description read from stdin into a PSI/J job spec.
serializer = JSONSerializer()
payload = json.loads(sys.stdin.read())
spec = serializer._to_spec(payload)

# For every executor except "local", prefix each custom attribute key with
# the executor's name ("<executor name>.<key>").
if executor_name != "local" and spec.attributes.custom_attributes is not None:
    custom = spec.attributes.custom_attributes
    spec.attributes.custom_attributes = {
        f"{ex.name}.{key}": custom[key] for key in custom.keys()
    }

# Expand a leading "~" in the executable path before building the job.
spec.executable = os.path.expanduser(spec.executable)
job = Job(spec)

# Submit the job and print its native id on stdout for the caller.
ex.submit(job)
print(job.native_id)
|
@@ -0,0 +1,410 @@
|
|
1
|
+
require "time"
|
2
|
+
require 'etc'
|
3
|
+
require "ood_core/refinements/hash_extensions"
|
4
|
+
require "ood_core/refinements/array_extensions"
|
5
|
+
require "ood_core/job/adapters/helper"
|
6
|
+
|
7
|
+
require 'json'
|
8
|
+
require 'pathname'
|
9
|
+
|
10
|
+
module OodCore
|
11
|
+
module Job
|
12
|
+
class Factory
|
13
|
+
|
14
|
+
using Refinements::HashExtensions
|
15
|
+
# Build the PSIJ adapter from a configuration.
# Every option below is read from the configuration (keys are symbolized first)
# and forwarded unchanged to Adapters::PSIJ::Batch.new.
# @param config [#to_h] the configuration for job adapter
# @option config [Object] :cluster (nil) the cluster name
# @option config [Object] :conf (nil) the configuration file path
# @option config [Object] :bin (nil) Path to PSIJ binaries
# @option config [#to_h] :bin_overrides ({}) Optional overrides to PSIJ executables
# @option config [Object] :submit_host ("") host used when wrapping commands
# @option config [Object] :strict_host_checking (true) forwarded with :submit_host
# @option config [Object] :executor (nil) the PSI/J executor name
# @option config [Object] :queue_name (nil) queue used when a script names none
# @return [Adapters::PSIJ] the adapter wrapping the new Batch object
def self.build_psij(config)
  settings = config.to_h.symbolize_keys
  batch_options = {
    cluster: settings.fetch(:cluster, nil),
    conf: settings.fetch(:conf, nil),
    bin: settings.fetch(:bin, nil),
    bin_overrides: settings.fetch(:bin_overrides, {}),
    submit_host: settings.fetch(:submit_host, ""),
    strict_host_checking: settings.fetch(:strict_host_checking, true),
    executor: settings.fetch(:executor, nil),
    queue_name: settings.fetch(:queue_name, nil)
  }
  Adapters::PSIJ.new(psij: Adapters::PSIJ::Batch.new(**batch_options))
end
|
32
|
+
end
|
33
|
+
|
34
|
+
module Adapters
|
35
|
+
class PSIJ < Adapter
|
36
|
+
using Refinements::HashExtensions
|
37
|
+
using Refinements::ArrayExtensions
|
38
|
+
class Batch
|
39
|
+
|
40
|
+
# Cluster name converted with #to_s (left as given when nil/false)
attr_reader :cluster
# Configuration path as a Pathname (left as given when nil/false)
attr_reader :conf
# Path to the PSIJ binaries, always a Pathname
attr_reader :bin
# Overrides for PSIJ executables, stored as given
attr_reader :bin_overrides
# Submit host converted with #to_s
attr_reader :submit_host
# Host-checking flag, stored as given
attr_reader :strict_host_checking
# PSI/J executor name, stored as given
attr_reader :executor
# Default queue name, stored as given
attr_reader :queue_name

# Raised by #call when the executed command does not exit successfully
class Error < StandardError; end
|
50
|
+
|
51
|
+
# Store the settings used to run the PSI/J helper scripts.
# @param cluster [#to_s, nil] cluster name; converted to a String when truthy
# @param bin [#to_s, nil] path to the PSIJ binaries; always wrapped in a Pathname
# @param conf [#to_s, nil] configuration path; wrapped in a Pathname when truthy
# @param bin_overrides [Object] overrides for PSIJ executables, stored as given
# @param submit_host [#to_s] submit host; converted to a String
# @param strict_host_checking [Object] stored as given
# @param executor [Object] PSI/J executor name, stored as given
# @param queue_name [Object] default queue name, stored as given
def initialize(cluster: nil, bin: nil, conf: nil, bin_overrides: {}, submit_host: "", strict_host_checking: true, executor: nil, queue_name: nil)
  # Values that are converted on the way in.
  @cluster = cluster ? cluster.to_s : cluster
  @conf = conf ? Pathname.new(conf.to_s) : conf
  @bin = Pathname.new(bin.to_s)
  @submit_host = submit_host.to_s

  # Values that are kept exactly as passed.
  @bin_overrides = bin_overrides
  @strict_host_checking = strict_host_checking
  @executor = executor
  @queue_name = queue_name
end
|
61
|
+
|
62
|
+
# Query job information by running the bundled psij/get_info.py script.
# @param id [#to_s] job id; surrounding whitespace is stripped before use
# @param owner [Object] accepted but not used by this method
#   NOTE(review): callers pass an owner expecting filtering - confirm whether
#   get_info.py supports an owner option and forward it if so.
# @return [Object] the parsed JSON output of the script, with symbolized names
def get_jobs(id: "", owner: nil)
  script_dir = Pathname.new(__FILE__).dirname.expand_path
  cli_args = ["--id=#{id.to_s.strip}", "--executor=#{executor}"]
  output = call("python3", script_dir.join("psij/get_info.py").to_s, *cli_args)
  JSON.parse(output, symbolize_names: true)
end
|
74
|
+
|
75
|
+
# Run the bundled psij/submit.py script.
# @param args [Array] command-line arguments appended after the script path
# @param chdir [Object] working directory forwarded to #call
# @param stdin [Object] data forwarded to #call as standard input
# @return [Object] whatever #call returns
def submit_job_path(args: [], chdir: nil, stdin: nil)
  script = Pathname.new(__FILE__).dirname.expand_path + "psij/submit.py"
  call("python3", script.to_s, *args, chdir: chdir, stdin: stdin)
end
|
79
|
+
|
80
|
+
# Run the bundled psij/delete.py script.
#
# Command failures (Error, raised by #call) are passed through untouched, the
# same way hold_job and release_job behave, so that the adapter's #delete can
# inspect the message (it ignores "Invalid job id specified"). Any other
# StandardError is still reported as a JobAdapterError.
# @param args [Array] command-line arguments appended after the script path
# @return [Object] whatever #call returns
# @raise [Error] if the command fails
# @raise [JobAdapterError] if anything else goes wrong while running it
def delete_job(args: [])
  delete_path = Pathname.new(__FILE__).dirname.expand_path.join("psij/delete.py").to_s
  call("python3", delete_path, *args)
rescue Error
  # Wrapping this here would hide it from the `rescue Batch::Error` in #delete.
  raise
rescue => e
  raise JobAdapterError, e.message
end
|
86
|
+
|
87
|
+
# Run the bundled psij/hold.py script.
# @param args [Array] command-line arguments appended after the script path
# @return [Object] whatever #call returns
def hold_job(args: [])
  script_dir = Pathname.new(__FILE__).dirname.expand_path
  call("python3", (script_dir + "psij/hold.py").to_s, *args)
end
|
91
|
+
|
92
|
+
# Run the bundled psij/release.py script.
# @param args [Array] command-line arguments appended after the script path
# @return [Object] whatever #call returns
def release_job(args: [])
  script_dir = Pathname.new(__FILE__).dirname.expand_path
  call("python3", (script_dir + "psij/release.py").to_s, *args)
end
|
96
|
+
|
97
|
+
# Format a number of seconds as "HH:MM:SS".
# Hours are not wrapped at 24 and each field is padded to at least two digits.
# @param time [Numeric] duration in seconds
# @return [String] the formatted duration
def seconds_to_duration(time)
  hours = time / 3600
  minutes = time / 60 % 60
  seconds = time % 60
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
|
100
|
+
|
101
|
+
private
|
102
|
+
# Run a psij helper command for this cluster and return its standard output.
#
# The command is first passed through Helper.bin_path (with bin and
# bin_overrides) and Helper.ssh_wrap (with submit_host and
# strict_host_checking) - presumably to resolve the executable and to run it
# on the submit host when one is configured; see Helper to confirm.
# @param cmd [#to_s] the command to run
# @param args [Array<#to_s>] arguments; each is converted with #to_s
# @param env [Hash] environment passed to the child process
# @param stdin [String] data written to the child's standard input
# @param chdir [#to_s, nil] working directory; "." when nil
# @return [String] the command's standard output
# @raise [Error] with the command's standard error if it exits unsuccessfully
def call(cmd, *args, env: {}, stdin: "", chdir: nil)
  resolved = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
  resolved, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, resolved, args, strict_host_checking)
  workdir = (chdir || ".").to_s
  stdout, stderr, status = Open3.capture3(env, resolved, *args.map(&:to_s), stdin_data: stdin, chdir: workdir)
  raise Error, stderr unless status.success?

  stdout
end
|
110
|
+
|
111
|
+
end
|
112
|
+
|
113
|
+
|
114
|
+
# Maps the job state names reported by the PSI/J helper scripts to OOD status
# symbols. States missing from this table are reported as :undetermined by
# get_state. Frozen so the shared lookup table cannot be modified at runtime.
# NOTE(review): other PSI/J states (e.g. failed or canceled jobs) are not
# listed and therefore end up as :undetermined - confirm this is intended.
STATE_MAP = {
  'NEW' => :undetermined,
  'QUEUED' => :queued,
  'HELD' => :queued_held,
  'ACTIVE' => :running,
  'COMPLETED' => :completed,
}.freeze
|
121
|
+
|
122
|
+
# Create the adapter around a Batch object.
# @param opts [#to_h] the options defining this adapter
# @option opts [Batch] :psij the object used to run the PSI/J helper scripts
# @raise [ArgumentError] if no :psij object is given
def initialize(opts = {})
  options = opts.to_h.symbolize_keys

  @psij = options.fetch(:psij) do
    raise ArgumentError, "No psij object specified. Missing argument: psij"
  end
end
|
127
|
+
|
128
|
+
|
129
|
+
# Submit a job script through PSI/J.
#
# The script body is saved as an executable file at ~/ood_tmp/run.sh, the OOD
# script attributes are converted to a PSI/J JobSpec description, and that
# description is handed as JSON to the Batch object's submit_job_path.
# The given script object is never modified.
#
# The dependency arguments (after, afterok, afternotok, afterany) are accepted
# for interface compatibility but are not used.
#
# Converted variables are shown as follows ("X" = not supported):
#   OOD                 | PSI/J(JobSpec)
#   --------------------+----------------------------------------------------
#   submit_as_hold      | X (not support)
#   rerunnable          | X
#   email_on_started    | X
#   email_on_terminated | X
#   args                | JobAttributes.custom_attributes
#   job_environment     | environment
#   workdir             | directory
#   email               | X
#   job_name            | name
#   shell_path          | #!<shell_path>
#   input_path          | stdin_path
#   output_path         | stdout_path
#   error_path          | stderr_path
#   reservation_id      | JobAttributes.reservation_id
#   queue_name          | JobAttributes.queue_name
#   priority            | X
#   start_time          | X
#   wall_time           | JobAttributes.duration
#   accounting_id       | JobAttributes.account or project_name(duplicated)
#   job_array_request   | X
#   qos                 | X
#   gpus_per_node       | ResourceSpec.gpu_cores_per_process
#   native              | executable (join script.content)
#   copy_environment    | inherit_environment
#   cores               | ResourceSpec.cpu_cores_per_process
#   after               | X
#   afterok             | X
#   afternotok          | X
#   afterany            | X
# OOD does not have following PSI/J's interfaces.
#   JobSpec class:
#     pre_launch, post_launch, launcher
#   ResourceSpec class:
#     node_count, process_count, processes_per_node, exclusive_node_use
#
# @param script [Object] the job script; read through the accessors listed above
# @param after [Array] unused
# @param afterok [Array] unused
# @param afternotok [Array] unused
# @param afterany [Array] unused
# @return [Object] the value returned by the Batch object's submit_job_path
# @raise [JobAdapterError] if the Batch object raises Batch::Error
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  native = script.native

  # Build the file content on a copy: appending to script.content in place
  # would grow the caller's script every time it is submitted.
  content = if script.shell_path.nil?
    native.nil? ? script.content : script.content + native.join("\n")
  else
    # NOTE(review): as before, the native values are not appended to the file
    # when a shell_path is given - confirm whether that asymmetry is intended.
    "#!#{script.shell_path}\n#{script.content}"
  end

  # NOTE(review): every submission of a user writes to this same path, so
  # concurrent submissions may overwrite each other's script - confirm.
  relative_path = "~/ood_tmp/run.sh"
  full_path = File.expand_path(relative_path)
  FileUtils.mkdir_p(File.dirname(full_path))
  File.write(full_path, content)
  File.chmod(0755, full_path)

  # convert OOD interfaces to PSI/J interfaces.
  params = {
    environment: script.job_environment,
    directory: script.workdir,
    name: script.job_name,
    executable: relative_path,
    stdin_path: script.input_path,
    stdout_path: script.output_path,
    stderr_path: script.error_path,
    inherit_environment: script.copy_environment,
    attributes: {
      queue_name: script.queue_name,
      reservation_id: script.reservation_id,
      account: script.accounting_id,
      duration: script.wall_time,
      custom_attributes: script.args
    },
    resources: {
      __version: 1,
      gpu_cores_per_process: script.gpus_per_node,
      cpu_cores_per_process: script.cores
    }
  }

  # Fall back to the configured queue and to files in the current directory.
  params[:attributes][:queue_name] = @psij.queue_name if params[:attributes][:queue_name].nil?
  params[:stdout_path] = File.join(Dir.pwd, "stdout.txt") if params[:stdout_path].nil?
  params[:stderr_path] = File.join(Dir.pwd, "stderr.txt") if params[:stderr_path].nil?

  # Add script.native to the custom attributes. `+` builds a new array so
  # that script.args itself is left untouched.
  raw_attributes = params[:attributes][:custom_attributes]
  if native && !native.empty?
    raw_attributes = raw_attributes.nil? ? native : raw_attributes + native
  end

  # Convert the flat argument array to a hash.
  #   ['--<name>', 'value']       -> {name: value}
  #   ['--<name1>', '--<name2>']  -> {name1: "", name2: ""}
  unless raw_attributes.nil?
    converted = {}
    last = raw_attributes.length - 1
    index = 0
    while index <= last
      token = raw_attributes[index]
      if token.start_with?("--")
        name = token[2..-1]
        has_hyphen = true
      elsif token.start_with?("-")
        name = token[1..-1]
        has_hyphen = true
      else
        name = token
        has_hyphen = false
      end

      if index == last || !has_hyphen || raw_attributes[index + 1].start_with?("-")
        # no following value (or the next item is another option): empty string
        converted[name] = ""
        index += 1
      else
        # the next item is this option's value; consume it as well
        converted[name] = raw_attributes[index + 1]
        index += 2
      end
    end
    raw_attributes = converted
  end
  params[:attributes][:custom_attributes] = raw_attributes

  # reject key which has nil value.
  params[:attributes] = params[:attributes].reject { |_, value| value.nil? }
  params[:resources] = params[:resources].reject { |_, value| value.nil? }
  data = params.reject { |_, value| value.nil? }

  # The executor name is the only command-line argument; the job description
  # itself travels as JSON on standard input.
  @psij.submit_job_path(args: [@psij.executor], chdir: script.workdir, stdin: JSON.generate(data))
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
278
|
+
|
279
|
+
# Not implemented for PSI/J: the body is empty, so this always returns nil.
# NOTE(review): confirm that callers of the adapter interface accept nil here.
def cluster_info
end
|
281
|
+
|
282
|
+
# Not implemented for PSI/J: the body is empty, so this always returns nil.
# NOTE(review): confirm whether callers expect a collection (e.g. an empty
# Array) rather than nil.
def accounts
end
|
284
|
+
|
285
|
+
# Delete the job with the given id.
# @param id [#to_s] the id of the job; surrounding whitespace is stripped
# @return [Object, nil] the result of the Batch object's delete_job, or nil
#   when the job id is reported as invalid
# @raise [JobAdapterError] for any other Batch::Error
def delete(id)
  cli_args = ["--id=#{id.to_s.strip}", "--executor=#{@psij.executor}"]
  @psij.delete_job(args: cli_args)
rescue Batch::Error => e
  # An unknown job id is treated as "nothing to do" rather than a failure.
  return nil if /Invalid job id specified/ =~ e.message

  raise JobAdapterError, e.message
end
|
296
|
+
|
297
|
+
# Put the job with the given id on hold.
# @param id [#to_s] the id of the job; surrounding whitespace is stripped
# @return [Object, nil] the result of the Batch object's hold_job, or nil
#   when the job id is reported as invalid
# @raise [JobAdapterError] for any other Batch::Error
def hold(id)
  cli_args = ["--id=#{id.to_s.strip}", "--executor=#{@psij.executor}"]
  @psij.hold_job(args: cli_args)
rescue Batch::Error => e
  # An unknown job id is treated as "nothing to do" rather than a failure.
  return nil if /Invalid job id specified/ =~ e.message

  raise JobAdapterError, e.message
end
|
308
|
+
|
309
|
+
# Release the held job with the given id.
# @param id [#to_s] the id of the job; surrounding whitespace is stripped
# @return [Object, nil] the result of the Batch object's release_job, or nil
#   when the job id is reported as invalid
# @raise [JobAdapterError] for any other Batch::Error
def release(id)
  cli_args = ["--id=#{id.to_s.strip}", "--executor=#{@psij.executor}"]
  @psij.release_job(args: cli_args)
rescue Batch::Error => e
  # An unknown job id is treated as "nothing to do" rather than a failure.
  return nil if /Invalid job id specified/ =~ e.message

  raise JobAdapterError, e.message
end
|
320
|
+
|
321
|
+
|
322
|
+
# Retrieve information about a single job.
#
# A job that cannot be found - either no entries are returned or the id is
# reported as invalid - is described as completed.
# @param id [#to_s] the id of the job
# @return [Info] the first matching job, or a completed placeholder
# @raise [JobAdapterError] for any Batch::Error other than an invalid job id
def info(id)
  id = id.to_s

  found = @psij.get_jobs(id: id).map { |job| parse_job_info(job) }
  found.empty? ? Info.new(id: id, status: :completed) : found.first
rescue Batch::Error => e
  raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message

  # set completed status if can't find job id
  Info.new(id: id, status: :completed)
end
|
345
|
+
|
346
|
+
# Retrieve information about every job returned by the Batch object.
# @param attrs [Object] accepted but not used by this method
# @return [Array<Info>] one entry per job
# @raise [JobAdapterError] if the Batch object raises Batch::Error
def info_all(attrs: nil)
  jobs = @psij.get_jobs
  jobs.map { |job| parse_job_info(job) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
353
|
+
|
354
|
+
# Retrieve information about the jobs of the given owner(s).
# The owners are converted to strings and joined with commas before being
# handed to the Batch object.
# NOTE(review): Batch#get_jobs accepts an owner keyword but does not appear
# to use it, so the result may not actually be filtered - confirm.
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
# @param attrs [Object] accepted but not used by this method
# @return [Array<Info>] one entry per job
# @raise [JobAdapterError] if the Batch object raises Batch::Error
def info_where_owner(owner, attrs: nil)
  owner_list = Array.wrap(owner).map(&:to_s).join(',')
  jobs = @psij.get_jobs(owner: owner_list)
  jobs.map { |job| parse_job_info(job) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
362
|
+
|
363
|
+
# Retrieve the status of a single job.
# @param id [#to_s] the id of the job
# @return [Object] the status of the object returned by #info
def status(id)
  job = info(id.to_s)
  job.status
end
|
366
|
+
|
367
|
+
# No directive prefix is defined for PSI/J: the body is empty, so this always
# returns nil.
def directive_prefix
end
|
369
|
+
|
370
|
+
private
|
371
|
+
# Translate a job state name into an OOD status symbol.
# @param st [Object] the state name to look up in STATE_MAP
# @return [Symbol] the mapped status, or :undetermined for unknown states
def get_state(st)
  STATE_MAP.fetch(st) { :undetermined }
end
|
374
|
+
|
375
|
+
# Build an Info object from one job hash.
# @param v [Hash] job data keyed by symbols (:native_id, :current_state,
#   :resourcelist, :cpu_time, :process_count, :submission_time, ...)
# @return [Info] the job information; the input hash is kept as `native`
def parse_job_info(v)
  # A missing or empty resource list becomes a single unnamed node.
  nodes = v[:resourcelist]
  nodes = [ { name: "" } ] if nodes.nil? || nodes.empty?

  # Numeric fields: cpu_time stays nil when absent, procs defaults to 0.
  cpu_seconds = v[:cpu_time].nil? ? nil : v[:cpu_time].to_i
  process_count = v[:process_count] ? v[:process_count].to_i : 0

  # Timestamps are parsed only when present.
  submitted_at = v[:submission_time] ? Time.parse(v[:submission_time]) : nil
  dispatched_at = v[:dispatch_time] ? Time.parse(v[:dispatch_time]) : nil

  Info.new(
    id: v[:native_id],
    status: get_state(v[:current_state]),
    allocated_nodes: nodes,
    submit_host: v[:submit_host],
    job_name: v[:name],
    job_owner: v[:owner],
    accounting_id: v[:account],
    procs: process_count,
    queue_name: v[:queue_name],
    wallclock_time: v[:wall_time],
    wallclock_limit: v[:duration],
    cpu_time: cpu_seconds,
    submission_time: submitted_at,
    dispatch_time: dispatched_at,
    native: v
  )
end
|
406
|
+
|
407
|
+
end
|
408
|
+
end
|
409
|
+
end
|
410
|
+
end
|
@@ -323,6 +323,46 @@ module OodCore
|
|
323
323
|
}
|
324
324
|
end
|
325
325
|
|
326
|
+
# Job info fields requested from a formatted `sacct` call.
# Keys are the names used in the parsed result; values are the matching
# `sacct` field names. The order of the entries matters: it is used both for
# the requested column list and for pairing the columns with the keys.
# @return [Hash{Symbol=>String}] result key => sacct field name
def sacct_info_fields
  {
    user: "User",               # The user name of the user who ran the job.
    group_name: "Group",        # The group name of the user who ran the job.
    job_id: "JobId",            # Job Id for reference
    job_name: "JobName",        # The name of the job or job step
    elapsed: "Elapsed",         # The job's elapsed time.
    req_mem: "ReqMem",          # Minimum required memory for the job
    alloc_cpus: "AllocCPUS",    # Count of allocated CPUs
    req_cpus: "ReqCPUS",        # Number of requested CPUs.
    time_limit: "Timelimit",    # What the timelimit was/is for the job
    state: "State",             # Displays the job status, or state
    total_cpu: "TotalCPU",      # The sum of the SystemCPU and UserCPU time used by the job or job step
    max_rss: "MaxRSS",          # Maximum resident set size of all tasks in job.
    partition: "Partition",     # Identifies the partition on which the job ran.
    submit_time: "Submit",      # The time the job was submitted. In the same format as End.
    start_time: "Start",        # Initiation time of the job. In the same format as End.
    end: "End",                 # Termination time of the job.
    gres: "ReqTRES"             # Trackable resources. These are the minimum resource counts requested by the job/step at submission time.
  }
end
|
365
|
+
|
326
366
|
def queues
|
327
367
|
info_raw = call('scontrol', 'show', 'part', '-o')
|
328
368
|
|
@@ -357,6 +397,31 @@ module OodCore
|
|
357
397
|
end.compact
|
358
398
|
end
|
359
399
|
|
400
|
+
# Query historic job data with `sacct` (https://slurm.schedmd.com/sacct.html).
# @param job_ids [Array<#to_s>] job ids to filter by; no filter when empty
# @param states [Array<#to_s>] job states to filter by; no filter when empty
# @param from [Object, nil] start of the time window (passed to -S) when truthy
# @param to [Object, nil] end of the time window (passed to -E) when truthy
# @param show_steps [Boolean] when falsy, --allocations is passed so that only
#   job-level statistics are shown, not taking steps into consideration
# @return [Array<Hash>] one hash per output line, keyed by the keys of
#   sacct_info_fields; blank values are replaced with nil
def sacct_info(job_ids, states, from, to, show_steps)
  fields = sacct_info_fields

  # Delimited output (-P) without header (-n), memory units in GB.
  args = ['-P', '--delimiter', UNIT_SEPARATOR, '-n', '--units', 'G']
  args << '--allocations' unless show_steps
  args.push('-o', fields.values.join(','))                    # Required data
  args.push('--state', states.join(',')) unless states.empty? # Filter by these states
  args.push('-j', job_ids.join(',')) unless job_ids.empty?    # Filter by these job ids
  args.push('-S', from) if from                               # Filter from this date
  args.push('-E', to) if to                                   # Filter until this date

  call('sacct', *args).each_line.each_with_object([]) do |line, jobs|
    # Replace blank values with nil
    values = line.strip.split(UNIT_SEPARATOR).map { |value| value.to_s.empty? ? nil : value }
    jobs << fields.keys.zip(values).to_h unless values.empty?
  end
end
|
424
|
+
|
360
425
|
private
|
361
426
|
def str_to_queue_info(line)
|
362
427
|
hsh = line.split(' ').map do |token|
|
@@ -372,7 +437,11 @@ module OodCore
|
|
372
437
|
hsh[:AllowAccounts].to_s.split(',')
|
373
438
|
end
|
374
439
|
|
375
|
-
hsh[:deny_accounts] = hsh[:
|
440
|
+
hsh[:deny_accounts] = if !hsh[:allow_accounts].nil?
|
441
|
+
[] # manpage says AllowAccounts negates DenyAccounts
|
442
|
+
else
|
443
|
+
hsh[:DenyAccounts].nil? ? [] : hsh[:DenyAccounts].to_s.split(',')
|
444
|
+
end
|
376
445
|
|
377
446
|
hsh[:tres] = case hsh[:TRES]
|
378
447
|
when nil, '(null)', ''
|
@@ -466,8 +535,23 @@ module OodCore
|
|
466
535
|
'SE' => :completed, # SPECIAL_EXIT
|
467
536
|
'ST' => :running, # STOPPED
|
468
537
|
'S' => :suspended, # SUSPENDED
|
469
|
-
'TO' => :completed,
|
470
|
-
'OOM' => :completed
|
538
|
+
'TO' => :completed, # TIMEOUT
|
539
|
+
'OOM' => :completed, # OUT_OF_MEMORY
|
540
|
+
|
541
|
+
'BOOT_FAIL' => :completed,
|
542
|
+
'CANCELED' => :completed,
|
543
|
+
'COMPLETED' => :completed,
|
544
|
+
'DEADLINE' => :completed,
|
545
|
+
'FAILED' => :completed,
|
546
|
+
'NODE_FAIL' => :completed,
|
547
|
+
'OUT_OF_MEMORY' => :completed,
|
548
|
+
'PENDING' => :queued,
|
549
|
+
'PREEMPTED' => :completed,
|
550
|
+
'RUNNING' => :running,
|
551
|
+
'REQUEUED' => :queued,
|
552
|
+
'REVOKED' => :completed,
|
553
|
+
'SUSPENDED' => :suspended,
|
554
|
+
'TIMEOUT' => :completed,
|
471
555
|
}
|
472
556
|
|
473
557
|
# @api private
|
@@ -586,6 +670,45 @@ module OodCore
|
|
586
670
|
raise JobAdapterError, e.message
|
587
671
|
end
|
588
672
|
|
673
|
+
# Retrieve historic info for all completed jobs from the resource manager.
#
# Known options:
#   job_ids [Array<#to_s>] optional list of job ids to filter the results.
#   states [Array<#to_s>] optional list of job state codes.
#     Selects jobs based on their state during the time period given.
#   from [#to_s] optional date string to filter jobs in any state after the specified time.
#     If states are provided, filter jobs in these states after this period
#   to [#to_s] optional date string to filter jobs in any state before the specified time.
#     If states are provided, filter jobs in these states before this period.
#   show_steps [#Boolean] optional boolean to filter job steps from the results.
#
# @param opts [Hash] the options listed above, looked up by symbol key
# @return [Array<Info>] information describing submitted jobs
# @see Adapter#info_historic
def info_historic(opts: {})
  records = @slurm.sacct_info(
    opts.fetch(:job_ids, []),
    opts.fetch(:states, []),
    opts.fetch(:from, nil),
    opts.fetch(:to, nil),
    opts.fetch(:show_steps, false)
  )

  records.map do |record|
    Info.new(
      id: record[:job_id],
      status: get_state(record[:state]),
      job_name: record[:job_name],
      job_owner: record[:user],
      procs: record[:alloc_cpus],
      queue_name: record[:partition],
      wallclock_time: duration_in_seconds(record[:elapsed]),
      wallclock_limit: duration_in_seconds(record[:time_limit]),
      cpu_time: duration_in_seconds(record[:total_cpu]),
      submission_time: parse_time(record[:submit_time]),
      dispatch_time: parse_time(record[:start_time]),
      native: record,
      gpus: self.class.gpus_from_gres(record[:gres])
    )
  end
end
|
711
|
+
|
589
712
|
# Retrieve job info from the resource manager
|
590
713
|
# @param id [#to_s] the id of the job
|
591
714
|
# @raise [JobAdapterError] if something goes wrong getting job info
|
@@ -718,6 +841,13 @@ module OodCore
|
|
718
841
|
"%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
|
719
842
|
end
|
720
843
|
|
844
|
+
# Parse date time string ignoring unknown values returned by Slurm.
#
# nil is handled as well as blank strings: the sacct output parser replaces
# blank fields with nil, and calling #empty? on nil would raise.
# @param date_time [#to_s, nil] the date time text reported by Slurm
# @return [Time, nil] the parsed time, or nil when the value is missing or is
#   one of the placeholders N/A, NONE or UNKNOWN (case-insensitive)
def parse_time(date_time)
  text = date_time.to_s
  return nil if text.empty? || %w[N/A NONE UNKNOWN].include?(text.upcase)

  Time.parse(text)
end
|
850
|
+
|
721
851
|
# Convert host list string to individual nodes
|
722
852
|
# "em082"
|
723
853
|
# "em[014,055-056,161]"
|
data/lib/ood_core/version.rb
CHANGED
data/ood_core.gemspec
CHANGED
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_runtime_dependency "ffi", "~> 1.16.3"
|
27
27
|
spec.add_runtime_dependency "rexml", "~> 3.2"
|
28
28
|
spec.add_development_dependency "bundler", "~> 2.1"
|
29
|
-
spec.add_development_dependency "rake", "~> 13.
|
29
|
+
spec.add_development_dependency "rake", "~> 13.3.0"
|
30
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
31
31
|
spec.add_development_dependency "pry", "~> 0.10"
|
32
32
|
spec.add_development_dependency "timecop", "~> 0.8"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ood_core
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.29.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Franz
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date: 2025-
|
13
|
+
date: 2025-08-08 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: ood_support
|
@@ -74,14 +74,14 @@ dependencies:
|
|
74
74
|
requirements:
|
75
75
|
- - "~>"
|
76
76
|
- !ruby/object:Gem::Version
|
77
|
-
version: 13.
|
77
|
+
version: 13.3.0
|
78
78
|
type: :development
|
79
79
|
prerelease: false
|
80
80
|
version_requirements: !ruby/object:Gem::Requirement
|
81
81
|
requirements:
|
82
82
|
- - "~>"
|
83
83
|
- !ruby/object:Gem::Version
|
84
|
-
version: 13.
|
84
|
+
version: 13.3.0
|
85
85
|
- !ruby/object:Gem::Dependency
|
86
86
|
name: rspec
|
87
87
|
requirement: !ruby/object:Gem::Requirement
|
@@ -204,9 +204,13 @@ files:
|
|
204
204
|
- lib/ood_core/job/account_info.rb
|
205
205
|
- lib/ood_core/job/adapter.rb
|
206
206
|
- lib/ood_core/job/adapters/ccq.rb
|
207
|
+
- lib/ood_core/job/adapters/coder.rb
|
208
|
+
- lib/ood_core/job/adapters/coder/batch.rb
|
209
|
+
- lib/ood_core/job/adapters/coder/coder_job_info.rb
|
207
210
|
- lib/ood_core/job/adapters/drmaa.rb
|
208
211
|
- lib/ood_core/job/adapters/fujitsu_tcs.rb
|
209
212
|
- lib/ood_core/job/adapters/helper.rb
|
213
|
+
- lib/ood_core/job/adapters/htcondor.rb
|
210
214
|
- lib/ood_core/job/adapters/kubernetes.rb
|
211
215
|
- lib/ood_core/job/adapters/kubernetes/batch.rb
|
212
216
|
- lib/ood_core/job/adapters/kubernetes/helper.rb
|
@@ -221,6 +225,12 @@ files:
|
|
221
225
|
- lib/ood_core/job/adapters/lsf/batch.rb
|
222
226
|
- lib/ood_core/job/adapters/lsf/helper.rb
|
223
227
|
- lib/ood_core/job/adapters/pbspro.rb
|
228
|
+
- lib/ood_core/job/adapters/psij.rb
|
229
|
+
- lib/ood_core/job/adapters/psij/delete.py
|
230
|
+
- lib/ood_core/job/adapters/psij/get_info.py
|
231
|
+
- lib/ood_core/job/adapters/psij/hold.py
|
232
|
+
- lib/ood_core/job/adapters/psij/release.py
|
233
|
+
- lib/ood_core/job/adapters/psij/submit.py
|
224
234
|
- lib/ood_core/job/adapters/sge.rb
|
225
235
|
- lib/ood_core/job/adapters/sge/batch.rb
|
226
236
|
- lib/ood_core/job/adapters/sge/helper.rb
|