ood_core 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +50 -0
- data/.rspec +2 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +60 -0
- data/Rakefile +6 -0
- data/bin/console +11 -0
- data/bin/setup +8 -0
- data/lib/ood_core.rb +34 -0
- data/lib/ood_core/acl/adapter.rb +17 -0
- data/lib/ood_core/acl/adapters/group.rb +59 -0
- data/lib/ood_core/acl/factory.rb +41 -0
- data/lib/ood_core/cluster.rb +143 -0
- data/lib/ood_core/clusters.rb +114 -0
- data/lib/ood_core/errors.rb +19 -0
- data/lib/ood_core/job/adapter.rb +89 -0
- data/lib/ood_core/job/adapters/lsf.rb +193 -0
- data/lib/ood_core/job/adapters/lsf/batch.rb +160 -0
- data/lib/ood_core/job/adapters/lsf/helper.rb +26 -0
- data/lib/ood_core/job/adapters/slurm.rb +470 -0
- data/lib/ood_core/job/adapters/torque.rb +274 -0
- data/lib/ood_core/job/factory.rb +41 -0
- data/lib/ood_core/job/info.rb +141 -0
- data/lib/ood_core/job/node_info.rb +47 -0
- data/lib/ood_core/job/node_request.rb +51 -0
- data/lib/ood_core/job/script.rb +235 -0
- data/lib/ood_core/job/status.rb +128 -0
- data/lib/ood_core/refinements/array_extensions.rb +22 -0
- data/lib/ood_core/refinements/hash_extensions.rb +25 -0
- data/lib/ood_core/version.rb +4 -0
- data/ood_core.gemspec +32 -0
- metadata +182 -0
@@ -0,0 +1,26 @@
|
|
1
|
+
# Object used for simplified communication with a LSF batch server
#
# @api private
class OodCore::Job::Adapters::Lsf::Helper
  # Convert a string in the format "03/31-14:46:42" to a Time object.
  #
  # Assumes the timestamp occurred in the past; it must NOT be used for
  # times in the future (such as an estimated FINISH_TIME).
  #
  # @param t [String, nil] the timestamp to parse
  # @param ignore_errors [Boolean] when true, return nil instead of raising
  #   on unparseable input
  # @return [Time, nil] the parsed time, or nil for blank/"-"/bad input
  def parse_past_time(t, ignore_errors: false)
    return nil if t.nil? || t.empty? || t == "-"

    now = Time.now
    parsed = Time.parse("#{now.year}/#{t}")

    # A parsed month later than the current month means the job started
    # before the year rolled over, so re-parse against last year.
    parsed = Time.parse("#{now.year - 1}/#{t}") if parsed.month > now.month

    parsed
  rescue ArgumentError => e
    raise e unless ignore_errors

    #TODO: warn via logger

    nil
  end
end
|
@@ -0,0 +1,470 @@
|
|
1
|
+
require "time"
|
2
|
+
require "ood_core/refinements/hash_extensions"
|
3
|
+
|
4
|
+
module OodCore
|
5
|
+
module Job
|
6
|
+
class Factory
|
7
|
+
using Refinements::HashExtensions
|
8
|
+
|
9
|
+
# Build the Slurm adapter from a configuration
|
10
|
+
# @param config [#to_h] the configuration for job adapter
|
11
|
+
# @option config [#to_s] :cluster ('') The cluster to communicate with
|
12
|
+
# @option config [#to_s] :bin ('') Path to slurm client binaries
|
13
|
+
def self.build_slurm(config)
|
14
|
+
c = config.to_h.symbolize_keys
|
15
|
+
cluster = c.fetch(:cluster, "").to_s
|
16
|
+
bin = c.fetch(:bin, "").to_s
|
17
|
+
slurm = Adapters::Slurm::Batch.new(cluster: cluster, bin: bin)
|
18
|
+
Adapters::Slurm.new(slurm: slurm)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
module Adapters
|
23
|
+
# An adapter object that describes the communication with a Slurm
|
24
|
+
# resource manager for job management.
|
25
|
+
class Slurm < Adapter
|
26
|
+
using Refinements::HashExtensions
|
27
|
+
|
28
|
+
# Object used for simplified communication with a Slurm batch server
# @api private
class Batch
  # The cluster of the Slurm batch server
  # @example CHPC's kingspeak cluster
  #   my_batch.cluster #=> "kingspeak"
  # @return [String] the cluster name
  attr_reader :cluster

  # The path to the Slurm client installation binaries
  # @example For Slurm 10.0.0
  #   my_batch.bin.to_s #=> "/usr/local/slurm/10.0.0/bin
  # @return [Pathname] path to slurm binaries
  attr_reader :bin

  # The root exception class that all Slurm-specific exceptions inherit
  # from
  class Error < StandardError; end

  # @param cluster [#to_s] the cluster name
  # @param bin [#to_s] path to slurm installation binaries
  def initialize(cluster: "", bin: "")
    @cluster = cluster.to_s
    @bin = Pathname.new(bin.to_s)
  end

  # Get a list of hashes detailing each of the jobs on the batch server
  # @example Status info for all jobs
  #   my_batch.get_jobs
  #   #=> [{ account: "account", job_id: "my_job", ... }, ...]
  # @param id [#to_s] the id of the job
  # @param filters [Array<Symbol>] list of attributes to filter on
  # @raise [Error] if `squeue` command exited unsuccessfully
  # @return [Array<Hash>] list of details for jobs
  def get_jobs(id: "", filters: [])
    # Record separator; "|" cannot be used because the FEATURES column
    # may contain it.
    delim = "\x1F"
    options = filters.empty? ? fields : fields.slice(*filters)
    args = ["--all", "--states=all", "--noconvert"]
    args.concat(["-o", options.values.join(delim)])
    args.concat(["-j", id.to_s]) unless id.to_s.empty?

    rows = call("squeue", *args).split("\n").map(&:strip)

    # Skip the header row; when a cluster is targeted (`-M`) squeue
    # prepends an extra cluster line, so skip two.
    header_rows = cluster.empty? ? 1 : 2
    rows.drop(header_rows).map do |row|
      options.keys.zip(row.split(delim)).to_h
    end
  end

  # Put a specified job on hold
  # @example Put job "1234" on hold
  #   my_batch.hold_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `scontrol` command exited unsuccessfully
  # @return [void]
  def hold_job(id)
    call("scontrol", "hold", id.to_s)
  end

  # Release a specified job that is on hold
  # @example Release job "1234" from on hold
  #   my_batch.release_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `scontrol` command exited unsuccessfully
  # @return [void]
  def release_job(id)
    call("scontrol", "release", id.to_s)
  end

  # Delete a specified job from batch server
  # @example Delete job "1234"
  #   my_batch.delete_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `scancel` command exited unsuccessfully
  # @return [void]
  def delete_job(id)
    call("scancel", id.to_s)
  end

  # Submit a script expanded as a string to the batch server
  # @param str [#to_s] script as a string
  # @param args [Array<#to_s>] arguments passed to `sbatch` command
  # @param env [Hash{#to_s => #to_s}] environment variables set
  # @raise [Error] if `sbatch` command exited unsuccessfully
  # @return [String] the id of the job that was created
  def submit_string(str, args: [], env: {})
    sbatch_args = args.map(&:to_s) + ["--parsable"]
    # Stringify env pairs; SBATCH_EXPORT=NONE keeps the caller's
    # environment from leaking into the job unless explicitly exported.
    stringified = env.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
    full_env = { "SBATCH_EXPORT" => "NONE" }.merge(stringified)
    call("sbatch", *sbatch_args, env: full_env, stdin: str.to_s).strip.split(";").first
  end

  private
    # Call a forked Slurm command for a given cluster
    def call(cmd, *args, env: {}, stdin: "")
      exe = bin.join(cmd.to_s).to_s
      argv = args.map(&:to_s)
      argv.concat(["-M", cluster]) unless cluster.empty?
      out, err, status = Open3.capture3(env.to_h, exe, *argv, stdin_data: stdin.to_s)
      raise(Error, err) unless status.success?
      out
    end

    # Fields requested from a formatted `squeue` call, keyed by the
    # attribute name used in the returned job hashes.
    def fields
      {
        account: "%a",
        job_id: "%A",
        gres: "%b",
        exec_host: "%B",
        min_cpus: "%c",
        cpus: "%C",
        min_tmp_disk: "%d",
        nodes: "%D",
        end_time: "%e",
        dependency: "%E",
        features: "%f",
        array_job_id: "%F",
        group_name: "%g",
        group_id: "%G",
        over_subscribe: "%h",
        sockets_per_node: "%H",
        array_job_task_id: "%i",
        cores_per_socket: "%I",
        job_name: "%j",
        threads_per_core: "%J",
        comment: "%k",
        array_task_id: "%K",
        time_limit: "%l",
        time_left: "%L",
        min_memory: "%m",
        time_used: "%M",
        req_node: "%n",
        node_list: "%N",
        command: "%o",
        contiguous: "%O",
        qos: "%q",
        partition: "%P",
        priority: "%Q",
        reason: "%r",
        start_time: "%S",
        state_compact: "%t",
        state: "%T",
        user: "%u",
        user_id: "%U",
        reservation: "%v",
        submit_time: "%V",
        wckey: "%w",
        licenses: "%W",
        excluded_nodes: "%x",
        core_specialization: "%X",
        nice: "%y",
        scheduled_nodes: "%Y",
        sockets_cores_threads: "%z",
        work_dir: "%Z"
      }
    end
end
|
196
|
+
|
197
|
+
# Mapping of state codes for Slurm (`squeue` %t output) to OodCore
# job states. Frozen so the shared constant cannot be mutated at
# runtime; it is only ever read via `fetch`.
STATE_MAP = {
  'BF' => :completed,  # BOOT_FAIL
  'CA' => :completed,  # CANCELLED
  'CD' => :completed,  # COMPLETED
  'CF' => :queued,     # CONFIGURING
  'CG' => :running,    # COMPLETING
  'F'  => :completed,  # FAILED
  'NF' => :completed,  # NODE_FAIL
  'PD' => :queued,     # PENDING
  'PR' => :suspended,  # PREEMPTED
  'RV' => :completed,  # REVOKED
  'R'  => :running,    # RUNNING
  'SE' => :completed,  # SPECIAL_EXIT
  'ST' => :running,    # STOPPED
  'S'  => :suspended,  # SUSPENDED
  'TO' => :completed   # TIMEOUT
}.freeze
|
215
|
+
|
216
|
+
# @api private
# @param opts [#to_h] the options defining this adapter
# @option opts [Batch] :slurm The Slurm batch object
# @see Factory.build_slurm
def initialize(opts = {})
  @slurm = opts.to_h.symbolize_keys.fetch(:slurm) do
    raise ArgumentError, "No slurm object specified. Missing argument: slurm"
  end
end
|
225
|
+
|
226
|
+
# Submit a job with the attributes defined in the job template instance
# @param script [Script] script object that describes the script and
#   attributes for the submitted job
# @param after [#to_s, Array<#to_s>] this job may be scheduled for
#   execution at any point after dependent jobs have started execution
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with no errors
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with errors
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
#   execution after dependent jobs have terminated
# @raise [JobAdapterError] if something goes wrong submitting a job
# @return [String] the job id returned after successfully submitting a
#   job
# @see Adapter#submit
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  after      = Array(after).map(&:to_s)
  afterok    = Array(afterok).map(&:to_s)
  afternotok = Array(afternotok).map(&:to_s)
  afterany   = Array(afterany).map(&:to_s)

  # Build up sbatch options from the script attributes
  args = []
  # ignore args, don't know how to do this for slurm
  args << "-H" if script.submit_as_hold
  unless script.rerunnable.nil?
    args << (script.rerunnable ? "--requeue" : "--no-requeue")
  end
  args.push("-D", script.workdir.to_s)            unless script.workdir.nil?
  args.push("--mail-user", script.email.join(",")) unless script.email.nil?

  # Map the started/terminated email flags onto a single --mail-type
  # value; nothing is passed when both flags are unset (nil).
  mail_type =
    if script.email_on_started && script.email_on_terminated
      "ALL"
    elsif script.email_on_started
      "BEGIN"
    elsif script.email_on_terminated
      "END"
    elsif script.email_on_started == false && script.email_on_terminated == false
      "NONE"
    end
  args.push("--mail-type", mail_type) if mail_type

  args.push("-J", script.job_name)    unless script.job_name.nil?
  args.push("-i", script.input_path)  unless script.input_path.nil?
  args.push("-o", script.output_path) unless script.output_path.nil?
  args.push("-e", script.error_path)  unless script.error_path.nil?
  # ignore join_files, by default it joins stdout and stderr unless
  # error_path is specified
  args.push("--reservation", script.reservation_id) unless script.reservation_id.nil?
  args.push("-p", script.queue_name)        unless script.queue_name.nil?
  args.push("--priority", script.priority)  unless script.priority.nil?
  args.push("--begin", script.start_time.localtime.strftime("%C%y-%m-%dT%H:%M:%S")) unless script.start_time.nil?
  args.push("-A", script.accounting_id) unless script.accounting_id.nil?
  args.push("--mem", "#{script.min_phys_memory}K") unless script.min_phys_memory.nil?
  args.push("-t", seconds_to_duration(script.wall_time)) unless script.wall_time.nil?
  # ignore nodes, don't know how to do this for slurm

  # Assemble the dependency specification
  depend = []
  depend << "after:#{after.join(":")}"           unless after.empty?
  depend << "afterok:#{afterok.join(":")}"       unless afterok.empty?
  depend << "afternotok:#{afternotok.join(":")}" unless afternotok.empty?
  depend << "afterany:#{afterany.join(":")}"     unless afterany.empty?
  args.push("-d", depend.join(",")) unless depend.empty?

  # Environment variables passed through to the job
  env = script.job_environment || {}
  args.push("--export", env.keys.join(",")) unless env.empty?

  # Append any scheduler-native options verbatim
  args.concat(script.native) if script.native

  # Submit job
  @slurm.submit_string(script.content, args: args, env: env)
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
298
|
+
|
299
|
+
# Retrieve info for all jobs from the resource manager
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
# @see Adapter#info_all
def info_all
  @slurm.get_jobs.map { |job| parse_job_info(job) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
310
|
+
|
311
|
+
# Retrieve job info from the resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Info] information describing submitted job
# @see Adapter#info
def info(id)
  id = id.to_s
  matches = @slurm.get_jobs(id: id).map { |job| parse_job_info(job) }

  # A job id can return multiple jobs if it corresponds to a job
  # array id, so find the job matching either the plain job id or the
  # formatted job & task id "1234_0". If no match is found we assume
  # the job has completed.
  fallback = -> { Info.new(id: id, status: :completed) }
  matches.detect(fallback) do |job|
    job.id == id || job.native[:array_job_task_id] == id
  end
rescue Batch::Error => e
  # An unrecognized job id is treated as a completed job
  raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
  Info.new(
    id: id,
    status: :completed
  )
end
|
340
|
+
|
341
|
+
# Retrieve job status from resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job status
# @return [Status] status of job
# @see Adapter#status
def status(id)
  id = id.to_s
  jobs = @slurm.get_jobs(
    id: id,
    filters: [:job_id, :array_job_task_id, :state_compact]
  )
  # A job id can return multiple jobs if it corresponds to a job array
  # id, so find the job matching either the plain job id or the
  # formatted job & task id "1234_0"; if none matches, assume the job
  # has completed.
  match = jobs.detect { |j| j[:job_id] == id || j[:array_job_task_id] == id }
  state = match ? get_state(match[:state_compact]) : :completed
  Status.new(state: state)
rescue Batch::Error => e
  # An unrecognized job id is treated as a completed job
  raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
  Status.new(state: :completed)
end
|
371
|
+
|
372
|
+
# Put the submitted job on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong holding a job
# @return [void]
# @see Adapter#hold
def hold(id)
  @slurm.hold_job(id.to_s)
rescue Batch::Error => e
  # An unrecognized job id is treated as a successful hold
  raise JobAdapterError, e.message unless e.message =~ /Invalid job id specified/
end
|
383
|
+
|
384
|
+
# Release the job that is on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong releasing a job
# @return [void]
# @see Adapter#release
def release(id)
  @slurm.release_job(id.to_s)
rescue Batch::Error => e
  # An unrecognized job id is treated as a successful release
  raise JobAdapterError, e.message unless e.message =~ /Invalid job id specified/
end
|
395
|
+
|
396
|
+
# Delete the submitted job
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong deleting a job
# @return [void]
# @see Adapter#delete
def delete(id)
  @slurm.delete_job(id.to_s)
rescue Batch::Error => e
  # An unrecognized job id is treated as a successful deletion
  raise JobAdapterError, e.message unless e.message =~ /Invalid job id specified/
end
|
407
|
+
|
408
|
+
private
|
409
|
+
# Convert a Slurm duration string ("[days-]HH:MM:SS", or shorter
# colon-delimited forms) to a number of seconds.
# @param time [String, nil] the duration string
# @return [Integer] total seconds (0 when time is nil)
def duration_in_seconds(time)
  return 0 if time.nil?
  clock, days = time.split("-").reverse
  clock_seconds = clock.split(":").inject(0) { |total, part| total * 60 + part.to_i }
  days.to_i * 24 * 3600 + clock_seconds
end
|
416
|
+
|
417
|
+
# Convert a number of seconds to an "HH:MM:SS" duration string
# (hours are not wrapped at 24, matching sbatch's `-t` format).
# @param time [Integer] number of seconds
# @return [String] zero-padded duration string
def seconds_to_duration(time)
  format("%02d:%02d:%02d", time / 3600, (time / 60) % 60, time % 60)
end
|
421
|
+
|
422
|
+
# Convert host list string to individual nodes
#   "em082"
#   "em[014,055-056,161]"
#   "n0163/2,7,10-11+n0205/0-11+n0156/0-11"
# @param node_list [String, nil] the squeue NODELIST value
# @return [Array<Hash>] one {name:, procs:} hash per node (procs is
#   always nil; squeue does not report per-node process counts here)
def parse_nodes(node_list)
  # named captures assign the locals `prefix` and `range` (nil on no match)
  /^(?<prefix>[^\[]+)(\[(?<range>[^\]]+)\])?$/ =~ node_list

  if range
    range.split(",").flat_map { |piece|
      # expand numeric spans like "055-056" into each suffix
      piece =~ /^(\d+)-(\d+)$/ ? ($1..$2).to_a : piece
    }.map { |suffix| { name: prefix + suffix, procs: nil } }
  elsif prefix
    [{ name: prefix, procs: nil }]
  else
    []
  end
end
|
441
|
+
|
442
|
+
# Determine the OodCore state from a Slurm state code
# @param st [String] compact Slurm state code (e.g. "R", "PD")
# @return [Symbol] mapped state, or :undetermined for unknown codes
def get_state(st)
  # every mapped value is a truthy symbol, so || is safe here
  STATE_MAP[st] || :undetermined
end
|
446
|
+
|
447
|
+
# Parse a hash of `squeue` fields describing a Slurm job into an Info
# object.
# @param v [Hash] job attributes keyed per Batch#fields
# @return [Info] the parsed job information
def parse_job_info(v)
  Info.new(
    id: v[:job_id],
    status: get_state(v[:state_compact]),
    allocated_nodes: parse_nodes(v[:node_list]),
    submit_host: nil,
    job_name: v[:job_name],
    job_owner: v[:user],
    accounting_id: v[:account],
    procs: v[:cpus],
    queue_name: v[:partition],
    wallclock_time: duration_in_seconds(v[:time_used]),
    cpu_time: nil,
    submission_time: Time.parse(v[:submit_time]),
    # squeue reports "N/A" when the job has not been scheduled yet
    dispatch_time: v[:start_time] == "N/A" ? nil : Time.parse(v[:start_time]),
    native: v
  )
end
|
467
|
+
end
|
468
|
+
end
|
469
|
+
end
|
470
|
+
end
|