ood_core 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +50 -0
- data/.rspec +2 -0
- data/.travis.yml +9 -0
- data/CHANGELOG.md +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +60 -0
- data/Rakefile +6 -0
- data/bin/console +11 -0
- data/bin/setup +8 -0
- data/lib/ood_core.rb +34 -0
- data/lib/ood_core/acl/adapter.rb +17 -0
- data/lib/ood_core/acl/adapters/group.rb +59 -0
- data/lib/ood_core/acl/factory.rb +41 -0
- data/lib/ood_core/cluster.rb +143 -0
- data/lib/ood_core/clusters.rb +114 -0
- data/lib/ood_core/errors.rb +19 -0
- data/lib/ood_core/job/adapter.rb +89 -0
- data/lib/ood_core/job/adapters/lsf.rb +193 -0
- data/lib/ood_core/job/adapters/lsf/batch.rb +160 -0
- data/lib/ood_core/job/adapters/lsf/helper.rb +26 -0
- data/lib/ood_core/job/adapters/slurm.rb +470 -0
- data/lib/ood_core/job/adapters/torque.rb +274 -0
- data/lib/ood_core/job/factory.rb +41 -0
- data/lib/ood_core/job/info.rb +141 -0
- data/lib/ood_core/job/node_info.rb +47 -0
- data/lib/ood_core/job/node_request.rb +51 -0
- data/lib/ood_core/job/script.rb +235 -0
- data/lib/ood_core/job/status.rb +128 -0
- data/lib/ood_core/refinements/array_extensions.rb +22 -0
- data/lib/ood_core/refinements/hash_extensions.rb +25 -0
- data/lib/ood_core/version.rb +4 -0
- data/ood_core.gemspec +32 -0
- metadata +182 -0
@@ -0,0 +1,26 @@
|
|
1
|
+
# Object used for simplified communication with a LSF batch server
#
# @api private
class OodCore::Job::Adapters::Lsf::Helper

  # convert string in format "03/31-14:46:42" to Time object
  # assumes time being parsed is a time that occurred in the past
  # not to be used for parsing times in the future (like estimated FINISH_TIME)
  #
  # @param t [String, nil] time string of the form "MM/DD-HH:MM:SS"
  # @param ignore_errors [Boolean] when true, return nil instead of raising
  #   when the string cannot be parsed
  # @return [Time, nil] nil for blank input (nil, "", or the "-" placeholder)
  # @raise [ArgumentError] if the string is unparseable and ignore_errors is false
  def parse_past_time(t, ignore_errors: false)
    return nil if t.nil? || t.empty? || t == "-"
    year = Time.now.year
    time = Time.parse("#{year}/#{t}")

    # handle edge case where job started before new year
    # NOTE(review): only the month is compared, so a same-month timestamp
    # later than "now" (e.g. "03/31" parsed on 03/15) is NOT rolled back a
    # year — confirm callers only ever pass timestamps that already elapsed.
    time = Time.parse("#{year - 1}/#{t}") if time.month > Time.now.month

    time

  rescue ArgumentError => e
    raise e unless ignore_errors

    #TODO: warn via logger

    nil
  end
end
|
@@ -0,0 +1,470 @@
|
|
1
|
+
require "time"
|
2
|
+
require "ood_core/refinements/hash_extensions"
|
3
|
+
|
4
|
+
module OodCore
|
5
|
+
module Job
|
6
|
+
class Factory
  using Refinements::HashExtensions

  # Build the Slurm adapter from a configuration
  # @param config [#to_h] the configuration for the job adapter
  # @option config [#to_s] :cluster ('') The cluster to communicate with
  # @option config [#to_s] :bin ('') Path to slurm client binaries
  def self.build_slurm(config)
    settings = config.to_h.symbolize_keys
    batch = Adapters::Slurm::Batch.new(
      cluster: settings.fetch(:cluster, "").to_s,
      bin: settings.fetch(:bin, "").to_s
    )
    Adapters::Slurm.new(slurm: batch)
  end
end
|
21
|
+
|
22
|
+
module Adapters
|
23
|
+
# An adapter object that describes the communication with a Slurm
|
24
|
+
# resource manager for job management.
|
25
|
+
class Slurm < Adapter
|
26
|
+
using Refinements::HashExtensions
|
27
|
+
|
28
|
+
# Object used for simplified communication with a Slurm batch server
# @api private
class Batch
  # The cluster of the Slurm batch server
  # @example CHPC's kingspeak cluster
  #   my_batch.cluster #=> "kingspeak"
  # @return [String] the cluster name
  attr_reader :cluster

  # The path to the Slurm client installation binaries
  # @example For Slurm 10.0.0
  #   my_batch.bin.to_s #=> "/usr/local/slurm/10.0.0/bin"
  # @return [Pathname] path to slurm binaries
  attr_reader :bin

  # The root exception class that all Slurm-specific exceptions inherit
  # from
  class Error < StandardError; end

  # @param cluster [#to_s] the cluster name
  # @param bin [#to_s] path to slurm installation binaries
  def initialize(cluster: "", bin: "")
    @cluster = cluster.to_s
    @bin = Pathname.new(bin.to_s)
  end

  # Get a list of hashes detailing each of the jobs on the batch server
  # @example Status info for all jobs
  #   my_batch.get_jobs
  #   #=>
  #   #[
  #   #  {
  #   #    :account => "account",
  #   #    :job_id => "my_job",
  #   #    ...
  #   #  },
  #   #  {
  #   #    :account => "account",
  #   #    :job_id => "my_other_job",
  #   #    ...
  #   #  },
  #   #  ...
  #   #]
  # @param id [#to_s] the id of the job
  # @param filters [Array<Symbol>] list of attributes to filter on
  # @raise [Error] if `squeue` command exited unsuccessfully
  # @return [Array<Hash>] list of details for jobs
  def get_jobs(id: "", filters: [])
    # ASCII unit separator as the field delimiter; "|" is unsafe because
    # the FEATURES field may itself contain "|"
    delim = "\x1F" # don't use "|" because FEATURES uses this
    options = filters.empty? ? fields : fields.slice(*filters)
    args = ["--all", "--states=all", "--noconvert"]
    args += ["-o", "#{options.values.join(delim)}"]
    args += ["-j", id.to_s] unless id.to_s.empty?
    lines = call("squeue", *args).split("\n").map(&:strip)

    # Skip squeue's header row; when a cluster is set, `squeue -M` prints
    # an extra cluster banner line first, so skip two lines instead of one.
    lines.drop(cluster.empty? ? 1 : 2).map do |line|
      Hash[options.keys.zip(line.split(delim))]
    end
  end

  # Put a specified job on hold
  # @example Put job "1234" on hold
  #   my_batch.hold_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `scontrol` command exited unsuccessfully
  # @return [void]
  def hold_job(id)
    call("scontrol", "hold", id.to_s)
  end

  # Release a specified job that is on hold
  # @example Release job "1234" from on hold
  #   my_batch.release_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `scontrol` command exited unsuccessfully
  # @return [void]
  def release_job(id)
    call("scontrol", "release", id.to_s)
  end

  # Delete a specified job from batch server
  # @example Delete job "1234"
  #   my_batch.delete_job("1234")
  # @param id [#to_s] the id of the job
  # @raise [Error] if `scancel` command exited unsuccessfully
  # @return [void]
  def delete_job(id)
    call("scancel", id.to_s)
  end

  # Submit a script expanded as a string to the batch server
  # @param str [#to_s] script as a string
  # @param args [Array<#to_s>] arguments passed to `sbatch` command
  # @param env [Hash{#to_s => #to_s}] environment variables set
  # @raise [Error] if `sbatch` command exited unsuccessfully
  # @return [String] the id of the job that was created
  def submit_string(str, args: [], env: {})
    # --parsable makes sbatch print "jobid[;cluster]" so the id can be split off
    args = args.map(&:to_s) + ["--parsable"]
    # SBATCH_EXPORT=NONE keeps this process's environment out of the job;
    # caller-supplied env vars are stringified and layered on top
    env = {"SBATCH_EXPORT" => "NONE"}.merge env.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
    call("sbatch", *args, env: env, stdin: str.to_s).strip.split(";").first
  end

  private
    # Call a forked Slurm command for a given cluster
    # @param cmd [#to_s] name of the Slurm client binary (resolved under #bin)
    # @param args [Array<#to_s>] command-line arguments for the binary
    # @param env [#to_h] environment variables for the subprocess
    # @param stdin [#to_s] data written to the subprocess's stdin
    # @raise [Error] with the command's stderr if it exits unsuccessfully
    # @return [String] the command's stdout
    def call(cmd, *args, env: {}, stdin: "")
      cmd = bin.join(cmd.to_s).to_s
      args = args.map(&:to_s)
      # scope the command to the configured cluster, if any
      args += ["-M", cluster] unless cluster.empty?
      env = env.to_h
      o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s)
      s.success? ? o : raise(Error, e)
    end

    # Fields requested from a formatted `squeue` call
    # (keys are the symbols used in get_jobs results; values are squeue
    # format specifiers)
    def fields
      {
        account: "%a",
        job_id: "%A",
        gres: "%b",
        exec_host: "%B",
        min_cpus: "%c",
        cpus: "%C",
        min_tmp_disk: "%d",
        nodes: "%D",
        end_time: "%e",
        dependency: "%E",
        features: "%f",
        array_job_id: "%F",
        group_name: "%g",
        group_id: "%G",
        over_subscribe: "%h",
        sockets_per_node: "%H",
        array_job_task_id: "%i",
        cores_per_socket: "%I",
        job_name: "%j",
        threads_per_core: "%J",
        comment: "%k",
        array_task_id: "%K",
        time_limit: "%l",
        time_left: "%L",
        min_memory: "%m",
        time_used: "%M",
        req_node: "%n",
        node_list: "%N",
        command: "%o",
        contiguous: "%O",
        qos: "%q",
        partition: "%P",
        priority: "%Q",
        reason: "%r",
        start_time: "%S",
        state_compact: "%t",
        state: "%T",
        user: "%u",
        user_id: "%U",
        reservation: "%v",
        submit_time: "%V",
        wckey: "%w",
        licenses: "%W",
        excluded_nodes: "%x",
        core_specialization: "%X",
        nice: "%y",
        scheduled_nodes: "%Y",
        sockets_cores_threads: "%z",
        work_dir: "%Z"
      }
    end
end
|
196
|
+
|
197
|
+
# Mapping of Slurm state codes onto the adapter's generic job states.
# Frozen so the shared constant cannot be mutated by callers.
STATE_MAP = {
  'BF' => :completed,  # BOOT_FAIL
  'CA' => :completed,  # CANCELLED
  'CD' => :completed,  # COMPLETED
  'CF' => :queued,     # CONFIGURING
  'CG' => :running,    # COMPLETING
  'F'  => :completed,  # FAILED
  'NF' => :completed,  # NODE_FAIL
  'PD' => :queued,     # PENDING
  'PR' => :suspended,  # PREEMPTED
  'RV' => :completed,  # REVOKED
  'R'  => :running,    # RUNNING
  'SE' => :completed,  # SPECIAL_EXIT
  'ST' => :running,    # STOPPED
  'S'  => :suspended,  # SUSPENDED
  'TO' => :completed   # TIMEOUT
}.freeze
|
215
|
+
|
216
|
+
# @api private
# @param opts [#to_h] the options defining this adapter
# @option opts [Batch] :slurm The Slurm batch object
# @see Factory.build_slurm
def initialize(opts = {})
  options = opts.to_h.symbolize_keys

  unless options.key?(:slurm)
    raise ArgumentError, "No slurm object specified. Missing argument: slurm"
  end
  @slurm = options[:slurm]
end
|
225
|
+
|
226
|
+
# Submit a job with the attributes defined in the job template instance
# @param script [Script] script object that describes the script and
#   attributes for the submitted job
# @param after [#to_s, Array<#to_s>] this job may be scheduled for
#   execution at any point after dependent jobs have started execution
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with no errors
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with errors
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
#   execution after dependent jobs have terminated
# @raise [JobAdapterError] if something goes wrong submitting a job
# @return [String] the job id returned after successfully submitting a
#   job
# @see Adapter#submit
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  # normalize dependency lists: accept a scalar or an array of job ids
  after      = Array(after).map(&:to_s)
  afterok    = Array(afterok).map(&:to_s)
  afternotok = Array(afternotok).map(&:to_s)
  afterany   = Array(afterany).map(&:to_s)

  # Set sbatch options
  args = []
  # ignore args, don't know how to do this for slurm
  args += ["-H"] if script.submit_as_hold
  args += (script.rerunnable ? ["--requeue"] : ["--no-requeue"]) unless script.rerunnable.nil?
  args += ["-D", script.workdir.to_s] unless script.workdir.nil?
  args += ["--mail-user", script.email.join(",")] unless script.email.nil?
  # map the start/end notification flags onto sbatch's --mail-type values;
  # note: mixed nil/false combinations fall through with no --mail-type set
  if script.email_on_started && script.email_on_terminated
    args += ["--mail-type", "ALL"]
  elsif script.email_on_started
    args += ["--mail-type", "BEGIN"]
  elsif script.email_on_terminated
    args += ["--mail-type", "END"]
  elsif script.email_on_started == false && script.email_on_terminated == false
    args += ["--mail-type", "NONE"]
  end
  args += ["-J", script.job_name] unless script.job_name.nil?
  args += ["-i", script.input_path] unless script.input_path.nil?
  args += ["-o", script.output_path] unless script.output_path.nil?
  args += ["-e", script.error_path] unless script.error_path.nil?
  # ignore join_files, by default it joins stdout and stderr unless
  # error_path is specified
  args += ["--reservation", script.reservation_id] unless script.reservation_id.nil?
  args += ["-p", script.queue_name] unless script.queue_name.nil?
  args += ["--priority", script.priority] unless script.priority.nil?
  # sbatch's --begin expects a local "YYYY-MM-DDTHH:MM:SS" timestamp
  args += ["--begin", script.start_time.localtime.strftime("%C%y-%m-%dT%H:%M:%S")] unless script.start_time.nil?
  args += ["-A", script.accounting_id] unless script.accounting_id.nil?
  # min_phys_memory is given in KB, hence the "K" suffix for --mem
  args += ["--mem", "#{script.min_phys_memory}K"] unless script.min_phys_memory.nil?
  args += ["-t", seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
  # ignore nodes, don't know how to do this for slurm

  # Set dependencies
  depend = []
  depend << "after:#{after.join(":")}" unless after.empty?
  depend << "afterok:#{afterok.join(":")}" unless afterok.empty?
  depend << "afternotok:#{afternotok.join(":")}" unless afternotok.empty?
  depend << "afterany:#{afterany.join(":")}" unless afterany.empty?
  args += ["-d", depend.join(",")] unless depend.empty?

  # Set environment variables
  # the variables' values travel via the subprocess environment (env),
  # while --export lists only the variable NAMES for sbatch to pass through
  env = script.job_environment || {}
  args += ["--export", script.job_environment.keys.join(",")] unless script.job_environment.nil? || script.job_environment.empty?

  # Set native options
  args += script.native if script.native

  # Submit job
  @slurm.submit_string(script.content, args: args, env: env)
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
298
|
+
|
299
|
+
# Retrieve info for all jobs from the resource manager
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
# @see Adapter#info_all
def info_all
  @slurm.get_jobs.map { |job_hash| parse_job_info(job_hash) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
|
310
|
+
|
311
|
+
# Retrieve job info from the resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Info] information describing submitted job
# @see Adapter#info
def info(id)
  id = id.to_s
  candidates = @slurm.get_jobs(id: id).map { |job_hash| parse_job_info(job_hash) }

  # A job id can map to several records when it names a job array, so
  # look for the record matching the plain id or the formatted
  # "jobid_taskid" form; no match means the job is assumed completed.
  found = candidates.find do |job|
    job.id == id || job.native[:array_job_task_id] == id
  end
  found || Info.new(id: id, status: :completed)
rescue Batch::Error => e
  # squeue reports unknown job ids as an error; treat as completed
  if e.message =~ /Invalid job id specified/
    Info.new(id: id, status: :completed)
  else
    raise JobAdapterError, e.message
  end
end
|
340
|
+
|
341
|
+
# Retrieve job status from resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job status
# @return [Status] status of job
# @see Adapter#status
def status(id)
  id = id.to_s
  jobs = @slurm.get_jobs(
    id: id,
    filters: [:job_id, :array_job_task_id, :state_compact]
  )

  # A job id can map to several records when it names a job array, so
  # look for the record matching the plain id or the formatted
  # "jobid_taskid" form; no match means the job is assumed completed.
  record = jobs.find { |j| j[:job_id] == id || j[:array_job_task_id] == id }
  if record
    Status.new(state: get_state(record[:state_compact]))
  else
    Status.new(state: :completed)
  end
rescue Batch::Error => e
  # squeue reports unknown job ids as an error; treat as completed
  raise JobAdapterError, e.message unless e.message =~ /Invalid job id specified/
  Status.new(state: :completed)
end
|
371
|
+
|
372
|
+
# Put the submitted job on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong holding a job
# @return [void]
# @see Adapter#hold
def hold(id)
  @slurm.hold_job(id.to_s)
rescue Batch::Error => e
  # an unknown job id means the job already left the queue; treat the
  # hold as a successful no-op in that case
  raise JobAdapterError, e.message if e.message !~ /Invalid job id specified/
end
|
383
|
+
|
384
|
+
# Release the job that is on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong releasing a job
# @return [void]
# @see Adapter#release
def release(id)
  @slurm.release_job(id.to_s)
rescue Batch::Error => e
  # an unknown job id means the job already left the queue; treat the
  # release as a successful no-op in that case
  raise JobAdapterError, e.message if e.message !~ /Invalid job id specified/
end
|
395
|
+
|
396
|
+
# Delete the submitted job
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong deleting a job
# @return [void]
# @see Adapter#delete
def delete(id)
  @slurm.delete_job(id.to_s)
rescue Batch::Error => e
  # an unknown job id means the job is already gone; treat the
  # deletion as a successful no-op in that case
  raise JobAdapterError, e.message if e.message !~ /Invalid job id specified/
end
|
407
|
+
|
408
|
+
private
  # Convert a Slurm duration string ("[days-]hh:mm:ss", segments may be
  # fewer, e.g. "mm:ss") into a total number of seconds
  # @param time [String, nil] duration as reported by squeue
  # @return [Integer] total seconds (0 for nil input)
  def duration_in_seconds(time)
    return 0 if time.nil?
    clock, days = time.split("-").reverse
    # fold base-60 positional segments (hh:mm:ss or mm:ss) into seconds
    seconds = clock.split(":").reduce(0) { |acc, part| acc * 60 + part.to_i }
    seconds + days.to_i * 86400
  end
|
416
|
+
|
417
|
+
# Convert a number of seconds into an "hh:mm:ss" duration string
# (hours may exceed 24 — sbatch accepts hours:minutes:seconds)
# @param time [Integer] number of seconds
# @return [String] zero-padded duration
def seconds_to_duration(time)
  hours, remainder = time.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
|
421
|
+
|
422
|
+
# Convert host list string to individual nodes
#   "em082"
#   "em[014,055-056,161]"
#   "n0163/2,7,10-11+n0205/0-11+n0156/0-11"
# @param node_list [String, nil] squeue NODELIST value
# @return [Array<Hash>] one {name:, procs:} hash per node (procs unknown)
def parse_nodes(node_list)
  match = /^(?<prefix>[^\[]+)(\[(?<range>[^\]]+)\])?$/.match(node_list)
  return [] unless match

  prefix = match[:prefix]
  range  = match[:range]
  # plain host name with no bracketed range
  return [{ name: prefix, procs: nil }] unless range

  # expand comma-separated entries; "a-b" entries expand to every suffix
  # in the string range (preserving zero padding, e.g. "055".."056")
  range.split(",").flat_map do |token|
    bounds = /^(\d+)-(\d+)$/.match(token)
    bounds ? (bounds[1]..bounds[2]).to_a : token
  end.map { |suffix| { name: prefix + suffix, procs: nil } }
end
|
441
|
+
|
442
|
+
# Map a Slurm state code onto the adapter's generic state symbol;
# unrecognized codes resolve to :undetermined
# @param st [String] compact state code from squeue (e.g. "PD", "R")
# @return [Symbol] generic job state
def get_state(st)
  STATE_MAP.key?(st) ? STATE_MAP[st] : :undetermined
end
|
446
|
+
|
447
|
+
# Parse hash describing Slurm job status into an Info object
# @param v [Hash] squeue attributes as returned by Batch#get_jobs
# @return [Info] normalized job information
def parse_job_info(v)
  allocated_nodes = parse_nodes(v[:node_list])
  Info.new(
    id: v[:job_id],
    status: get_state(v[:state_compact]),
    allocated_nodes: allocated_nodes,
    submit_host: nil, # not reported by squeue
    job_name: v[:job_name],
    job_owner: v[:user],
    accounting_id: v[:account],
    procs: v[:cpus],
    queue_name: v[:partition],
    wallclock_time: duration_in_seconds(v[:time_used]),
    cpu_time: nil, # not reported by squeue
    # guard against missing or "N/A" placeholder timestamps so Time.parse
    # is never handed nil (consistent with the dispatch_time handling)
    submission_time: (v[:submit_time].nil? || v[:submit_time] == "N/A") ? nil : Time.parse(v[:submit_time]),
    dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
    native: v
  )
end
|
467
|
+
end
|
468
|
+
end
|
469
|
+
end
|
470
|
+
end
|