ood_core 0.0.1

@@ -0,0 +1,26 @@
+ # Object used for simplified communication with an LSF batch server
+ #
+ # @api private
+ class OodCore::Job::Adapters::Lsf::Helper
+
+   # convert string in format "03/31-14:46:42" to Time object
+   # assumes time being parsed is a time that occurred in the past;
+   # not to be used for parsing times in the future (like estimated FINISH_TIME)
+   def parse_past_time(t, ignore_errors: false)
+     return nil if t.nil? || t.empty? || t == "-"
+     year = Time.now.year
+     time = Time.parse("#{year}/#{t}")
+
+     # handle edge case where job started before the new year
+     time = Time.parse("#{year - 1}/#{t}") if time.month > Time.now.month
+
+     time
+
+   rescue ArgumentError => e
+     raise e unless ignore_errors
+
+     # TODO: warn via logger
+
+     nil
+   end
+ end
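Note: a minimal usage sketch of parse_past_time; the dates and return values are illustrative, assuming "today" is 2017-01-15:

    helper = OodCore::Job::Adapters::Lsf::Helper.new
    helper.parse_past_time("12/31-14:46:42") #=> 2016-12-31 14:46:42 (rolled back one year)
    helper.parse_past_time("-")              #=> nil
    helper.parse_past_time("not a time", ignore_errors: true) #=> nil (ArgumentError swallowed)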
@@ -0,0 +1,471 @@
+ require "time"
+ require "open3" # for Open3.capture3 in Batch#call below
+ require "ood_core/refinements/hash_extensions"
+
+ module OodCore
+   module Job
+     class Factory
+       using Refinements::HashExtensions
+
+       # Build the Slurm adapter from a configuration
+       # @param config [#to_h] the configuration for the job adapter
+       # @option config [#to_s] :cluster ('') The cluster to communicate with
+       # @option config [#to_s] :bin ('') Path to the Slurm client binaries
+       def self.build_slurm(config)
+         c = config.to_h.symbolize_keys
+         cluster = c.fetch(:cluster, "").to_s
+         bin = c.fetch(:bin, "").to_s
+         slurm = Adapters::Slurm::Batch.new(cluster: cluster, bin: bin)
+         Adapters::Slurm.new(slurm: slurm)
+       end
+     end
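Note: a hypothetical invocation of the factory method above; the cluster and bin values are examples only, not part of this diff:

    adapter = OodCore::Job::Factory.build_slurm(cluster: "kingspeak", bin: "/usr/local/slurm/bin")
    adapter #=> #<OodCore::Job::Adapters::Slurm>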
+
+     module Adapters
+       # An adapter object that describes the communication with a Slurm
+       # resource manager for job management.
+       class Slurm < Adapter
+         using Refinements::HashExtensions
+
+         # Object used for simplified communication with a Slurm batch server
+         # @api private
+         class Batch
+           # The cluster of the Slurm batch server
+           # @example CHPC's kingspeak cluster
+           #   my_batch.cluster #=> "kingspeak"
+           # @return [String] the cluster name
+           attr_reader :cluster
+
+           # The path to the Slurm client installation binaries
+           # @example For Slurm 10.0.0
+           #   my_batch.bin.to_s #=> "/usr/local/slurm/10.0.0/bin"
+           # @return [Pathname] path to slurm binaries
+           attr_reader :bin
+
+           # The root exception class that all Slurm-specific exceptions
+           # inherit from
+           class Error < StandardError; end
+
+           # @param cluster [#to_s] the cluster name
+           # @param bin [#to_s] path to slurm installation binaries
+           def initialize(cluster: "", bin: "")
+             @cluster = cluster.to_s
+             @bin = Pathname.new(bin.to_s)
+           end
+
+           # Get a list of hashes detailing each of the jobs on the batch server
+           # @example Status info for all jobs
+           #   my_batch.get_jobs
+           #   #=>
+           #   #[
+           #   #  {
+           #   #    :account => "account",
+           #   #    :job_id => "my_job",
+           #   #    ...
+           #   #  },
+           #   #  {
+           #   #    :account => "account",
+           #   #    :job_id => "my_other_job",
+           #   #    ...
+           #   #  },
+           #   #  ...
+           #   #]
+           # @param id [#to_s] the id of the job
+           # @param filters [Array<Symbol>] list of attributes to filter on
+           # @raise [Error] if `squeue` command exited unsuccessfully
+           # @return [Array<Hash>] list of details for jobs
+           def get_jobs(id: "", filters: [])
+             delim = "\x1F" # don't use "|" because the FEATURES field can contain it
+             options = filters.empty? ? fields : fields.slice(*filters)
+             args = ["--all", "--states=all", "--noconvert"]
+             args += ["-o", options.values.join(delim)]
+             args += ["-j", id.to_s] unless id.to_s.empty?
+             lines = call("squeue", *args).split("\n").map(&:strip)
+
+             lines.drop(cluster.empty? ? 1 : 2).map do |line| # drop header row(s); "-M" prints an extra line
+               Hash[options.keys.zip(line.split(delim))]
+             end
+           end
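Note: a sketch of a filtered query against a single job; the id, bin path, and output are illustrative:

    batch = OodCore::Job::Adapters::Slurm::Batch.new(bin: "/usr/bin")
    batch.get_jobs(id: "1234", filters: [:job_id, :state_compact])
    #=> [{ job_id: "1234", state_compact: "R" }]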
+
+           # Put a specified job on hold
+           # @example Put job "1234" on hold
+           #   my_batch.hold_job("1234")
+           # @param id [#to_s] the id of the job
+           # @raise [Error] if `scontrol` command exited unsuccessfully
+           # @return [void]
+           def hold_job(id)
+             call("scontrol", "hold", id.to_s)
+           end
+
+           # Release a specified job that is on hold
+           # @example Release job "1234" from hold
+           #   my_batch.release_job("1234")
+           # @param id [#to_s] the id of the job
+           # @raise [Error] if `scontrol` command exited unsuccessfully
+           # @return [void]
+           def release_job(id)
+             call("scontrol", "release", id.to_s)
+           end
+
+           # Delete a specified job from the batch server
+           # @example Delete job "1234"
+           #   my_batch.delete_job("1234")
+           # @param id [#to_s] the id of the job
+           # @raise [Error] if `scancel` command exited unsuccessfully
+           # @return [void]
+           def delete_job(id)
+             call("scancel", id.to_s)
+           end
+
+           # Submit a script expanded as a string to the batch server
+           # @param str [#to_s] script as a string
+           # @param args [Array<#to_s>] arguments passed to the `sbatch` command
+           # @param env [Hash{#to_s => #to_s}] environment variables to set
+           # @raise [Error] if `sbatch` command exited unsuccessfully
+           # @return [String] the id of the job that was created
+           def submit_string(str, args: [], env: {})
+             args = args.map(&:to_s) + ["--parsable"]
+             env = {"SBATCH_EXPORT" => "NONE"}.merge env.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
+             call("sbatch", *args, env: env, stdin: str.to_s).strip.split(";").first
+           end
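Note: a sketch of submitting a script body directly, continuing the batch object from the sketch above; the returned id is illustrative. With --parsable, sbatch prints "id" or "id;cluster", hence the split(";").first:

    batch.submit_string("#!/bin/bash\nsleep 60\n", args: ["-J", "test_job"])
    #=> "1234"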
+
+           private
+             # Call a forked Slurm command for a given cluster
+             def call(cmd, *args, env: {}, stdin: "")
+               cmd = bin.join(cmd.to_s).to_s
+               args = args.map(&:to_s)
+               args += ["-M", cluster] unless cluster.empty?
+               env = env.to_h
+               o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s)
+               s.success? ? o : raise(Error, e)
+             end
+
+             # Fields requested from a formatted `squeue` call
+             def fields
+               {
+                 account: "%a",
+                 job_id: "%A",
+                 gres: "%b",
+                 exec_host: "%B",
+                 min_cpus: "%c",
+                 cpus: "%C",
+                 min_tmp_disk: "%d",
+                 nodes: "%D",
+                 end_time: "%e",
+                 dependency: "%E",
+                 features: "%f",
+                 array_job_id: "%F",
+                 group_name: "%g",
+                 group_id: "%G",
+                 over_subscribe: "%h",
+                 sockets_per_node: "%H",
+                 array_job_task_id: "%i",
+                 cores_per_socket: "%I",
+                 job_name: "%j",
+                 threads_per_core: "%J",
+                 comment: "%k",
+                 array_task_id: "%K",
+                 time_limit: "%l",
+                 time_left: "%L",
+                 min_memory: "%m",
+                 time_used: "%M",
+                 req_node: "%n",
+                 node_list: "%N",
+                 command: "%o",
+                 contiguous: "%O",
+                 qos: "%q",
+                 partition: "%P",
+                 priority: "%Q",
+                 reason: "%r",
+                 start_time: "%S",
+                 state_compact: "%t",
+                 state: "%T",
+                 user: "%u",
+                 user_id: "%U",
+                 reservation: "%v",
+                 submit_time: "%V",
+                 wckey: "%w",
+                 licenses: "%W",
+                 excluded_nodes: "%x",
+                 core_specialization: "%X",
+                 nice: "%y",
+                 scheduled_nodes: "%Y",
+                 sockets_cores_threads: "%z",
+                 work_dir: "%Z"
+               }
+             end
+         end
+
+         # Mapping of state codes for Slurm
+         STATE_MAP = {
+           'BF' => :completed,  # BOOT_FAIL
+           'CA' => :completed,  # CANCELLED
+           'CD' => :completed,  # COMPLETED
+           'CF' => :queued,     # CONFIGURING
+           'CG' => :running,    # COMPLETING
+           'F'  => :completed,  # FAILED
+           'NF' => :completed,  # NODE_FAIL
+           'PD' => :queued,     # PENDING
+           'PR' => :suspended,  # PREEMPTED
+           'RV' => :completed,  # REVOKED
+           'R'  => :running,    # RUNNING
+           'SE' => :completed,  # SPECIAL_EXIT
+           'ST' => :running,    # STOPPED
+           'S'  => :suspended,  # SUSPENDED
+           'TO' => :completed   # TIMEOUT
+         }
+
+         # @api private
+         # @param opts [#to_h] the options defining this adapter
+         # @option opts [Batch] :slurm The Slurm batch object
+         # @see Factory.build_slurm
+         def initialize(opts = {})
+           o = opts.to_h.symbolize_keys
+
+           @slurm = o.fetch(:slurm) { raise ArgumentError, "No slurm object specified. Missing argument: slurm" }
+         end
+
+         # Submit a job with the attributes defined in the job template instance
+         # @param script [Script] script object that describes the script and
+         #   attributes for the submitted job
+         # @param after [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution at any point after dependent jobs have started execution
+         # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution only after dependent jobs have terminated with no errors
+         # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution only after dependent jobs have terminated with errors
+         # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution after dependent jobs have terminated
+         # @raise [JobAdapterError] if something goes wrong submitting a job
+         # @return [String] the job id returned after successfully submitting a
+         #   job
+         # @see Adapter#submit
+         def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+           after      = Array(after).map(&:to_s)
+           afterok    = Array(afterok).map(&:to_s)
+           afternotok = Array(afternotok).map(&:to_s)
+           afterany   = Array(afterany).map(&:to_s)
+
+           # Set sbatch options
+           args = []
+           # ignore script.args; don't know how to map it onto sbatch
+           args += ["-H"] if script.submit_as_hold
+           args += (script.rerunnable ? ["--requeue"] : ["--no-requeue"]) unless script.rerunnable.nil?
+           args += ["-D", script.workdir.to_s] unless script.workdir.nil?
+           args += ["--mail-user", script.email.join(",")] unless script.email.nil?
+           if script.email_on_started && script.email_on_terminated
+             args += ["--mail-type", "ALL"]
+           elsif script.email_on_started
+             args += ["--mail-type", "BEGIN"]
+           elsif script.email_on_terminated
+             args += ["--mail-type", "END"]
+           elsif script.email_on_started == false && script.email_on_terminated == false
+             args += ["--mail-type", "NONE"]
+           end
+           args += ["-J", script.job_name] unless script.job_name.nil?
+           args += ["-i", script.input_path] unless script.input_path.nil?
+           args += ["-o", script.output_path] unless script.output_path.nil?
+           args += ["-e", script.error_path] unless script.error_path.nil?
+           # ignore join_files; by default sbatch joins stdout and stderr unless
+           # error_path is specified
+           args += ["--reservation", script.reservation_id] unless script.reservation_id.nil?
+           args += ["-p", script.queue_name] unless script.queue_name.nil?
+           args += ["--priority", script.priority] unless script.priority.nil?
+           args += ["--begin", script.start_time.localtime.strftime("%C%y-%m-%dT%H:%M:%S")] unless script.start_time.nil?
+           args += ["-A", script.accounting_id] unless script.accounting_id.nil?
+           args += ["--mem", "#{script.min_phys_memory}K"] unless script.min_phys_memory.nil?
+           args += ["-t", seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
+           # ignore nodes; don't know how to map it onto sbatch
+
+           # Set dependencies
+           depend = []
+           depend << "after:#{after.join(":")}"           unless after.empty?
+           depend << "afterok:#{afterok.join(":")}"       unless afterok.empty?
+           depend << "afternotok:#{afternotok.join(":")}" unless afternotok.empty?
+           depend << "afterany:#{afterany.join(":")}"     unless afterany.empty?
+           args += ["-d", depend.join(",")] unless depend.empty?
+
+           # Set environment variables
+           env = script.job_environment || {}
+           args += ["--export", script.job_environment.keys.join(",")] unless script.job_environment.nil? || script.job_environment.empty?
+
+           # Set native options
+           args += script.native if script.native
+
+           # Submit job
+           @slurm.submit_string(script.content, args: args, env: env)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
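Note: a hypothetical end-to-end submission with a dependency, using the adapter built in the factory sketch above. The Script attributes shown are assumptions about the caller's code, not part of this diff:

    script = OodCore::Job::Script.new(
      content: "#!/bin/bash\nhostname",
      job_name: "child_job",  # becomes -J child_job
      wall_time: 3600         # becomes -t 01:00:00 via seconds_to_duration
    )
    adapter.submit(script, afterok: ["1234"])  # adds -d afterok:1234
    #=> "1235"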
+
+         # Retrieve info for all jobs from the resource manager
+         # @raise [JobAdapterError] if something goes wrong getting job info
+         # @return [Array<Info>] information describing submitted jobs
+         # @see Adapter#info_all
+         def info_all
+           @slurm.get_jobs.map do |v|
+             parse_job_info(v)
+           end
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve job info from the resource manager
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong getting job info
+         # @return [Info] information describing submitted job
+         # @see Adapter#info
+         def info(id)
+           id = id.to_s
+           info_ary = @slurm.get_jobs(id: id).map do |v|
+             parse_job_info(v)
+           end
+
+           # A job id can return multiple jobs if it corresponds to a job
+           # array id, so we need to find the job that corresponds to the
+           # given job id (if we can't find it, we assume it has completed)
+           info_ary.detect(-> { Info.new(id: id, status: :completed) }) do |info|
+             # Match the job id or the formatted job & task id "1234_0"
+             info.id == id || info.native[:array_job_task_id] == id
+           end
+         rescue Batch::Error => e
+           # set completed status if we can't find the job id
+           if /Invalid job id specified/ =~ e.message
+             Info.new(
+               id: id,
+               status: :completed
+             )
+           else
+             raise JobAdapterError, e.message
+           end
+         end
+
+         # Retrieve job status from the resource manager
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong getting job status
+         # @return [Status] status of job
+         # @see Adapter#status
+         def status(id)
+           id = id.to_s
+           jobs = @slurm.get_jobs(
+             id: id,
+             filters: [:job_id, :array_job_task_id, :state_compact]
+           )
+           # A job id can return multiple jobs if it corresponds to a job array
+           # id, so we need to find the job that corresponds to the given job id
+           # (if we can't find it, we assume it has completed)
+           #
+           # Match against the job id or the formatted job & task id "1234_0"
+           if (job = jobs.detect { |j| j[:job_id] == id || j[:array_job_task_id] == id })
+             Status.new(state: get_state(job[:state_compact]))
+           else
+             # set completed status if we can't find the job id
+             Status.new(state: :completed)
+           end
+         rescue Batch::Error => e
+           # set completed status if we can't find the job id
+           if /Invalid job id specified/ =~ e.message
+             Status.new(state: :completed)
+           else
+             raise JobAdapterError, e.message
+           end
+         end
+
+         # Put the submitted job on hold
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong holding a job
+         # @return [void]
+         # @see Adapter#hold
+         def hold(id)
+           @slurm.hold_job(id.to_s)
+         rescue Batch::Error => e
+           # assume successful job hold if we can't find the job id
+           raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
+         end
+
+         # Release the job that is on hold
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong releasing a job
+         # @return [void]
+         # @see Adapter#release
+         def release(id)
+           @slurm.release_job(id.to_s)
+         rescue Batch::Error => e
+           # assume successful job release if we can't find the job id
+           raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
+         end
+
+         # Delete the submitted job
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong deleting a job
+         # @return [void]
+         # @see Adapter#delete
+         def delete(id)
+           @slurm.delete_job(id.to_s)
+         rescue Batch::Error => e
+           # assume successful job deletion if we can't find the job id
+           raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
+         end
+
+         private
+           # Convert a Slurm duration string to seconds
+           def duration_in_seconds(time)
+             return 0 if time.nil?
+             time, days = time.split("-").reverse
+             days.to_i * 24 * 3600 +
+               time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
+           end
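Note: worked examples of the duration parsing above, which accepts "days-HH:MM:SS" as well as plain "HH:MM:SS" or "MM:SS" (values are illustrative):

    duration_in_seconds("2-03:04:05") #=> 183845 (2*86400 + 3*3600 + 4*60 + 5)
    duration_in_seconds("15:30")      #=> 930 (no days component)
    duration_in_seconds(nil)          #=> 0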
416
+
417
+ # Convert seconds to duration
418
+ def seconds_to_duration(time)
419
+ "%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
420
+ end
421
+
422
+ # Convert host list string to individual nodes
423
+ # "em082"
424
+ # "em[014,055-056,161]"
425
+ # "n0163/2,7,10-11+n0205/0-11+n0156/0-11"
426
+ def parse_nodes(node_list)
427
+ /^(?<prefix>[^\[]+)(\[(?<range>[^\]]+)\])?$/ =~ node_list
428
+
429
+ if range
430
+ range.split(",").map do |x|
431
+ x =~ /^(\d+)-(\d+)$/ ? ($1..$2).to_a : x
432
+ end.flatten.map do |n|
433
+ { name: prefix + n, procs: nil }
434
+ end
435
+ elsif prefix
436
+ [ { name: prefix, procs: nil } ]
437
+ else
438
+ []
439
+ end
440
+ end
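Note: worked examples of the node-list expansion, using the formats from the comment above:

    parse_nodes("em082")
    #=> [{ name: "em082", procs: nil }]
    parse_nodes("em[014,055-056,161]")
    #=> [{ name: "em014", procs: nil }, { name: "em055", procs: nil },
    #    { name: "em056", procs: nil }, { name: "em161", procs: nil }]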
+
+           # Determine state from Slurm state code
+           def get_state(st)
+             STATE_MAP.fetch(st, :undetermined)
+           end
+
+           # Parse hash describing Slurm job status
+           def parse_job_info(v)
+             allocated_nodes = parse_nodes(v[:node_list])
+             Info.new(
+               id: v[:job_id],
+               status: get_state(v[:state_compact]),
+               allocated_nodes: allocated_nodes,
+               submit_host: nil,
+               job_name: v[:job_name],
+               job_owner: v[:user],
+               accounting_id: v[:account],
+               procs: v[:cpus],
+               queue_name: v[:partition],
+               wallclock_time: duration_in_seconds(v[:time_used]),
+               cpu_time: nil,
+               submission_time: Time.parse(v[:submit_time]),
+               dispatch_time: v[:start_time] == "N/A" ? nil : Time.parse(v[:start_time]),
+               native: v
+             )
+           end
+       end
+     end
+   end
+ end