ood_core 0.0.1

@@ -0,0 +1,26 @@
+ # Object used for simplified communication with an LSF batch server
+ #
+ # @api private
+ class OodCore::Job::Adapters::Lsf::Helper
+
+   # Convert a string in the format "03/31-14:46:42" to a Time object.
+   # Assumes the time being parsed occurred in the past; not to be used
+   # for parsing times in the future (like an estimated FINISH_TIME).
+   def parse_past_time(t, ignore_errors: false)
+     return nil if t.nil? || t.empty? || t == "-"
+     year = Time.now.year
+     time = Time.parse("#{year}/#{t}")
+
+     # handle edge case where job started before the new year
+     time = Time.parse("#{year - 1}/#{t}") if time.month > Time.now.month
+
+     time
+
+   rescue ArgumentError => e
+     raise e unless ignore_errors
+
+     # TODO: warn via logger
+
+     nil
+   end
+ end
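
For reference, a minimal usage sketch of the helper above (the input strings and results are illustrative, not part of the gem):

  helper = OodCore::Job::Adapters::Lsf::Helper.new
  helper.parse_past_time("03/31-14:46:42")             # Time in the current year
  helper.parse_past_time("12/27-14:46:42")             # parsed as the previous year when the month is ahead of today
  helper.parse_past_time("-")                          #=> nil (placeholder for a missing time)
  helper.parse_past_time("junk", ignore_errors: true)  #=> nil instead of raising ArgumentError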
@@ -0,0 +1,472 @@
+ require "open3"    # Open3.capture3 is used in Batch#call
+ require "pathname" # Pathname is used in Batch#initialize
+ require "time"
+ require "ood_core/refinements/hash_extensions"
+
+ module OodCore
+   module Job
+     class Factory
+       using Refinements::HashExtensions
+
+       # Build the Slurm adapter from a configuration
+       # @param config [#to_h] the configuration for job adapter
+       # @option config [#to_s] :cluster ('') The cluster to communicate with
+       # @option config [#to_s] :bin ('') Path to slurm client binaries
+       def self.build_slurm(config)
+         c = config.to_h.symbolize_keys
+         cluster = c.fetch(:cluster, "").to_s
+         bin = c.fetch(:bin, "").to_s
+         slurm = Adapters::Slurm::Batch.new(cluster: cluster, bin: bin)
+         Adapters::Slurm.new(slurm: slurm)
+       end
+     end
+
+     module Adapters
+       # An adapter object that describes the communication with a Slurm
+       # resource manager for job management.
+       class Slurm < Adapter
+         using Refinements::HashExtensions
+
+         # Object used for simplified communication with a Slurm batch server
+         # @api private
+         class Batch
+           # The cluster of the Slurm batch server
+           # @example CHPC's kingspeak cluster
+           #   my_batch.cluster #=> "kingspeak"
+           # @return [String] the cluster name
+           attr_reader :cluster
+
+           # The path to the Slurm client installation binaries
+           # @example For Slurm 10.0.0
+           #   my_batch.bin.to_s #=> "/usr/local/slurm/10.0.0/bin"
+           # @return [Pathname] path to slurm binaries
+           attr_reader :bin
+
+           # The root exception class that all Slurm-specific exceptions
+           # inherit from
+           class Error < StandardError; end
+
+           # @param cluster [#to_s] the cluster name
+           # @param bin [#to_s] path to slurm installation binaries
+           def initialize(cluster: "", bin: "")
+             @cluster = cluster.to_s
+             @bin = Pathname.new(bin.to_s)
+           end
+
+           # Get a list of hashes detailing each of the jobs on the batch server
+           # @example Status info for all jobs
+           #   my_batch.get_jobs
+           #   #=>
+           #   #[
+           #   #  {
+           #   #    :account => "account",
+           #   #    :job_id => "my_job",
+           #   #    ...
+           #   #  },
+           #   #  {
+           #   #    :account => "account",
+           #   #    :job_id => "my_other_job",
+           #   #    ...
+           #   #  },
+           #   #  ...
+           #   #]
+           # @param id [#to_s] the id of the job
+           # @param filters [Array<Symbol>] list of attributes to filter on
+           # @raise [Error] if `squeue` command exited unsuccessfully
+           # @return [Array<Hash>] list of details for jobs
+           def get_jobs(id: "", filters: [])
+             delim = "\x1F" # don't use "|" because FEATURES uses this
+             options = filters.empty? ? fields : fields.slice(*filters)
+             args = ["--all", "--states=all", "--noconvert"]
+             args += ["-o", options.values.join(delim)]
+             args += ["-j", id.to_s] unless id.to_s.empty?
+             lines = call("squeue", *args).split("\n").map(&:strip)
+
+             lines.drop(cluster.empty? ? 1 : 2).map do |line|
+               Hash[options.keys.zip(line.split(delim))]
+             end
+           end
+
+           # Put a specified job on hold
+           # @example Put job "1234" on hold
+           #   my_batch.hold_job("1234")
+           # @param id [#to_s] the id of the job
+           # @raise [Error] if `scontrol` command exited unsuccessfully
+           # @return [void]
+           def hold_job(id)
+             call("scontrol", "hold", id.to_s)
+           end
+
+           # Release a specified job that is on hold
+           # @example Release job "1234" from hold
+           #   my_batch.release_job("1234")
+           # @param id [#to_s] the id of the job
+           # @raise [Error] if `scontrol` command exited unsuccessfully
+           # @return [void]
+           def release_job(id)
+             call("scontrol", "release", id.to_s)
+           end
+
+           # Delete a specified job from the batch server
+           # @example Delete job "1234"
+           #   my_batch.delete_job("1234")
+           # @param id [#to_s] the id of the job
+           # @raise [Error] if `scancel` command exited unsuccessfully
+           # @return [void]
+           def delete_job(id)
+             call("scancel", id.to_s)
+           end
+
+           # Submit a script expanded as a string to the batch server
+           # @param str [#to_s] script as a string
+           # @param args [Array<#to_s>] arguments passed to `sbatch` command
+           # @param env [Hash{#to_s => #to_s}] environment variables set
+           # @raise [Error] if `sbatch` command exited unsuccessfully
+           # @return [String] the id of the job that was created
+           def submit_string(str, args: [], env: {})
+             args = args.map(&:to_s) + ["--parsable"]
+             env = {"SBATCH_EXPORT" => "NONE"}.merge env.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
+             call("sbatch", *args, env: env, stdin: str.to_s).strip.split(";").first
+           end
+
+           private
+           # Call a forked Slurm command for a given cluster
+           def call(cmd, *args, env: {}, stdin: "")
+             cmd = bin.join(cmd.to_s).to_s
+             args = args.map(&:to_s)
+             args += ["-M", cluster] unless cluster.empty?
+             env = env.to_h
+             o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s)
+             s.success? ? o : raise(Error, e)
+           end
+
+           # Fields requested from a formatted `squeue` call
+           def fields
+             {
+               account: "%a",
+               job_id: "%A",
+               gres: "%b",
+               exec_host: "%B",
+               min_cpus: "%c",
+               cpus: "%C",
+               min_tmp_disk: "%d",
+               nodes: "%D",
+               end_time: "%e",
+               dependency: "%E",
+               features: "%f",
+               array_job_id: "%F",
+               group_name: "%g",
+               group_id: "%G",
+               over_subscribe: "%h",
+               sockets_per_node: "%H",
+               array_job_task_id: "%i",
+               cores_per_socket: "%I",
+               job_name: "%j",
+               threads_per_core: "%J",
+               comment: "%k",
+               array_task_id: "%K",
+               time_limit: "%l",
+               time_left: "%L",
+               min_memory: "%m",
+               time_used: "%M",
+               req_node: "%n",
+               node_list: "%N",
+               command: "%o",
+               contiguous: "%O",
+               qos: "%q",
+               partition: "%P",
+               priority: "%Q",
+               reason: "%r",
+               start_time: "%S",
+               state_compact: "%t",
+               state: "%T",
+               user: "%u",
+               user_id: "%U",
+               reservation: "%v",
+               submit_time: "%V",
+               wckey: "%w",
+               licenses: "%W",
+               excluded_nodes: "%x",
+               core_specialization: "%X",
+               nice: "%y",
+               scheduled_nodes: "%Y",
+               sockets_cores_threads: "%z",
+               work_dir: "%Z"
+             }
+           end
+         end
+
+         # Mapping of state codes for Slurm
+         STATE_MAP = {
+           'BF' => :completed,  # BOOT_FAIL
+           'CA' => :completed,  # CANCELLED
+           'CD' => :completed,  # COMPLETED
+           'CF' => :queued,     # CONFIGURING
+           'CG' => :running,    # COMPLETING
+           'F'  => :completed,  # FAILED
+           'NF' => :completed,  # NODE_FAIL
+           'PD' => :queued,     # PENDING
+           'PR' => :suspended,  # PREEMPTED
+           'RV' => :completed,  # REVOKED
+           'R'  => :running,    # RUNNING
+           'SE' => :completed,  # SPECIAL_EXIT
+           'ST' => :running,    # STOPPED
+           'S'  => :suspended,  # SUSPENDED
+           'TO' => :completed   # TIMEOUT
+         }
+
+         # @api private
+         # @param opts [#to_h] the options defining this adapter
+         # @option opts [Batch] :slurm The Slurm batch object
+         # @see Factory.build_slurm
+         def initialize(opts = {})
+           o = opts.to_h.symbolize_keys
+
+           @slurm = o.fetch(:slurm) { raise ArgumentError, "No slurm object specified. Missing argument: slurm" }
+         end
+
+         # Submit a job with the attributes defined in the job template instance
+         # @param script [Script] script object that describes the script and
+         #   attributes for the submitted job
+         # @param after [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution at any point after dependent jobs have started execution
+         # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution only after dependent jobs have terminated with no errors
+         # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution only after dependent jobs have terminated with errors
+         # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution after dependent jobs have terminated
+         # @raise [JobAdapterError] if something goes wrong submitting a job
+         # @return [String] the job id returned after successfully submitting a
+         #   job
+         # @see Adapter#submit
+         def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+           after      = Array(after).map(&:to_s)
+           afterok    = Array(afterok).map(&:to_s)
+           afternotok = Array(afternotok).map(&:to_s)
+           afterany   = Array(afterany).map(&:to_s)
+
+           # Set sbatch options
+           args = []
+           # ignore args, don't know how to do this for slurm
+           args += ["-H"] if script.submit_as_hold
+           args += (script.rerunnable ? ["--requeue"] : ["--no-requeue"]) unless script.rerunnable.nil?
+           args += ["-D", script.workdir.to_s] unless script.workdir.nil?
+           args += ["--mail-user", script.email.join(",")] unless script.email.nil?
+           if script.email_on_started && script.email_on_terminated
+             args += ["--mail-type", "ALL"]
+           elsif script.email_on_started
+             args += ["--mail-type", "BEGIN"]
+           elsif script.email_on_terminated
+             args += ["--mail-type", "END"]
+           elsif script.email_on_started == false && script.email_on_terminated == false
+             args += ["--mail-type", "NONE"]
+           end
+           args += ["-J", script.job_name] unless script.job_name.nil?
+           args += ["-i", script.input_path] unless script.input_path.nil?
+           args += ["-o", script.output_path] unless script.output_path.nil?
+           args += ["-e", script.error_path] unless script.error_path.nil?
+           # ignore join_files, by default it joins stdout and stderr unless
+           # error_path is specified
+           args += ["--reservation", script.reservation_id] unless script.reservation_id.nil?
+           args += ["-p", script.queue_name] unless script.queue_name.nil?
+           args += ["--priority", script.priority] unless script.priority.nil?
+           args += ["--begin", script.start_time.localtime.strftime("%C%y-%m-%dT%H:%M:%S")] unless script.start_time.nil?
+           args += ["-A", script.accounting_id] unless script.accounting_id.nil?
+           args += ["--mem", "#{script.min_phys_memory}K"] unless script.min_phys_memory.nil?
+           args += ["-t", seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
+           # ignore nodes, don't know how to do this for slurm
+
+           # Set dependencies
+           depend = []
+           depend << "after:#{after.join(":")}" unless after.empty?
+           depend << "afterok:#{afterok.join(":")}" unless afterok.empty?
+           depend << "afternotok:#{afternotok.join(":")}" unless afternotok.empty?
+           depend << "afterany:#{afterany.join(":")}" unless afterany.empty?
+           args += ["-d", depend.join(",")] unless depend.empty?
+
+           # Set environment variables
+           env = script.job_environment || {}
+           args += ["--export", script.job_environment.keys.join(",")] unless script.job_environment.nil? || script.job_environment.empty?
+
+           # Set native options
+           args += script.native if script.native
+
+           # Submit job
+           @slurm.submit_string(script.content, args: args, env: env)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve info for all jobs from the resource manager
+         # @raise [JobAdapterError] if something goes wrong getting job info
+         # @return [Array<Info>] information describing submitted jobs
+         # @see Adapter#info_all
+         def info_all
+           @slurm.get_jobs.map do |v|
+             parse_job_info(v)
+           end
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve job info from the resource manager
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong getting job info
+         # @return [Info] information describing submitted job
+         # @see Adapter#info
+         def info(id)
+           id = id.to_s
+           info_ary = @slurm.get_jobs(id: id).map do |v|
+             parse_job_info(v)
+           end
+
+           # A job id can return multiple jobs if it corresponds to a job
+           # array id, so we need to find the job that corresponds to the
+           # given job id (if we can't find it, we assume it has completed)
+           info_ary.detect(-> { Info.new(id: id, status: :completed) }) do |info|
+             # Match the job id or the formatted job & task id "1234_0"
+             info.id == id || info.native[:array_job_task_id] == id
+           end
+         rescue Batch::Error => e
+           # set completed status if can't find job id
+           if /Invalid job id specified/ =~ e.message
+             Info.new(
+               id: id,
+               status: :completed
+             )
+           else
+             raise JobAdapterError, e.message
+           end
+         end
+
+         # Retrieve job status from resource manager
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong getting job status
+         # @return [Status] status of job
+         # @see Adapter#status
+         def status(id)
+           id = id.to_s
+           jobs = @slurm.get_jobs(
+             id: id,
+             filters: [:job_id, :array_job_task_id, :state_compact]
+           )
+           # A job id can return multiple jobs if it corresponds to a job array
+           # id, so we need to find the job that corresponds to the given job id
+           # (if we can't find it, we assume it has completed)
+           #
+           # Match against the job id or the formatted job & task id "1234_0"
+           if (job = jobs.detect { |j| j[:job_id] == id || j[:array_job_task_id] == id })
+             Status.new(state: get_state(job[:state_compact]))
+           else
+             # set completed status if can't find job id
+             Status.new(state: :completed)
+           end
+         rescue Batch::Error => e
+           # set completed status if can't find job id
+           if /Invalid job id specified/ =~ e.message
+             Status.new(state: :completed)
+           else
+             raise JobAdapterError, e.message
+           end
+         end
+
+         # Put the submitted job on hold
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong holding a job
+         # @return [void]
+         # @see Adapter#hold
+         def hold(id)
+           @slurm.hold_job(id.to_s)
+         rescue Batch::Error => e
+           # assume successful job hold if can't find job id
+           raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
+         end
+
+         # Release the job that is on hold
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong releasing a job
+         # @return [void]
+         # @see Adapter#release
+         def release(id)
+           @slurm.release_job(id.to_s)
+         rescue Batch::Error => e
+           # assume successful job release if can't find job id
+           raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
+         end
+
+         # Delete the submitted job
+         # @param id [#to_s] the id of the job
+         # @raise [JobAdapterError] if something goes wrong deleting a job
+         # @return [void]
+         # @see Adapter#delete
+         def delete(id)
+           @slurm.delete_job(id.to_s)
+         rescue Batch::Error => e
+           # assume successful job deletion if can't find job id
+           raise JobAdapterError, e.message unless /Invalid job id specified/ =~ e.message
+         end
+
+         private
+         # Convert duration to seconds
+         def duration_in_seconds(time)
+           return 0 if time.nil?
+           time, days = time.split("-").reverse
+           days.to_i * 24 * 3600 +
+             time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
+         end
+
+         # Convert seconds to duration
+         def seconds_to_duration(time)
+           "%02d:%02d:%02d" % [time / 3600, time / 60 % 60, time % 60]
+         end
+
+         # Convert host list string to individual nodes
+         #   "em082"
+         #   "em[014,055-056,161]"
+         #   "n0163/2,7,10-11+n0205/0-11+n0156/0-11"
+         def parse_nodes(node_list)
+           /^(?<prefix>[^\[]+)(\[(?<range>[^\]]+)\])?$/ =~ node_list
+
+           if range
+             range.split(",").map do |x|
+               x =~ /^(\d+)-(\d+)$/ ? ($1..$2).to_a : x
+             end.flatten.map do |n|
+               { name: prefix + n, procs: nil }
+             end
+           elsif prefix
+             [ { name: prefix, procs: nil } ]
+           else
+             []
+           end
+         end
+
+         # Determine state from Slurm state code
+         def get_state(st)
+           STATE_MAP.fetch(st, :undetermined)
+         end
+
+         # Parse hash describing Slurm job status
+         def parse_job_info(v)
+           allocated_nodes = parse_nodes(v[:node_list])
+           Info.new(
+             id: v[:job_id],
+             status: get_state(v[:state_compact]),
+             allocated_nodes: allocated_nodes,
+             submit_host: nil,
+             job_name: v[:job_name],
+             job_owner: v[:user],
+             accounting_id: v[:account],
+             procs: v[:cpus],
+             queue_name: v[:partition],
+             wallclock_time: duration_in_seconds(v[:time_used]),
+             cpu_time: nil,
+             submission_time: Time.parse(v[:submit_time]),
+             dispatch_time: v[:start_time] == "N/A" ? nil : Time.parse(v[:start_time]),
+             native: v
+           )
+         end
+       end
+     end
+   end
+ end
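
Taken together, a minimal usage sketch of the Slurm adapter above (the cluster name, binary path, and script values are illustrative; OodCore::Job::Script is assumed to accept the attributes that #submit reads, such as content, job_name, and wall_time):

  require "ood_core"

  # Build the adapter the same way Factory.build_slurm does
  adapter = OodCore::Job::Factory.build_slurm(cluster: "kingspeak", bin: "/usr/local/slurm/bin")

  # Attributes here map onto the sbatch flags assembled in #submit
  script = OodCore::Job::Script.new(
    content:   "#!/bin/bash\nsleep 60",
    job_name:  "test_job",   # becomes `-J test_job`
    wall_time: 3600          # becomes `-t 01:00:00` via seconds_to_duration
  )

  id = adapter.submit(script)   # runs `sbatch --parsable` and returns the job id
  adapter.status(id)            # Status derived from the STATE_MAP lookup
  adapter.info(id)              # Info with the node list expanded by parse_nodes
  adapter.delete(id)            # runs `scancel <id>`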