ood_core 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG.md +14 -1
- data/lib/ood_core/cluster.rb +1 -1
- data/lib/ood_core/job/adapters/lsf.rb +41 -18
- data/lib/ood_core/job/adapters/lsf/batch.rb +2 -2
- data/lib/ood_core/job/adapters/lsf/helper.rb +2 -0
- data/lib/ood_core/job/adapters/pbspro.rb +48 -12
- data/lib/ood_core/job/adapters/sge/batch.rb +9 -5
- data/lib/ood_core/job/adapters/sge/helper.rb +38 -1
- data/lib/ood_core/job/adapters/slurm.rb +173 -69
- data/lib/ood_core/job/info.rb +15 -0
- data/lib/ood_core/job/task.rb +6 -5
- data/lib/ood_core/version.rb +1 -1
- metadata +3 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: ac5caf10cd563acf0e8ef6a4b7d421b5718dc097
|
|
4
|
+
data.tar.gz: c9e401652e388868a2d583751ef94d50ccb2f22a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: db745be6e2bcc4a7c4bfcd31d0a47c50bb948be84d24c2bc5f45c7bab6bbf46e22d82ed86120087ce6c0e6d554d323acef53b4adde3ffc7eb801216cc419f986
|
|
7
|
+
data.tar.gz: '096513b3c128b32c81b19784ef56164ef74e158dbd72d37b47d51063cd6b89daac03a00abf632421cf4e3687390930b5bb3c3771b5eb7290c91fb54a2757bf21'
|
data/CHANGELOG.md
CHANGED
|
@@ -6,6 +6,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
|
|
6
6
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
|
+
## [0.9.0] - 2019-05-04
|
|
10
|
+
### Added
|
|
11
|
+
- Job array support for LSF and PBSPro
|
|
12
|
+
- Slurm adapter uses `squeue` owner filter (`-u`) for `info_where_owner`
|
|
13
|
+
|
|
14
|
+
### Fixed
|
|
15
|
+
- Grid Engine adapter now starts scripts in the current directory like all other adapters
|
|
16
|
+
- Fixed issue where Slurm comment field might break job info parsing
|
|
17
|
+
- Fixed possible crash when comparing two clusters if the id of one of the clusters is nil
|
|
18
|
+
- Fixed bug with the live system test that impacted non-Torque systems
|
|
19
|
+
- Fixed bug with Slurm adapter when submit time is not available
|
|
20
|
+
|
|
9
21
|
## [0.8.0] - 2019-01-29
|
|
10
22
|
### Added
|
|
11
23
|
- info_all_each and info_where_owner_each super class methods
|
|
@@ -165,7 +177,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
165
177
|
### Added
|
|
166
178
|
- Initial release!
|
|
167
179
|
|
|
168
|
-
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.
|
|
180
|
+
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.9.0...HEAD
|
|
181
|
+
[0.9.0]: https://github.com/OSC/ood_core/compare/v0.8.0...v0.9.0
|
|
169
182
|
[0.8.0]: https://github.com/OSC/ood_core/compare/v0.7.1...v0.8.0
|
|
170
183
|
[0.7.1]: https://github.com/OSC/ood_core/compare/v0.7.0...v0.7.1
|
|
171
184
|
[0.7.0]: https://github.com/OSC/ood_core/compare/v0.6.0...v0.7.0
|
data/lib/ood_core/cluster.rb
CHANGED
|
@@ -90,16 +90,8 @@ module OodCore
|
|
|
90
90
|
# @return [Info] information describing submitted job
|
|
91
91
|
# @see Adapter#info
|
|
92
92
|
def info(id)
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
if job
|
|
96
|
-
info_for_batch_hash(job)
|
|
97
|
-
else
|
|
98
|
-
Info.new(
|
|
99
|
-
id: id,
|
|
100
|
-
status: :completed
|
|
101
|
-
)
|
|
102
|
-
end
|
|
93
|
+
info_ary = batch.get_job(id: id).map{|v| info_for_batch_hash(v)}
|
|
94
|
+
handle_job_array(info_ary, id)
|
|
103
95
|
rescue Batch::Error => e
|
|
104
96
|
raise JobAdapterError, e.message
|
|
105
97
|
end
|
|
@@ -131,19 +123,13 @@ module OodCore
|
|
|
131
123
|
raise JobAdapterError, e.message
|
|
132
124
|
end
|
|
133
125
|
|
|
134
|
-
def supports_job_arrays?
|
|
135
|
-
false
|
|
136
|
-
end
|
|
137
|
-
|
|
138
126
|
# Retrieve job status from resource manager
|
|
139
127
|
# @param id [#to_s] the id of the job
|
|
140
128
|
# @raise [JobAdapterError] if something goes wrong getting job status
|
|
141
129
|
# @return [Status] status of job
|
|
142
130
|
# @see Adapter#status
|
|
143
131
|
def status(id)
|
|
144
|
-
|
|
145
|
-
state = job ? get_state(job[:status]) : :completed
|
|
146
|
-
Status.new(state: state)
|
|
132
|
+
info(id).status
|
|
147
133
|
rescue Batch::Error => e
|
|
148
134
|
raise JobAdapterError, e.message
|
|
149
135
|
end
|
|
@@ -196,8 +182,11 @@ module OodCore
|
|
|
196
182
|
dispatch_time = helper.parse_past_time(v[:start_time], ignore_errors: true)
|
|
197
183
|
finish_time = helper.parse_past_time(v[:finish_time], ignore_errors: true)
|
|
198
184
|
|
|
185
|
+
# Detect job array index from name
|
|
186
|
+
array_index = /(\[\d+\])$/.match(v[:name])
|
|
187
|
+
|
|
199
188
|
Info.new(
|
|
200
|
-
id: v[:id],
|
|
189
|
+
id: (array_index) ? "#{v[:id]}#{array_index[1]}" : v[:id],
|
|
201
190
|
status: get_state(v[:status]),
|
|
202
191
|
allocated_nodes: nodes,
|
|
203
192
|
submit_host: v[:from_host],
|
|
@@ -214,6 +203,40 @@ module OodCore
|
|
|
214
203
|
native: v
|
|
215
204
|
)
|
|
216
205
|
end
|
|
206
|
+
|
|
207
|
+
def handle_job_array(info_ary, id)
|
|
208
|
+
return Info.new(id: id, status: :completed) if info_ary.nil? || info_ary.empty?
|
|
209
|
+
return info_ary.first if info_ary.size == 1
|
|
210
|
+
|
|
211
|
+
parent_task_hash = build_proxy_parent(info_ary.first, id)
|
|
212
|
+
|
|
213
|
+
info_ary.map do |task_info|
|
|
214
|
+
parent_task_hash[:tasks] << {:id => task_info.id, :status => task_info.status}
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
parent_task_hash[:status] = parent_task_hash[:tasks].map{|task| task[:status]}.max
|
|
218
|
+
|
|
219
|
+
Info.new(**parent_task_hash)
|
|
220
|
+
end
|
|
221
|
+
|
|
222
|
+
# Proxy the first element as the parent hash; delete non-shared attributes
|
|
223
|
+
def build_proxy_parent(info, id)
|
|
224
|
+
info.to_h.merge({
|
|
225
|
+
:tasks => [],
|
|
226
|
+
:id => id
|
|
227
|
+
}).delete_if{
|
|
228
|
+
|key, _| [
|
|
229
|
+
:allocated_nodes, :dispatch_time,
|
|
230
|
+
:cpu_time, :wallclock_time, :status
|
|
231
|
+
].include?(key)
|
|
232
|
+
}.tap{
|
|
233
|
+
# Remove the child array index from the :job_name
|
|
234
|
+
|
|
235
|
+
# Note that a true representation of the parent should have the
|
|
236
|
+
# full array spec in the name. Worth attempting to reconstruct?
|
|
237
|
+
|h| h[:job_name] = h[:job_name].gsub(/\[[^\]]+\]/, '')
|
|
238
|
+
}
|
|
239
|
+
end
|
|
217
240
|
end
|
|
218
241
|
end
|
|
219
242
|
end
|
|
@@ -43,10 +43,10 @@ class OodCore::Job::Adapters::Lsf::Batch
|
|
|
43
43
|
# Get hash detailing the specified job
|
|
44
44
|
# @param id [#to_s] the id of the job to check
|
|
45
45
|
# @raise [Error] if `bjobs` command exited unsuccessfully
|
|
46
|
-
# @return [Hash] details of specified job
|
|
46
|
+
# @return [Array<Hash>] details of specified job
|
|
47
47
|
def get_job(id:)
|
|
48
48
|
args = %W( -a -w -W #{id.to_s} )
|
|
49
|
-
parse_bjobs_output(call("bjobs", *args))
|
|
49
|
+
parse_bjobs_output(call("bjobs", *args))
|
|
50
50
|
end
|
|
51
51
|
|
|
52
52
|
# status fields available from bjobs
|
|
@@ -81,6 +81,8 @@ class OodCore::Job::Adapters::Lsf::Helper
|
|
|
81
81
|
args += ["-P", script.accounting_id] unless script.accounting_id.nil?
|
|
82
82
|
args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
|
|
83
83
|
args += ["-J", script.job_name] unless script.job_name.nil?
|
|
84
|
+
args[-1] += "[#{script.job_array_request}]" unless script.job_array_request.nil?
|
|
85
|
+
|
|
84
86
|
args += ["-q", script.queue_name] unless script.queue_name.nil?
|
|
85
87
|
args += ["-U", script.reservation_id] unless script.reservation_id.nil?
|
|
86
88
|
args += ["-sp", script.priority] unless script.priority.nil?
|
|
@@ -86,8 +86,7 @@ module OodCore
|
|
|
86
86
|
# @raise [Error] if `qstat` command exited unsuccessfully
|
|
87
87
|
# @return [Array<Hash>] list of details for jobs
|
|
88
88
|
def get_jobs(id: "")
|
|
89
|
-
args = ["-f"] # display all information
|
|
90
|
-
args += ["-t"] # list subjobs
|
|
89
|
+
args = ["-f", "-t"] # display all information
|
|
91
90
|
args += [id.to_s] unless id.to_s.empty?
|
|
92
91
|
lines = call("qstat", *args).gsub("\n\t", "").split("\n").map(&:strip)
|
|
93
92
|
|
|
@@ -101,7 +100,8 @@ module OodCore
|
|
|
101
100
|
k2 ? ( hsh[k1] ||= {} and hsh[k1][k2] = value ) : ( hsh[k1] = value )
|
|
102
101
|
end
|
|
103
102
|
end
|
|
104
|
-
|
|
103
|
+
|
|
104
|
+
jobs
|
|
105
105
|
end
|
|
106
106
|
|
|
107
107
|
# Select batch jobs from the batch server
|
|
@@ -181,8 +181,8 @@ module OodCore
|
|
|
181
181
|
'U' => :suspended, # cycle-harvesting job is suspended due to keyboard activity
|
|
182
182
|
'E' => :running, # job is exiting after having run
|
|
183
183
|
'F' => :completed, # job is finished
|
|
184
|
-
'X' => :completed
|
|
185
|
-
#
|
|
184
|
+
'X' => :completed, # subjob has completed execution or has been deleted
|
|
185
|
+
'B' => :running # job array has at least one child running
|
|
186
186
|
}
|
|
187
187
|
|
|
188
188
|
# What percentage of jobs a user owns out of all jobs, used to decide
|
|
@@ -266,6 +266,8 @@ module OodCore
|
|
|
266
266
|
# mimics what the other resource managers do)
|
|
267
267
|
args += ["-j", "oe"] if script.error_path.nil?
|
|
268
268
|
|
|
269
|
+
args += ["-J", script.job_array_request] unless script.job_array_request.nil?
|
|
270
|
+
|
|
269
271
|
# Set native options
|
|
270
272
|
args += script.native if script.native
|
|
271
273
|
|
|
@@ -303,13 +305,21 @@ module OodCore
|
|
|
303
305
|
if usr_jobs.size > (qstat_factor * all_jobs.size)
|
|
304
306
|
super
|
|
305
307
|
else
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
308
|
+
begin
|
|
309
|
+
user_job_infos = []
|
|
310
|
+
usr_jobs.each do |id|
|
|
311
|
+
job = info(id)
|
|
312
|
+
user_job_infos << job
|
|
309
313
|
|
|
310
|
-
|
|
311
|
-
|
|
314
|
+
job.tasks.each {|task| user_job_infos << job.build_child_info(task)}
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
user_job_infos
|
|
318
|
+
rescue Batch::Error => e
|
|
319
|
+
raise JobAdapterError, e.message
|
|
320
|
+
end
|
|
312
321
|
end
|
|
322
|
+
end
|
|
313
323
|
|
|
314
324
|
# Retrieve job info from the resource manager
|
|
315
325
|
# @param id [#to_s] the id of the job
|
|
@@ -318,9 +328,18 @@ module OodCore
|
|
|
318
328
|
# @see Adapter#info
|
|
319
329
|
def info(id)
|
|
320
330
|
id = id.to_s
|
|
321
|
-
|
|
331
|
+
|
|
332
|
+
job_infos = @pbspro.get_jobs(id: id).map do |v|
|
|
322
333
|
parse_job_info(v)
|
|
323
|
-
end
|
|
334
|
+
end
|
|
335
|
+
|
|
336
|
+
if job_infos.empty?
|
|
337
|
+
Info.new(id: id, status: :completed)
|
|
338
|
+
elsif job_infos.length == 1
|
|
339
|
+
job_infos.first
|
|
340
|
+
else
|
|
341
|
+
process_job_array(id, job_infos)
|
|
342
|
+
end
|
|
324
343
|
rescue Batch::Error => e
|
|
325
344
|
# set completed status if can't find job id
|
|
326
345
|
if /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
|
|
@@ -434,6 +453,23 @@ module OodCore
|
|
|
434
453
|
native: v
|
|
435
454
|
)
|
|
436
455
|
end
|
|
456
|
+
|
|
457
|
+
# Combine the array parent with the states of its children
|
|
458
|
+
def process_job_array(id, jobs)
|
|
459
|
+
parent_job = jobs.select { |j| /\[\]/ =~ j.id }.first
|
|
460
|
+
parent = (parent_job) ? parent_job.to_h : {:id => id, :status => :undetermined}
|
|
461
|
+
|
|
462
|
+
# create task hashes from children
|
|
463
|
+
parent[:tasks] = jobs.reject { |j| /\[\]/ =~ j.id }.map do |j|
|
|
464
|
+
{
|
|
465
|
+
:id => j.id,
|
|
466
|
+
:status => j.status.to_sym,
|
|
467
|
+
:wallclock_time => j.wallclock_time
|
|
468
|
+
}
|
|
469
|
+
end
|
|
470
|
+
|
|
471
|
+
Info.new(**parent)
|
|
472
|
+
end
|
|
437
473
|
end
|
|
438
474
|
end
|
|
439
475
|
end
|
|
@@ -95,7 +95,12 @@ class OodCore::Job::Adapters::Sge::Batch
|
|
|
95
95
|
|
|
96
96
|
job_hash = listener.parsed_job
|
|
97
97
|
|
|
98
|
-
|
|
98
|
+
if job_hash[:id]
|
|
99
|
+
update_job_hash_status!(job_hash)
|
|
100
|
+
else
|
|
101
|
+
job_hash[:id] = job_id
|
|
102
|
+
job_hash[:status] = :completed
|
|
103
|
+
end
|
|
99
104
|
|
|
100
105
|
job_info = OodCore::Job::Info.new(**job_hash)
|
|
101
106
|
rescue REXML::ParseException => e
|
|
@@ -115,8 +120,8 @@ class OodCore::Job::Adapters::Sge::Batch
|
|
|
115
120
|
if get_status_from_drmaa?(job_hash)
|
|
116
121
|
begin
|
|
117
122
|
job_hash[:status] = get_status_from_drmma(job_hash[:id])
|
|
118
|
-
rescue DRMAA::
|
|
119
|
-
|
|
123
|
+
rescue DRMAA::DRMAAException => e
|
|
124
|
+
# log DRMAA error?
|
|
120
125
|
end
|
|
121
126
|
end
|
|
122
127
|
end
|
|
@@ -156,8 +161,7 @@ class OodCore::Job::Adapters::Sge::Batch
|
|
|
156
161
|
# @param job_id [#to_s]
|
|
157
162
|
# @return job_id [String]
|
|
158
163
|
def submit(content, args)
|
|
159
|
-
|
|
160
|
-
@helper.parse_job_id_from_qsub(call(*cmd, :stdin => content))
|
|
164
|
+
@helper.parse_job_id_from_qsub(call('qsub', *args, :stdin => content))
|
|
161
165
|
end
|
|
162
166
|
|
|
163
167
|
# Call a forked SGE command for a given batch server
|
|
@@ -20,7 +20,12 @@ class OodCore::Job::Adapters::Sge::Helper
|
|
|
20
20
|
args += ['-h'] if script.submit_as_hold
|
|
21
21
|
args += ['-r', 'yes'] if script.rerunnable
|
|
22
22
|
script.job_environment.each_pair {|k, v| args += ['-v', "#{k.to_s}=#{v.to_s}"]} unless script.job_environment.nil?
|
|
23
|
-
|
|
23
|
+
|
|
24
|
+
if script.workdir
|
|
25
|
+
args += ['-wd', script.workdir]
|
|
26
|
+
elsif ! script_contains_wd_directive?(script.content)
|
|
27
|
+
args += ['-cwd']
|
|
28
|
+
end
|
|
24
29
|
|
|
25
30
|
on_event_email = []
|
|
26
31
|
on_event_email << 'b' if script.email_on_started # beginning
|
|
@@ -47,6 +52,38 @@ class OodCore::Job::Adapters::Sge::Helper
|
|
|
47
52
|
args
|
|
48
53
|
end
|
|
49
54
|
|
|
55
|
+
# @brief Detect whether script content contains either -cwd or -wd
|
|
56
|
+
#
|
|
57
|
+
# @param content The script content
|
|
58
|
+
#
|
|
59
|
+
# Examples:
|
|
60
|
+
# #$-wd /home/ood/ondemand # should match
|
|
61
|
+
# #$ -wd /home/ood/ondemand # should match
|
|
62
|
+
# #$ -cwd /home/ood/ondemand # should match
|
|
63
|
+
# #$ -j yes -wd /home/ood/ondemand # should match
|
|
64
|
+
# #$ -j yes -o this-wd /home/ood/ondemand # should NOT match
|
|
65
|
+
# #$ -t 1-10:5 -wd /home/ood/ondemand # should NOT match
|
|
66
|
+
#
|
|
67
|
+
# @return [bool]
|
|
68
|
+
#
|
|
69
|
+
def script_contains_wd_directive?(content)
|
|
70
|
+
content.slice(
|
|
71
|
+
# Only search within the script's first 1024 characters in case the user is
|
|
72
|
+
# putting lots of non-line delimited data into their scripts.
|
|
73
|
+
0, 1024
|
|
74
|
+
).split(
|
|
75
|
+
"\n"
|
|
76
|
+
).any? {
|
|
77
|
+
|line|
|
|
78
|
+
# String must start with #$
|
|
79
|
+
# Match may be:
|
|
80
|
+
# Immediate -c?wd
|
|
81
|
+
# Eventual space or tab followed by -c?wd
|
|
82
|
+
# String may end with multiple characters
|
|
83
|
+
/^#\$(?:-c?wd|.*[ \t]+-c?wd).*$/ =~ line
|
|
84
|
+
}
|
|
85
|
+
end
|
|
86
|
+
|
|
50
87
|
# Raise exceptions when adapter is asked to perform an action that SGE does not support
|
|
51
88
|
# @raise [Error] when an incompatible action is requested
|
|
52
89
|
def raise_error_on_unsupported_args(script, after:, afterok:, afternotok:, afterany:)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
require "time"
|
|
2
2
|
require "ood_core/refinements/hash_extensions"
|
|
3
|
+
require "ood_core/refinements/array_extensions"
|
|
3
4
|
require "ood_core/job/adapters/helper"
|
|
4
5
|
|
|
5
6
|
module OodCore
|
|
@@ -29,10 +30,14 @@ module OodCore
|
|
|
29
30
|
# resource manager for job management.
|
|
30
31
|
class Slurm < Adapter
|
|
31
32
|
using Refinements::HashExtensions
|
|
33
|
+
using Refinements::ArrayExtensions
|
|
32
34
|
|
|
33
35
|
# Object used for simplified communication with a Slurm batch server
|
|
34
36
|
# @api private
|
|
35
37
|
class Batch
|
|
38
|
+
UNIT_SEPARATOR = "\x1F"
|
|
39
|
+
RECORD_SEPARATOR = "\x1E"
|
|
40
|
+
|
|
36
41
|
# The cluster of the Slurm batch server
|
|
37
42
|
# @example CHPC's kingspeak cluster
|
|
38
43
|
# my_batch.cluster #=> "kingspeak"
|
|
@@ -89,22 +94,66 @@ module OodCore
|
|
|
89
94
|
# # ...
|
|
90
95
|
# #]
|
|
91
96
|
# @param id [#to_s] the id of the job
|
|
92
|
-
# @param
|
|
97
|
+
# @param owner [String] the owner(s) of the job
|
|
98
|
+
# @param attrs [Array<Symbol>, nil] list of attributes requested when calling squeue
|
|
93
99
|
# @raise [Error] if `squeue` command exited unsuccessfully
|
|
94
100
|
# @return [Array<Hash>] list of details for jobs
|
|
95
|
-
def get_jobs(id: "",
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
101
|
+
def get_jobs(id: "", owner: nil, attrs: nil)
|
|
102
|
+
fields = squeue_fields(attrs)
|
|
103
|
+
args = squeue_args(id: id, owner: owner, options: fields.values)
|
|
104
|
+
|
|
105
|
+
#TODO: switch mock of Open3 to be the squeue mock script
|
|
106
|
+
# then you can use that for performance metrics
|
|
107
|
+
StringIO.open(call("squeue", *args)) do |output|
|
|
108
|
+
advance_past_squeue_header!(output)
|
|
109
|
+
|
|
110
|
+
jobs = []
|
|
111
|
+
output.each_line(RECORD_SEPARATOR) do |line|
|
|
112
|
+
# TODO: once you can do performance metrics you can test zip against some other tools
|
|
113
|
+
# or just small optimizations
|
|
114
|
+
# for example, fields is ALREADY A HASH and we are setting the VALUES to
|
|
115
|
+
# "line.strip.split(unit_separator)" array
|
|
116
|
+
#
|
|
117
|
+
# i.e. store keys in an array, do Hash[[keys, values].transpose]
|
|
118
|
+
#
|
|
119
|
+
# or
|
|
120
|
+
#
|
|
121
|
+
# job = {}
|
|
122
|
+
# keys.each_with_index { |key, index| job[key] = values[index] }
|
|
123
|
+
# jobs << job
|
|
124
|
+
#
|
|
125
|
+
# assuming keys and values are same length! if not we have an error!
|
|
126
|
+
values = line.chomp(RECORD_SEPARATOR).strip.split(UNIT_SEPARATOR)
|
|
127
|
+
jobs << Hash[fields.keys.zip(values)] unless values.empty?
|
|
128
|
+
end
|
|
129
|
+
jobs
|
|
130
|
+
end
|
|
131
|
+
end
|
|
102
132
|
|
|
103
|
-
|
|
104
|
-
|
|
133
|
+
def squeue_fields(attrs)
|
|
134
|
+
if attrs.nil?
|
|
135
|
+
all_squeue_fields
|
|
136
|
+
else
|
|
137
|
+
all_squeue_fields.slice(*squeue_attrs_for_info_attrs(Array.wrap(attrs) + squeue_required_fields))
|
|
105
138
|
end
|
|
106
139
|
end
|
|
107
140
|
|
|
141
|
+
def squeue_required_fields
|
|
142
|
+
#TODO: does this need to include :array_job_task_id?
|
|
143
|
+
#TODO: does it matter that order of the output can vary depending on the arguments and if "squeue_required_fields" are included?
|
|
144
|
+
# previously the order was "fields.keys"; i don't think it does
|
|
145
|
+
[:job_id, :state_compact]
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
#TODO: write some barebones test for this? like 2 options and id or no id
|
|
149
|
+
def squeue_args(id: "", owner: nil, options: [])
|
|
150
|
+
args = ["--all", "--states=all", "--noconvert"]
|
|
151
|
+
args += ["-o", "#{RECORD_SEPARATOR}#{options.join(UNIT_SEPARATOR)}"]
|
|
152
|
+
args += ["-u", owner.to_s] unless owner.to_s.empty?
|
|
153
|
+
args += ["-j", id.to_s] unless id.to_s.empty?
|
|
154
|
+
args
|
|
155
|
+
end
|
|
156
|
+
|
|
108
157
|
# Put a specified job on hold
|
|
109
158
|
# @example Put job "1234" on hold
|
|
110
159
|
# my_batch.hold_job("1234")
|
|
@@ -147,7 +196,82 @@ module OodCore
|
|
|
147
196
|
call("sbatch", *args, env: env, stdin: str.to_s).strip.split(";").first
|
|
148
197
|
end
|
|
149
198
|
|
|
199
|
+
# Fields requested from a formatted `squeue` call
|
|
200
|
+
# Note that the order of these fields is important
|
|
201
|
+
def all_squeue_fields
|
|
202
|
+
{
|
|
203
|
+
account: "%a",
|
|
204
|
+
job_id: "%A",
|
|
205
|
+
exec_host: "%B",
|
|
206
|
+
min_cpus: "%c",
|
|
207
|
+
cpus: "%C",
|
|
208
|
+
min_tmp_disk: "%d",
|
|
209
|
+
nodes: "%D",
|
|
210
|
+
end_time: "%e",
|
|
211
|
+
dependency: "%E",
|
|
212
|
+
features: "%f",
|
|
213
|
+
array_job_id: "%F",
|
|
214
|
+
group_name: "%g",
|
|
215
|
+
group_id: "%G",
|
|
216
|
+
over_subscribe: "%h",
|
|
217
|
+
sockets_per_node: "%H",
|
|
218
|
+
array_job_task_id: "%i",
|
|
219
|
+
cores_per_socket: "%I",
|
|
220
|
+
job_name: "%j",
|
|
221
|
+
threads_per_core: "%J",
|
|
222
|
+
comment: "%k",
|
|
223
|
+
array_task_id: "%K",
|
|
224
|
+
time_limit: "%l",
|
|
225
|
+
time_left: "%L",
|
|
226
|
+
min_memory: "%m",
|
|
227
|
+
time_used: "%M",
|
|
228
|
+
req_node: "%n",
|
|
229
|
+
node_list: "%N",
|
|
230
|
+
command: "%o",
|
|
231
|
+
contiguous: "%O",
|
|
232
|
+
qos: "%q",
|
|
233
|
+
partition: "%P",
|
|
234
|
+
priority: "%Q",
|
|
235
|
+
reason: "%r",
|
|
236
|
+
start_time: "%S",
|
|
237
|
+
state_compact: "%t",
|
|
238
|
+
state: "%T",
|
|
239
|
+
user: "%u",
|
|
240
|
+
user_id: "%U",
|
|
241
|
+
reservation: "%v",
|
|
242
|
+
submit_time: "%V",
|
|
243
|
+
wckey: "%w",
|
|
244
|
+
licenses: "%W",
|
|
245
|
+
excluded_nodes: "%x",
|
|
246
|
+
core_specialization: "%X",
|
|
247
|
+
nice: "%y",
|
|
248
|
+
scheduled_nodes: "%Y",
|
|
249
|
+
sockets_cores_threads: "%z",
|
|
250
|
+
work_dir: "%Z",
|
|
251
|
+
gres: "%b", # must come at the end to fix a bug with Slurm 18
|
|
252
|
+
}
|
|
253
|
+
end
|
|
254
|
+
|
|
150
255
|
private
|
|
256
|
+
# Modify the StringIO instance by advancing past the squeue header
|
|
257
|
+
#
|
|
258
|
+
# The first two "records" should always be discarded. Consider the
|
|
259
|
+
# following squeue with -M output (invisible characters shown):
|
|
260
|
+
#
|
|
261
|
+
# CLUSTER: slurm_cluster_name\n
|
|
262
|
+
# \x1EJOBID\x1F\x1FSTATE\n
|
|
263
|
+
# \x1E1\x1F\x1FR\n
|
|
264
|
+
# \x1E2\x1F\x1FPD\n
|
|
265
|
+
#
|
|
266
|
+
# Splitting on the record separator first gives the Cluster header,
|
|
267
|
+
# and then the regular header. If -M or --cluster is not specified
|
|
268
|
+
# the effect is the same because the record separator is at the
|
|
269
|
+
# start of the format string, so the first "record" would simply be
|
|
270
|
+
# empty.
|
|
271
|
+
def advance_past_squeue_header!(squeue_output)
|
|
272
|
+
2.times { squeue_output.gets(RECORD_SEPARATOR) }
|
|
273
|
+
end
|
|
274
|
+
|
|
151
275
|
# Call a forked Slurm command for a given cluster
|
|
152
276
|
def call(cmd, *args, env: {}, stdin: "")
|
|
153
277
|
cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
|
|
@@ -159,60 +283,25 @@ module OodCore
|
|
|
159
283
|
s.success? ? o : raise(Error, e)
|
|
160
284
|
end
|
|
161
285
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
array_job_task_id: "%i",
|
|
182
|
-
cores_per_socket: "%I",
|
|
183
|
-
job_name: "%j",
|
|
184
|
-
threads_per_core: "%J",
|
|
185
|
-
comment: "%k",
|
|
186
|
-
array_task_id: "%K",
|
|
187
|
-
time_limit: "%l",
|
|
188
|
-
time_left: "%L",
|
|
189
|
-
min_memory: "%m",
|
|
190
|
-
time_used: "%M",
|
|
191
|
-
req_node: "%n",
|
|
192
|
-
node_list: "%N",
|
|
193
|
-
command: "%o",
|
|
194
|
-
contiguous: "%O",
|
|
195
|
-
qos: "%q",
|
|
196
|
-
partition: "%P",
|
|
197
|
-
priority: "%Q",
|
|
198
|
-
reason: "%r",
|
|
199
|
-
start_time: "%S",
|
|
200
|
-
state_compact: "%t",
|
|
201
|
-
state: "%T",
|
|
202
|
-
user: "%u",
|
|
203
|
-
user_id: "%U",
|
|
204
|
-
reservation: "%v",
|
|
205
|
-
submit_time: "%V",
|
|
206
|
-
wckey: "%w",
|
|
207
|
-
licenses: "%W",
|
|
208
|
-
excluded_nodes: "%x",
|
|
209
|
-
core_specialization: "%X",
|
|
210
|
-
nice: "%y",
|
|
211
|
-
scheduled_nodes: "%Y",
|
|
212
|
-
sockets_cores_threads: "%z",
|
|
213
|
-
work_dir: "%Z",
|
|
214
|
-
gres: "%b", # must come at the end to fix a bug with Slurm 18
|
|
215
|
-
}
|
|
286
|
+
def squeue_attrs_for_info_attrs(attrs)
|
|
287
|
+
attrs.map { |a|
|
|
288
|
+
{
|
|
289
|
+
id: :job_id,
|
|
290
|
+
status: :state_compact,
|
|
291
|
+
allocated_nodes: [:node_list, :scheduled_nodes],
|
|
292
|
+
# submit_host: nil,
|
|
293
|
+
job_name: :job_name,
|
|
294
|
+
job_owner: :user,
|
|
295
|
+
accounting_id: :account,
|
|
296
|
+
procs: :cpus,
|
|
297
|
+
queue_name: :partition,
|
|
298
|
+
wallclock_time: :time_used,
|
|
299
|
+
wallclock_limit: :time_limit,
|
|
300
|
+
# cpu_time: nil,
|
|
301
|
+
submission_time: :submit_time,
|
|
302
|
+
dispatch_time: :start_time
|
|
303
|
+
}.fetch(a, a)
|
|
304
|
+
}.flatten
|
|
216
305
|
end
|
|
217
306
|
end
|
|
218
307
|
|
|
@@ -328,7 +417,7 @@ module OodCore
|
|
|
328
417
|
# @return [Array<Info>] information describing submitted jobs
|
|
329
418
|
# @see Adapter#info_all
|
|
330
419
|
def info_all(attrs: nil)
|
|
331
|
-
@slurm.get_jobs.map do |v|
|
|
420
|
+
@slurm.get_jobs(attrs: attrs).map do |v|
|
|
332
421
|
parse_job_info(v)
|
|
333
422
|
end
|
|
334
423
|
rescue Batch::Error => e
|
|
@@ -360,6 +449,20 @@ module OodCore
|
|
|
360
449
|
end
|
|
361
450
|
end
|
|
362
451
|
|
|
452
|
+
# Retrieve info for all jobs for a given owner or owners from the
|
|
453
|
+
# resource manager
|
|
454
|
+
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
|
|
455
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
|
456
|
+
# @return [Array<Info>] information describing submitted jobs
|
|
457
|
+
def info_where_owner(owner, attrs: nil)
|
|
458
|
+
owner = Array.wrap(owner).map(&:to_s).join(',')
|
|
459
|
+
@slurm.get_jobs(owner: owner).map do |v|
|
|
460
|
+
parse_job_info(v)
|
|
461
|
+
end
|
|
462
|
+
rescue Batch::Error => e
|
|
463
|
+
raise JobAdapterError, e.message
|
|
464
|
+
end
|
|
465
|
+
|
|
363
466
|
# Retrieve job status from resource manager
|
|
364
467
|
# @param id [#to_s] the id of the job
|
|
365
468
|
# @raise [JobAdapterError] if something goes wrong getting job status
|
|
@@ -369,7 +472,7 @@ module OodCore
|
|
|
369
472
|
id = id.to_s
|
|
370
473
|
jobs = @slurm.get_jobs(
|
|
371
474
|
id: id,
|
|
372
|
-
|
|
475
|
+
attrs: [:job_id, :array_job_task_id, :state_compact]
|
|
373
476
|
)
|
|
374
477
|
# A job id can return multiple jobs if it corresponds to a job array
|
|
375
478
|
# id, so we need to find the job that corresponds to the given job id
|
|
@@ -478,6 +581,7 @@ module OodCore
|
|
|
478
581
|
allocated_nodes = [ { name: nil } ] * v[:nodes].to_i
|
|
479
582
|
end
|
|
480
583
|
end
|
|
584
|
+
|
|
481
585
|
Info.new(
|
|
482
586
|
id: v[:job_id],
|
|
483
587
|
status: get_state(v[:state_compact]),
|
|
@@ -491,8 +595,8 @@ module OodCore
|
|
|
491
595
|
wallclock_time: duration_in_seconds(v[:time_used]),
|
|
492
596
|
wallclock_limit: duration_in_seconds(v[:time_limit]),
|
|
493
597
|
cpu_time: nil,
|
|
494
|
-
submission_time: Time.parse(v[:submit_time]),
|
|
495
|
-
dispatch_time: v[:start_time] == "N/A" ? nil : Time.parse(v[:start_time]),
|
|
598
|
+
submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
|
|
599
|
+
dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
|
|
496
600
|
native: v
|
|
497
601
|
)
|
|
498
602
|
end
|
|
@@ -500,7 +604,7 @@ module OodCore
|
|
|
500
604
|
def handle_job_array(info_ary, id)
|
|
501
605
|
# If only one job was returned we return it
|
|
502
606
|
return info_ary.first unless info_ary.length > 1
|
|
503
|
-
|
|
607
|
+
|
|
504
608
|
parent_task_hash = {:tasks => []}
|
|
505
609
|
|
|
506
610
|
info_ary.map do |task_info|
|
data/lib/ood_core/job/info.rb
CHANGED
|
@@ -113,6 +113,21 @@ module OodCore
|
|
|
113
113
|
@native = native
|
|
114
114
|
end
|
|
115
115
|
|
|
116
|
+
# Create a new Info for a child task
|
|
117
|
+
# @return [Info] merging the parent and the child task
|
|
118
|
+
def build_child_info(task)
|
|
119
|
+
parent_only_keys = [
|
|
120
|
+
:allocated_nodes,
|
|
121
|
+
:procs,
|
|
122
|
+
:cpu_time,
|
|
123
|
+
:dispatch_time,
|
|
124
|
+
:native,
|
|
125
|
+
:tasks
|
|
126
|
+
]
|
|
127
|
+
|
|
128
|
+
new(**to_h.merge(task.to_h).delete_if{|k, v| parent_only_keys.include?(k)})
|
|
129
|
+
end
|
|
130
|
+
|
|
116
131
|
# Convert object to hash
|
|
117
132
|
# @return [Hash] object as hash
|
|
118
133
|
def to_h
|
data/lib/ood_core/job/task.rb
CHANGED
|
@@ -1,18 +1,19 @@
|
|
|
1
1
|
module OodCore
|
|
2
2
|
module Job
|
|
3
3
|
class Task
|
|
4
|
-
attr_reader :id
|
|
5
|
-
attr_reader :status
|
|
4
|
+
attr_reader :id, :status, :wallclock_time
|
|
6
5
|
|
|
7
|
-
def initialize(id:, status:, **_)
|
|
8
|
-
@
|
|
6
|
+
def initialize(id:, status:, wallclock_time: nil, **_)
|
|
7
|
+
@id = id.to_s
|
|
9
8
|
@status = OodCore::Job::Status.new(state: status)
|
|
9
|
+
@wallclock_time = wallclock_time && wallclock_time.to_i
|
|
10
10
|
end
|
|
11
11
|
|
|
12
12
|
def to_h
|
|
13
13
|
{
|
|
14
14
|
:id => id,
|
|
15
|
-
:status => status
|
|
15
|
+
:status => status,
|
|
16
|
+
:wallclock_time => wallclock_time
|
|
16
17
|
}
|
|
17
18
|
end
|
|
18
19
|
|
data/lib/ood_core/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ood_core
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.9.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Franz
|
|
@@ -10,7 +10,7 @@ authors:
|
|
|
10
10
|
autorequire:
|
|
11
11
|
bindir: exe
|
|
12
12
|
cert_chain: []
|
|
13
|
-
date: 2019-
|
|
13
|
+
date: 2019-05-03 00:00:00.000000000 Z
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
16
16
|
name: ood_support
|
|
@@ -147,7 +147,6 @@ files:
|
|
|
147
147
|
- lib/ood_core/cluster.rb
|
|
148
148
|
- lib/ood_core/clusters.rb
|
|
149
149
|
- lib/ood_core/errors.rb
|
|
150
|
-
- lib/ood_core/job/._task_status.rb
|
|
151
150
|
- lib/ood_core/job/adapter.rb
|
|
152
151
|
- lib/ood_core/job/adapters/drmaa.rb
|
|
153
152
|
- lib/ood_core/job/adapters/helper.rb
|
|
@@ -198,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
198
197
|
version: '0'
|
|
199
198
|
requirements: []
|
|
200
199
|
rubyforge_project:
|
|
201
|
-
rubygems_version: 2.
|
|
200
|
+
rubygems_version: 2.6.11
|
|
202
201
|
signing_key:
|
|
203
202
|
specification_version: 4
|
|
204
203
|
summary: Open OnDemand core library
|