ood_core 0.8.0 → 0.9.0
- checksums.yaml +5 -5
- data/CHANGELOG.md +14 -1
- data/lib/ood_core/cluster.rb +1 -1
- data/lib/ood_core/job/adapters/lsf.rb +41 -18
- data/lib/ood_core/job/adapters/lsf/batch.rb +2 -2
- data/lib/ood_core/job/adapters/lsf/helper.rb +2 -0
- data/lib/ood_core/job/adapters/pbspro.rb +48 -12
- data/lib/ood_core/job/adapters/sge/batch.rb +9 -5
- data/lib/ood_core/job/adapters/sge/helper.rb +38 -1
- data/lib/ood_core/job/adapters/slurm.rb +173 -69
- data/lib/ood_core/job/info.rb +15 -0
- data/lib/ood_core/job/task.rb +6 -5
- data/lib/ood_core/version.rb +1 -1
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
-
-  metadata.gz:
-  data.tar.gz:
+SHA1:
+  metadata.gz: ac5caf10cd563acf0e8ef6a4b7d421b5718dc097
+  data.tar.gz: c9e401652e388868a2d583751ef94d50ccb2f22a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: db745be6e2bcc4a7c4bfcd31d0a47c50bb948be84d24c2bc5f45c7bab6bbf46e22d82ed86120087ce6c0e6d554d323acef53b4adde3ffc7eb801216cc419f986
+  data.tar.gz: '096513b3c128b32c81b19784ef56164ef74e158dbd72d37b47d51063cd6b89daac03a00abf632421cf4e3687390930b5bb3c3771b5eb7290c91fb54a2757bf21'
data/CHANGELOG.md
CHANGED
@@ -6,6 +6,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+## [0.9.0] - 2019-05-04
+### Added
+- Job array support for LSF and PBSPro
+- Slurm adapter uses `squeue` owner filter (`-u`) for `info_where_owner`
+
+### Fixed
+- Grid Engine adapter now starts scripts in the current directory like all other adapters
+- Fixed issue where Slurm comment field might break job info parsing
+- Fixed possible crash when comparing two clusters if the id of one of the clusters is nil
+- Fixed bug with the live system test that impacted non-Torque systems
+- Fixed bug with Slurm adapter when submit time is not available
+
 ## [0.8.0] - 2019-01-29
 ### Added
 - info_all_each and info_where_owner_each super class methods
@@ -165,7 +177,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ### Added
 - Initial release!
 
-[Unreleased]: https://github.com/OSC/ood_core/compare/v0.8.0...HEAD
+[Unreleased]: https://github.com/OSC/ood_core/compare/v0.9.0...HEAD
+[0.9.0]: https://github.com/OSC/ood_core/compare/v0.8.0...v0.9.0
 [0.8.0]: https://github.com/OSC/ood_core/compare/v0.7.1...v0.8.0
 [0.7.1]: https://github.com/OSC/ood_core/compare/v0.7.0...v0.7.1
 [0.7.0]: https://github.com/OSC/ood_core/compare/v0.6.0...v0.7.0
data/lib/ood_core/job/adapters/lsf.rb
CHANGED
@@ -90,16 +90,8 @@ module OodCore
       # @return [Info] information describing submitted job
       # @see Adapter#info
       def info(id)
-
-
-        if job
-          info_for_batch_hash(job)
-        else
-          Info.new(
-            id: id,
-            status: :completed
-          )
-        end
+        info_ary = batch.get_job(id: id).map{|v| info_for_batch_hash(v)}
+        handle_job_array(info_ary, id)
       rescue Batch::Error => e
         raise JobAdapterError, e.message
       end
@@ -131,19 +123,13 @@ module OodCore
         raise JobAdapterError, e.message
       end
 
-      def supports_job_arrays?
-        false
-      end
-
       # Retrieve job status from resource manager
       # @param id [#to_s] the id of the job
       # @raise [JobAdapterError] if something goes wrong getting job status
       # @return [Status] status of job
       # @see Adapter#status
       def status(id)
-
-        state = job ? get_state(job[:status]) : :completed
-        Status.new(state: state)
+        info(id).status
       rescue Batch::Error => e
         raise JobAdapterError, e.message
       end
@@ -196,8 +182,11 @@ module OodCore
           dispatch_time = helper.parse_past_time(v[:start_time], ignore_errors: true)
           finish_time = helper.parse_past_time(v[:finish_time], ignore_errors: true)
 
+          # Detect job array index from name
+          array_index = /(\[\d+\])$/.match(v[:name])
+
           Info.new(
-            id: v[:id],
+            id: (array_index) ? "#{v[:id]}#{array_index[1]}" : v[:id],
             status: get_state(v[:status]),
             allocated_nodes: nodes,
             submit_host: v[:from_host],
@@ -214,6 +203,40 @@ module OodCore
           native: v
         )
       end
+
+      def handle_job_array(info_ary, id)
+        return Info.new(id: id, status: :completed) if info_ary.nil? || info_ary.empty?
+        return info_ary.first if info_ary.size == 1
+
+        parent_task_hash = build_proxy_parent(info_ary.first, id)
+
+        info_ary.map do |task_info|
+          parent_task_hash[:tasks] << {:id => task_info.id, :status => task_info.status}
+        end
+
+        parent_task_hash[:status] = parent_task_hash[:tasks].map{|task| task[:status]}.max
+
+        Info.new(**parent_task_hash)
+      end
+
+      # Proxy the first element as the parent hash; delete non-shared attributes
+      def build_proxy_parent(info, id)
+        info.to_h.merge({
+          :tasks => [],
+          :id => id
+        }).delete_if{
+          |key, _| [
+            :allocated_nodes, :dispatch_time,
+            :cpu_time, :wallclock_time, :status
+          ].include?(key)
+        }.tap{
+          # Remove the child array index from the :job_name
+
+          # Note that a true representation of the parent should have the
+          # full array spec in the name. Worth attempting to reconstruct?
+          |h| h[:job_name] = h[:job_name].gsub(/\[[^\]]+\]/, '')
+        }
+      end
     end
   end
 end
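The `max` above works because `OodCore::Job::Status` is comparable, so the array parent reports the most "active" of its children's states. A minimal sketch (it assumes :running outranks :queued and :completed in the gem's precedence order):

    require "ood_core"

    statuses = [:completed, :running, :queued].map do |state|
      OodCore::Job::Status.new(state: state)
    end

    # One running child is enough to mark the whole array as running.
    puts statuses.max.to_s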
data/lib/ood_core/job/adapters/lsf/batch.rb
CHANGED

@@ -43,10 +43,10 @@ class OodCore::Job::Adapters::Lsf::Batch
   # Get hash detailing the specified job
   # @param id [#to_s] the id of the job to check
   # @raise [Error] if `bjobs` command exited unsuccessfully
-  # @return [Hash] details of specified job
+  # @return [Array<Hash>] details of specified job
   def get_job(id:)
     args = %W( -a -w -W #{id.to_s} )
-    parse_bjobs_output(call("bjobs", *args))
+    parse_bjobs_output(call("bjobs", *args))
   end
 
   # status fields available from bjobs
data/lib/ood_core/job/adapters/lsf/helper.rb
CHANGED

@@ -81,6 +81,8 @@ class OodCore::Job::Adapters::Lsf::Helper
     args += ["-P", script.accounting_id] unless script.accounting_id.nil?
     args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
     args += ["-J", script.job_name] unless script.job_name.nil?
+    args[-1] += "[#{script.job_array_request}]" unless script.job_array_request.nil?
+
     args += ["-q", script.queue_name] unless script.queue_name.nil?
     args += ["-U", script.reservation_id] unless script.reservation_id.nil?
     args += ["-sp", script.priority] unless script.priority.nil?
data/lib/ood_core/job/adapters/pbspro.rb
CHANGED

@@ -86,8 +86,7 @@ module OodCore
      # @raise [Error] if `qstat` command exited unsuccessfully
      # @return [Array<Hash>] list of details for jobs
      def get_jobs(id: "")
-        args = ["-f"] # display all information
-        args += ["-t"] # list subjobs
+        args = ["-f", "-t"] # display all information
        args += [id.to_s] unless id.to_s.empty?
        lines = call("qstat", *args).gsub("\n\t", "").split("\n").map(&:strip)
 
@@ -101,7 +100,8 @@ module OodCore
            k2 ? ( hsh[k1] ||= {} and hsh[k1][k2] = value ) : ( hsh[k1] = value )
          end
        end
-
+
+        jobs
      end
 
      # Select batch jobs from the batch server
@@ -181,8 +181,8 @@ module OodCore
        'U' => :suspended, # cycle-harvesting job is suspended due to keyboard activity
        'E' => :running,   # job is exiting after having run
        'F' => :completed, # job is finished
-        'X' => :completed
-        #
+        'X' => :completed, # subjob has completed execution or has been deleted
+        'B' => :running    # job array has at least one child running
      }
 
      # What percentage of jobs a user owns out of all jobs, used to decide
@@ -266,6 +266,8 @@ module OodCore
        # mimics what the other resource managers do)
        args += ["-j", "oe"] if script.error_path.nil?
 
+        args += ["-J", script.job_array_request] unless script.job_array_request.nil?
+
        # Set native options
        args += script.native if script.native
 
@@ -303,13 +305,21 @@ module OodCore
        if usr_jobs.size > (qstat_factor * all_jobs.size)
          super
        else
-
-
-
+          begin
+            user_job_infos = []
+            usr_jobs.each do |id|
+              job = info(id)
+              user_job_infos << job
 
-
-
+              job.tasks.each {|task| user_job_infos << job.build_child_info(task)}
+            end
+
+            user_job_infos
+          rescue Batch::Error => e
+            raise JobAdapterError, e.message
+          end
        end
+      end
 
      # Retrieve job info from the resource manager
      # @param id [#to_s] the id of the job
@@ -318,9 +328,18 @@ module OodCore
      # @see Adapter#info
      def info(id)
        id = id.to_s
-
+
+        job_infos = @pbspro.get_jobs(id: id).map do |v|
          parse_job_info(v)
-        end
+        end
+
+        if job_infos.empty?
+          Info.new(id: id, status: :completed)
+        elsif job_infos.length == 1
+          job_infos.first
+        else
+          process_job_array(id, job_infos)
+        end
      rescue Batch::Error => e
        # set completed status if can't find job id
        if /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
@@ -434,6 +453,23 @@ module OodCore
          native: v
        )
      end
+
+      # Combine the array parent with the states of its children
+      def process_job_array(id, jobs)
+        parent_job = jobs.select { |j| /\[\]/ =~ j.id }.first
+        parent = (parent_job) ? parent_job.to_h : {:id => id, :status => :undetermined}
+
+        # create task hashes from children
+        parent[:tasks] = jobs.reject { |j| /\[\]/ =~ j.id }.map do |j|
+          {
+            :id => j.id,
+            :status => j.status.to_sym,
+            :wallclock_time => j.wallclock_time
+          }
+        end
+
+        Info.new(**parent)
+      end
    end
  end
end
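For context on the `/\[\]/` checks above: with `qstat -t`, PBSPro reports the array parent with an empty bracket pair (e.g. `1234[].server`) and each subjob with its index (e.g. `1234[1].server`). A sketch of the partitioning (ids are illustrative):

    ids = ["1234[].pbs01", "1234[1].pbs01", "1234[2].pbs01"]
    parents, children = ids.partition { |id| /\[\]/ =~ id }
    parents   #=> ["1234[].pbs01"]
    children  #=> ["1234[1].pbs01", "1234[2].pbs01"]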
data/lib/ood_core/job/adapters/sge/batch.rb
CHANGED

@@ -95,7 +95,12 @@ class OodCore::Job::Adapters::Sge::Batch
 
    job_hash = listener.parsed_job
 
-
+    if job_hash[:id]
+      update_job_hash_status!(job_hash)
+    else
+      job_hash[:id] = job_id
+      job_hash[:status] = :completed
+    end
 
    job_info = OodCore::Job::Info.new(**job_hash)
  rescue REXML::ParseException => e
@@ -115,8 +120,8 @@ class OodCore::Job::Adapters::Sge::Batch
    if get_status_from_drmaa?(job_hash)
      begin
        job_hash[:status] = get_status_from_drmma(job_hash[:id])
-      rescue DRMAA::
-
+      rescue DRMAA::DRMAAException => e
+        # log DRMAA error?
      end
    end
  end
@@ -156,8 +161,7 @@ class OodCore::Job::Adapters::Sge::Batch
  # @param job_id [#to_s]
  # @return job_id [String]
  def submit(content, args)
-
-    @helper.parse_job_id_from_qsub(call(*cmd, :stdin => content))
+    @helper.parse_job_id_from_qsub(call('qsub', *args, :stdin => content))
  end
 
  # Call a forked SGE command for a given batch server
data/lib/ood_core/job/adapters/sge/helper.rb
CHANGED

@@ -20,7 +20,12 @@ class OodCore::Job::Adapters::Sge::Helper
    args += ['-h'] if script.submit_as_hold
    args += ['-r', 'yes'] if script.rerunnable
    script.job_environment.each_pair {|k, v| args += ['-v', "#{k.to_s}=#{v.to_s}"]} unless script.job_environment.nil?
-
+
+    if script.workdir
+      args += ['-wd', script.workdir]
+    elsif ! script_contains_wd_directive?(script.content)
+      args += ['-cwd']
+    end
 
    on_event_email = []
    on_event_email << 'b' if script.email_on_started # beginning
@@ -47,6 +52,38 @@ class OodCore::Job::Adapters::Sge::Helper
    args
  end
 
+  # @brief Detect whether script content contains either -cwd or -wd
+  #
+  # @param content The script content
+  #
+  # Examples:
+  #   #$-wd /home/ood/ondemand                 # should match
+  #   #$ -wd /home/ood/ondemand                # should match
+  #   #$ -cwd /home/ood/ondemand               # should match
+  #   #$ -j yes -wd /home/ood/ondemand         # should match
+  #   #$ -j yes -o this-wd /home/ood/ondemand  # should NOT match
+  #   #$ -t 1-10:5 -wd /home/ood/ondemand      # should NOT match
+  #
+  # @return [bool]
+  #
+  def script_contains_wd_directive?(content)
+    content.slice(
+      # Only search within the script's first 1024 characters in case the user is
+      # putting lots of non-line delimited data into their scripts.
+      0, 1024
+    ).split(
+      "\n"
+    ).any? {
+      |line|
+      # String must start with #$
+      # Match may be:
+      #   Immediate -c?wd
+      #   Eventual space or tab followed by -c?wd
+      # String may end with multiple characters
+      /^#\$(?:-c?wd|.*[ \t]+-c?wd).*$/ =~ line
+    }
+  end
+
  # Raise exceptions when adapter is asked to perform an action that SGE does not support
  # @raise [Error] when an incompatible action is requested
  def raise_error_on_unsupported_args(script, after:, afterok:, afternotok:, afterany:)
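A quick check of the directive regex against the examples documented in the comment (plain Ruby; trimmed to three cases):

    WD_DIRECTIVE = /^#\$(?:-c?wd|.*[ \t]+-c?wd).*$/

    [
      '#$-wd /home/ood/ondemand',         # matches: immediate -wd
      '#$ -cwd /home/ood/ondemand',       # matches: whitespace then -cwd
      '#$ -j yes -o this-wd /somewhere',  # no match: "-wd" not preceded by whitespace
    ].each { |line| puts "#{line.inspect} => #{!!(WD_DIRECTIVE =~ line)}" }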
data/lib/ood_core/job/adapters/slurm.rb
CHANGED

@@ -1,5 +1,6 @@
 require "time"
 require "ood_core/refinements/hash_extensions"
+require "ood_core/refinements/array_extensions"
 require "ood_core/job/adapters/helper"
 
 module OodCore
@@ -29,10 +30,14 @@ module OodCore
     # resource manager for job management.
     class Slurm < Adapter
       using Refinements::HashExtensions
+      using Refinements::ArrayExtensions
 
       # Object used for simplified communication with a Slurm batch server
       # @api private
       class Batch
+        UNIT_SEPARATOR = "\x1F"
+        RECORD_SEPARATOR = "\x1E"
+
         # The cluster of the Slurm batch server
         # @example CHPC's kingspeak cluster
         #   my_batch.cluster #=> "kingspeak"
@@ -89,22 +94,66 @@ module OodCore
         # #   ...
         # #]
         # @param id [#to_s] the id of the job
-        # @param
+        # @param owner [String] the owner(s) of the job
+        # @param attrs [Array<Symbol>, nil] list of attributes request when calling squeue
         # @raise [Error] if `squeue` command exited unsuccessfully
         # @return [Array<Hash>] list of details for jobs
-        def get_jobs(id: "",
-
-
-
-
-
-
+        def get_jobs(id: "", owner: nil, attrs: nil)
+          fields = squeue_fields(attrs)
+          args = squeue_args(id: id, owner: owner, options: fields.values)
+
+          #TODO: switch mock of Open3 to be the squeue mock script
+          # then you can use that for performance metrics
+          StringIO.open(call("squeue", *args)) do |output|
+            advance_past_squeue_header!(output)
+
+            jobs = []
+            output.each_line(RECORD_SEPARATOR) do |line|
+              # TODO: once you can do performance metrics you can test zip against some other tools
+              # or just small optimizations
+              # for example, fields is ALREADY A HASH and we are setting the VALUES to
+              # "line.strip.split(unit_separator)" array
+              #
+              # i.e. store keys in an array, do Hash[[keys, values].transpose]
+              #
+              # or
+              #
+              # job = {}
+              # keys.each_with_index { |key, index| [key] = values[index] }
+              # jobs << job
+              #
+              # assuming keys and values are same length! if not we have an error!
+              values = line.chomp(RECORD_SEPARATOR).strip.split(UNIT_SEPARATOR)
+              jobs << Hash[fields.keys.zip(values)] unless values.empty?
+            end
+            jobs
+          end
+        end
 
-
-
+        def squeue_fields(attrs)
+          if attrs.nil?
+            all_squeue_fields
+          else
+            all_squeue_fields.slice(*squeue_attrs_for_info_attrs(Array.wrap(attrs) + squeue_required_fields))
           end
         end
 
+        def squeue_required_fields
+          #TODO: does this need to include ::array_job_task_id?
+          #TODO: does it matter that order of the output can vary depending on the arguments and if "squeue_required_fields" are included?
+          # previously the order was "fields.keys"; i don't think it does
+          [:job_id, :state_compact]
+        end
+
+        #TODO: write some barebones test for this? like 2 options and id or no id
+        def squeue_args(id: "", owner: nil, options: [])
+          args = ["--all", "--states=all", "--noconvert"]
+          args += ["-o", "#{RECORD_SEPARATOR}#{options.join(UNIT_SEPARATOR)}"]
+          args += ["-u", owner.to_s] unless owner.to_s.empty?
+          args += ["-j", id.to_s] unless id.to_s.empty?
+          args
+        end
+
         # Put a specified job on hold
         # @example Put job "1234" on hold
         #   my_batch.hold_job("1234")
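Why the separators: the `-o` format string is prefixed with `\x1E` and its fields joined with `\x1F`, control characters that cannot appear in job names or comments, which is what fixes the "comment field might break job info parsing" bug from the changelog. A sketch of the round trip with made-up records:

    UNIT_SEPARATOR = "\x1F"
    RECORD_SEPARATOR = "\x1E"

    keys = [:job_id, :state_compact, :comment]
    raw  = "\x1E101\x1FR\x1Fa|comment with pipes\n\x1E102\x1FPD\x1Fanother\n"

    jobs = raw.split(RECORD_SEPARATOR).reject(&:empty?).map do |rec|
      Hash[keys.zip(rec.strip.split(UNIT_SEPARATOR))]
    end
    jobs.first  #=> {:job_id=>"101", :state_compact=>"R", :comment=>"a|comment with pipes"}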
@@ -147,7 +196,82 @@ module OodCore
           call("sbatch", *args, env: env, stdin: str.to_s).strip.split(";").first
         end
 
+        # Fields requested from a formatted `squeue` call
+        # Note that the order of these fields is important
+        def all_squeue_fields
+          {
+            account: "%a",
+            job_id: "%A",
+            exec_host: "%B",
+            min_cpus: "%c",
+            cpus: "%C",
+            min_tmp_disk: "%d",
+            nodes: "%D",
+            end_time: "%e",
+            dependency: "%E",
+            features: "%f",
+            array_job_id: "%F",
+            group_name: "%g",
+            group_id: "%G",
+            over_subscribe: "%h",
+            sockets_per_node: "%H",
+            array_job_task_id: "%i",
+            cores_per_socket: "%I",
+            job_name: "%j",
+            threads_per_core: "%J",
+            comment: "%k",
+            array_task_id: "%K",
+            time_limit: "%l",
+            time_left: "%L",
+            min_memory: "%m",
+            time_used: "%M",
+            req_node: "%n",
+            node_list: "%N",
+            command: "%o",
+            contiguous: "%O",
+            qos: "%q",
+            partition: "%P",
+            priority: "%Q",
+            reason: "%r",
+            start_time: "%S",
+            state_compact: "%t",
+            state: "%T",
+            user: "%u",
+            user_id: "%U",
+            reservation: "%v",
+            submit_time: "%V",
+            wckey: "%w",
+            licenses: "%W",
+            excluded_nodes: "%x",
+            core_specialization: "%X",
+            nice: "%y",
+            scheduled_nodes: "%Y",
+            sockets_cores_threads: "%z",
+            work_dir: "%Z",
+            gres: "%b", # must come at the end to fix a bug with Slurm 18
+          }
+        end
+
         private
+          # Modify the StringIO instance by advancing past the squeue header
+          #
+          # The first two "records" should always be discarded. Consider the
+          # following squeue with -M output (invisible characters shown):
+          #
+          # CLUSTER: slurm_cluster_name\n
+          # \x1EJOBID\x1F\x1FSTATE\n
+          # \x1E1\x1F\x1FR\n
+          # \x1E2\x1F\x1FPD\n
+          #
+          # Splitting on the record separator first gives the Cluster header,
+          # and then the regular header. If -M or --cluster is not specified
+          # the effect is the same because the record separator is at the
+          # start of the format string, so the first "record" would simply be
+          # empty.
+          def advance_past_squeue_header!(squeue_output)
+            2.times { squeue_output.gets(RECORD_SEPARATOR) }
+          end
+
           # Call a forked Slurm command for a given cluster
           def call(cmd, *args, env: {}, stdin: "")
             cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
@@ -159,60 +283,25 @@ module OodCore
             s.success? ? o : raise(Error, e)
           end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            array_job_task_id: "%i",
-            cores_per_socket: "%I",
-            job_name: "%j",
-            threads_per_core: "%J",
-            comment: "%k",
-            array_task_id: "%K",
-            time_limit: "%l",
-            time_left: "%L",
-            min_memory: "%m",
-            time_used: "%M",
-            req_node: "%n",
-            node_list: "%N",
-            command: "%o",
-            contiguous: "%O",
-            qos: "%q",
-            partition: "%P",
-            priority: "%Q",
-            reason: "%r",
-            start_time: "%S",
-            state_compact: "%t",
-            state: "%T",
-            user: "%u",
-            user_id: "%U",
-            reservation: "%v",
-            submit_time: "%V",
-            wckey: "%w",
-            licenses: "%W",
-            excluded_nodes: "%x",
-            core_specialization: "%X",
-            nice: "%y",
-            scheduled_nodes: "%Y",
-            sockets_cores_threads: "%z",
-            work_dir: "%Z",
-            gres: "%b", # must come at the end to fix a bug with Slurm 18
-          }
+          def squeue_attrs_for_info_attrs(attrs)
+            attrs.map { |a|
+              {
+                id: :job_id,
+                status: :state_compact,
+                allocated_nodes: [:node_list, :scheduled_nodes],
+                # submit_host: nil,
+                job_name: :job_name,
+                job_owner: :user,
+                accounting_id: :account,
+                procs: :cpus,
+                queue_name: :partition,
+                wallclock_time: :time_used,
+                wallclock_limit: :time_limit,
+                # cpu_time: nil,
+                submission_time: :submit_time,
+                dispatch_time: :start_time
+              }.fetch(a, a)
+            }.flatten
           end
       end
 
@@ -328,7 +417,7 @@ module OodCore
       # @return [Array<Info>] information describing submitted jobs
       # @see Adapter#info_all
       def info_all(attrs: nil)
-        @slurm.get_jobs.map do |v|
+        @slurm.get_jobs(attrs: attrs).map do |v|
           parse_job_info(v)
         end
       rescue Batch::Error => e
@@ -360,6 +449,20 @@ module OodCore
         end
       end
 
+      # Retrieve info for all jobs for a given owner or owners from the
+      # resource manager
+      # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+      # @raise [JobAdapterError] if something goes wrong getting job info
+      # @return [Array<Info>] information describing submitted jobs
+      def info_where_owner(owner, attrs: nil)
+        owner = Array.wrap(owner).map(&:to_s).join(',')
+        @slurm.get_jobs(owner: owner).map do |v|
+          parse_job_info(v)
+        end
+      rescue Batch::Error => e
+        raise JobAdapterError, e.message
+      end
+
       # Retrieve job status from resource manager
       # @param id [#to_s] the id of the job
       # @raise [JobAdapterError] if something goes wrong getting job status
@@ -369,7 +472,7 @@ module OodCore
         id = id.to_s
         jobs = @slurm.get_jobs(
           id: id,
-
+          attrs: [:job_id, :array_job_task_id, :state_compact]
         )
         # A job id can return multiple jobs if it corresponds to a job array
         # id, so we need to find the job that corresponds to the given job id
@@ -478,6 +581,7 @@ module OodCore
             allocated_nodes = [ { name: nil } ] * v[:nodes].to_i
           end
         end
+
         Info.new(
           id: v[:job_id],
           status: get_state(v[:state_compact]),
@@ -491,8 +595,8 @@ module OodCore
           wallclock_time: duration_in_seconds(v[:time_used]),
           wallclock_limit: duration_in_seconds(v[:time_limit]),
           cpu_time: nil,
-          submission_time: Time.parse(v[:submit_time]),
-          dispatch_time: v[:start_time] == "N/A" ? nil : Time.parse(v[:start_time]),
+          submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
+          dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
           native: v
         )
       end
@@ -500,7 +604,7 @@ module OodCore
       def handle_job_array(info_ary, id)
         # If only one job was returned we return it
         return info_ary.first unless info_ary.length > 1
-
+
         parent_task_hash = {:tasks => []}
 
         info_ary.map do |task_info|
data/lib/ood_core/job/info.rb
CHANGED
@@ -113,6 +113,21 @@ module OodCore
         @native = native
       end
 
+      # Create a new Info for a child task
+      # @return [Info] merging the parent and the child task
+      def build_child_info(task)
+        parent_only_keys = [
+          :allocated_nodes,
+          :procs,
+          :cpu_time,
+          :dispatch_time,
+          :native,
+          :tasks
+        ]
+
+        Info.new(**to_h.merge(task.to_h).delete_if{|k, v| parent_only_keys.include?(k)})
+      end
+
       # Convert object to hash
       # @return [Hash] object as hash
       def to_h
data/lib/ood_core/job/task.rb
CHANGED
@@ -1,18 +1,19 @@
 module OodCore
   module Job
     class Task
-      attr_reader :id
-      attr_reader :status
+      attr_reader :id, :status, :wallclock_time
 
-      def initialize(id:, status:, **_)
-        @
+      def initialize(id:, status:, wallclock_time: nil, **_)
+        @id = id.to_s
         @status = OodCore::Job::Status.new(state: status)
+        @wallclock_time = wallclock_time && wallclock_time.to_i
       end
 
       def to_h
         {
           :id => id,
-          :status => status
+          :status => status,
+          :wallclock_time => wallclock_time
         }
       end
 
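`Task` now carries an optional wallclock time: `nil` is preserved, anything else is coerced with `to_i`. For example:

    require "ood_core"

    task = OodCore::Job::Task.new(id: 42, status: :running, wallclock_time: "3600")
    task.id              #=> "42"
    task.wallclock_time  #=> 3600
    task.to_h.keys       #=> [:id, :status, :wallclock_time]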
data/lib/ood_core/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ood_core
 version: !ruby/object:Gem::Version
-  version: 0.8.0
+  version: 0.9.0
 platform: ruby
 authors:
 - Eric Franz
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-
+date: 2019-05-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ood_support
@@ -147,7 +147,6 @@ files:
 - lib/ood_core/cluster.rb
 - lib/ood_core/clusters.rb
 - lib/ood_core/errors.rb
-- lib/ood_core/job/._task_status.rb
 - lib/ood_core/job/adapter.rb
 - lib/ood_core/job/adapters/drmaa.rb
 - lib/ood_core/job/adapters/helper.rb
@@ -198,7 +197,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
 summary: Open OnDemand core library