ood_core 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +63 -20
- data/lib/ood_core.rb +6 -0
- data/lib/ood_core/batch_connect/factory.rb +42 -0
- data/lib/ood_core/batch_connect/template.rb +207 -0
- data/lib/ood_core/batch_connect/templates/basic.rb +23 -0
- data/lib/ood_core/batch_connect/templates/vnc.rb +201 -0
- data/lib/ood_core/cluster.rb +33 -8
- data/lib/ood_core/errors.rb +6 -0
- data/lib/ood_core/job/adapter.rb +11 -0
- data/lib/ood_core/job/adapters/lsf.rb +16 -22
- data/lib/ood_core/job/adapters/lsf/batch.rb +28 -15
- data/lib/ood_core/job/adapters/lsf/helper.rb +79 -0
- data/lib/ood_core/job/adapters/pbspro.rb +424 -0
- data/lib/ood_core/job/adapters/slurm.rb +8 -0
- data/lib/ood_core/job/adapters/torque.rb +32 -2
- data/lib/ood_core/job/info.rb +9 -2
- data/lib/ood_core/refinements/hash_extensions.rb +9 -0
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +1 -1
- metadata +11 -6
data/lib/ood_core/cluster.rb
CHANGED
@@ -33,8 +33,12 @@ module OodCore
|
|
33
33
|
# @option cluster [#to_h] :metadata ({}) The cluster's metadata
|
34
34
|
# @option cluster [#to_h] :login ({}) The cluster's SSH host
|
35
35
|
# @option cluster [#to_h] :job ({}) The job adapter for this cluster
|
36
|
-
# @option cluster [#to_h] :custom ({}) Any custom resources for this
|
37
|
-
#
|
36
|
+
# @option cluster [#to_h] :custom ({}) Any custom resources for this
|
37
|
+
# cluster
|
38
|
+
# @option cluster [Array<#to_h>] :acls ([]) List of ACLs to validate
|
39
|
+
# against
|
40
|
+
# @option cluster [#to_h] :batch_connect ({}) Configuration for batch
|
41
|
+
# connect templates
|
38
42
|
def initialize(cluster)
|
39
43
|
c = cluster.to_h.symbolize_keys
|
40
44
|
|
@@ -47,6 +51,7 @@ module OodCore
|
|
47
51
|
@job_config = c.fetch(:job, {}) .to_h.symbolize_keys
|
48
52
|
@custom_config = c.fetch(:custom, {}) .to_h.symbolize_keys
|
49
53
|
@acls_config = c.fetch(:acls, []) .map(&:to_h)
|
54
|
+
@batch_connect_config = c.fetch(:batch_connect, {}).to_h.symbolize_keys
|
50
55
|
end
|
51
56
|
|
52
57
|
# Metadata that provides extra information about this cluster
|
@@ -81,6 +86,25 @@ module OodCore
|
|
81
86
|
build_acls(job_config.fetch(:acls, []).map(&:to_h)).all?(&:allow?)
|
82
87
|
end
|
83
88
|
|
89
|
+
# The batch connect template configuration used for this cluster
|
90
|
+
# @param template [#to_sym, nil] the template type
|
91
|
+
# @return [Hash] the batch connect configuration
|
92
|
+
def batch_connect_config(template = nil)
|
93
|
+
if template
|
94
|
+
@batch_connect_config.fetch(template.to_sym, {}).to_h.symbolize_keys.merge(template: template.to_sym)
|
95
|
+
else
|
96
|
+
@batch_connect_config
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
# Build a batch connect template from the respective configuration
|
101
|
+
# @param context [#to_h] the context used for rendering the template
|
102
|
+
# @return [BatchConnect::Template] the batch connect template
|
103
|
+
def batch_connect_template(context = {})
|
104
|
+
context = context.to_h.symbolize_keys
|
105
|
+
BatchConnect::Factory.build batch_connect_config(context[:template] || :basic).merge(context)
|
106
|
+
end
|
107
|
+
|
84
108
|
# The configuration for any custom features or resources for this cluster
|
85
109
|
# @param feature [#to_sym, nil] the feature or resource
|
86
110
|
# @return [Hash] configuration for custom feature or resource
|
@@ -125,12 +149,13 @@ module OodCore
|
|
125
149
|
# @return [Hash] the hash describing this object
|
126
150
|
def to_h
|
127
151
|
{
|
128
|
-
id:
|
129
|
-
metadata:
|
130
|
-
login:
|
131
|
-
job:
|
132
|
-
custom:
|
133
|
-
acls:
|
152
|
+
id: id,
|
153
|
+
metadata: metadata_config,
|
154
|
+
login: login_config,
|
155
|
+
job: job_config,
|
156
|
+
custom: custom_config,
|
157
|
+
acls: acls_config,
|
158
|
+
batch_connect: batch_connect_config
|
134
159
|
}
|
135
160
|
end
|
136
161
|
|
data/lib/ood_core/errors.rb
CHANGED
@@ -16,4 +16,10 @@ module OodCore
|
|
16
16
|
|
17
17
|
# Raised when a job state is set to an invalid option
|
18
18
|
class UnknownStateAttribute < Error; end
|
19
|
+
|
20
|
+
# Raised when template not specified in configuration
|
21
|
+
class TemplateNotSpecified < Error; end
|
22
|
+
|
23
|
+
# Raised when the template specified in the configuration cannot be found
|
24
|
+
class TemplateNotFound < Error; end
|
19
25
|
end
|
data/lib/ood_core/job/adapter.rb
CHANGED
@@ -4,6 +4,8 @@ module OodCore
|
|
4
4
|
# submitting/statusing/holding/deleting jobs
|
5
5
|
# @abstract
|
6
6
|
class Adapter
|
7
|
+
using Refinements::ArrayExtensions
|
8
|
+
|
7
9
|
# Submit a job with the attributes defined in the job template instance
|
8
10
|
# @abstract Subclass is expected to implement {#submit}
|
9
11
|
# @raise [NotImplementedError] if subclass did not define {#submit}
|
@@ -39,6 +41,15 @@ module OodCore
|
|
39
41
|
raise NotImplementedError, "subclass did not define #info_all"
|
40
42
|
end
|
41
43
|
|
44
|
+
# Retrieve info for all jobs for a given owner or owners from the
|
45
|
+
# resource manager
|
46
|
+
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
|
47
|
+
# @return [Array<Info>] information describing submitted jobs
|
48
|
+
def info_where_owner(owner)
|
49
|
+
owner = Array.wrap(owner).map(&:to_s)
|
50
|
+
info_all.select { |info| owner.include? info.job_owner }
|
51
|
+
end
|
52
|
+
|
42
53
|
# Retrieve job info from the resource manager
|
43
54
|
# @abstract Subclass is expected to implement {#info}
|
44
55
|
# @raise [NotImplementedError] if subclass did not define {#info}
|
@@ -72,24 +72,9 @@ module OodCore
|
|
72
72
|
afternotok = Array(afternotok).map(&:to_s)
|
73
73
|
afterany = Array(afterany).map(&:to_s)
|
74
74
|
|
75
|
-
|
76
|
-
args += ["-P", script.accounting_id] unless script.accounting_id.nil?
|
77
|
-
args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
|
78
|
-
args += ["-J", script.job_name] unless script.job_name.nil?
|
79
|
-
|
80
|
-
# TODO: dependencies
|
81
|
-
|
82
|
-
env = {
|
83
|
-
#TODO:
|
84
|
-
#LSB_HOSTS?
|
85
|
-
#LSB_MCPU_HOSTS?
|
86
|
-
#SNDJOBS_TO?
|
87
|
-
#
|
88
|
-
}
|
89
|
-
|
90
|
-
# Submit job
|
91
|
-
batch.submit_string(script.content, args: args, env: env)
|
75
|
+
kwargs = helper.batch_submit_args(script, after: after, afterok: afterok, afternotok: afternotok, afterany: afterany)
|
92
76
|
|
77
|
+
batch.submit_string(script.content, **kwargs)
|
93
78
|
rescue Batch::Error => e
|
94
79
|
raise JobAdapterError, e.message
|
95
80
|
end
|
@@ -170,20 +155,29 @@ module OodCore
|
|
170
155
|
end
|
171
156
|
|
172
157
|
def info_for_batch_hash(v)
|
158
|
+
nodes = helper.parse_exec_host(v[:exec_host]).map do |host|
|
159
|
+
NodeInfo.new(name: host[:host], procs: host[:slots])
|
160
|
+
end
|
161
|
+
|
162
|
+
# FIXME: estimated_runtime should be set by batch object instead of being computed here
|
163
|
+
dispatch_time = helper.parse_past_time(v[:start_time], ignore_errors: true)
|
164
|
+
finish_time = helper.parse_past_time(v[:finish_time], ignore_errors: true)
|
165
|
+
|
173
166
|
Info.new(
|
174
167
|
id: v[:id],
|
175
168
|
status: get_state(v[:status]),
|
176
|
-
allocated_nodes:
|
169
|
+
allocated_nodes: nodes,
|
177
170
|
submit_host: v[:from_host],
|
178
171
|
job_name: v[:name],
|
179
172
|
job_owner: v[:user],
|
180
173
|
accounting_id: v[:project],
|
181
|
-
procs:
|
174
|
+
procs: nodes.any? ? nodes.map(&:procs).reduce(&:+) : 0,
|
182
175
|
queue_name: v[:queue],
|
183
|
-
wallclock_time:
|
184
|
-
cpu_time:
|
176
|
+
wallclock_time: helper.estimate_runtime(current_time: Time.now, start_time: dispatch_time, finish_time: finish_time),
|
177
|
+
cpu_time: helper.parse_cpu_used(v[:cpu_used]),
|
178
|
+
# cpu_time: nil,
|
185
179
|
submission_time: helper.parse_past_time(v[:submit_time], ignore_errors: true),
|
186
|
-
dispatch_time:
|
180
|
+
dispatch_time: dispatch_time,
|
187
181
|
native: v
|
188
182
|
)
|
189
183
|
end
|
@@ -49,21 +49,24 @@ class OodCore::Job::Adapters::Lsf::Batch
|
|
49
49
|
%w( -u all -a -w -W )
|
50
50
|
end
|
51
51
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
52
|
+
# status fields available from bjobs
|
53
|
+
def fields
|
54
|
+
%i(id user status queue from_host exec_host name submit_time
|
55
|
+
project cpu_used mem swap pids start_time finish_time)
|
56
|
+
end
|
57
57
|
|
58
58
|
# helper method
|
59
59
|
def parse_bjobs_output(response)
|
60
60
|
return [] if response =~ /No job found/ || response.nil?
|
61
61
|
|
62
62
|
lines = response.split("\n")
|
63
|
-
|
63
|
+
columns = lines.shift.split
|
64
|
+
|
65
|
+
validate_bjobs_output_columns(columns)
|
66
|
+
jobname_column_idx = columns.find_index("JOB_NAME")
|
64
67
|
|
65
|
-
lines.
|
66
|
-
values = split_bjobs_output_line(job)
|
68
|
+
lines.map{ |job|
|
69
|
+
values = split_bjobs_output_line(job, num_columns: columns.count, jobname_column_idx: jobname_column_idx)
|
67
70
|
|
68
71
|
# make a hash of { field: "value", etc. }
|
69
72
|
Hash[fields.zip(values)].each_with_object({}) { |(k,v),o|
|
@@ -135,13 +138,21 @@ class OodCore::Job::Adapters::Lsf::Batch
|
|
135
138
|
end
|
136
139
|
|
137
140
|
# split a line of output from bjobs into field values
|
138
|
-
def split_bjobs_output_line(line)
|
141
|
+
def split_bjobs_output_line(line, num_columns:, jobname_column_idx:)
|
139
142
|
values = line.strip.split
|
140
143
|
|
141
|
-
if(values.count >
|
142
|
-
#
|
143
|
-
#
|
144
|
-
|
144
|
+
if(values.count > num_columns)
|
145
|
+
# if the line has more fields than the number of columns, that means one
|
146
|
+
# field value has spaces, so it was erroneously split into
|
147
|
+
# multiple fields; we assume that is the jobname field, and we will
|
148
|
+
# collapse the fields into a single field
|
149
|
+
#
|
150
|
+
# FIXME: assumes jobname_column_idx is not first or last item
|
151
|
+
j = jobname_column_idx
|
152
|
+
|
153
|
+
# e.g. if 15 fields and jobname is 7th field
|
154
|
+
# values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
|
155
|
+
values = values[0..(j-1)] + [values[j..(j-num_columns)].join(" ")] + values[(j+1-num_columns)..-1]
|
145
156
|
end
|
146
157
|
|
147
158
|
values
|
@@ -151,9 +162,11 @@ class OodCore::Job::Adapters::Lsf::Batch
|
|
151
162
|
def validate_bjobs_output_columns(columns)
|
152
163
|
expected = %w(JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME
|
153
164
|
SUBMIT_TIME PROJ_NAME CPU_USED MEM SWAP PIDS START_TIME FINISH_TIME)
|
154
|
-
|
165
|
+
# (expected & columns) will return the columns that are the same
|
166
|
+
# so if there are extra columns we can just ignore those (like SLOTS in LSF 9.1)
|
167
|
+
if columns && ((expected & columns) != expected)
|
155
168
|
raise Error, "bjobs output in different format than expected: " \
|
156
|
-
"#{columns.inspect}
|
169
|
+
"#{columns.inspect} did not include all columns: #{expected.inspect}"
|
157
170
|
end
|
158
171
|
end
|
159
172
|
|
@@ -23,4 +23,83 @@ class OodCore::Job::Adapters::Lsf::Helper
|
|
23
23
|
|
24
24
|
nil
|
25
25
|
end
|
26
|
+
|
27
|
+
# convert exec_host string format from bjobs to an array of hashes
|
28
|
+
# e.g. "c012" => [{host: "c012", slots: 1}]
|
29
|
+
# e.g. "4*c012:8*c013" => [{host: "c012", slots: 4}, {host: "c013", slots: 8}]
|
30
|
+
def parse_exec_host(exec_host_str)
|
31
|
+
return [] if exec_host_str.nil? || exec_host_str.empty?
|
32
|
+
|
33
|
+
exec_host_str.scan(exec_host_regex).map do |match|
|
34
|
+
{host: match[2], slots: match[1] ? match[1].to_i : 1}
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def exec_host_regex
|
39
|
+
@exec_host_regex ||= Regexp.new(/((\d+)\*)?([^:]+)/)
|
40
|
+
end
|
41
|
+
|
42
|
+
# given current time, dispatch time, and finish time values, estimate the
|
43
|
+
# runtime for a job; this estimate will be accurate if the job never enters a
|
44
|
+
# suspended state during its execution
|
45
|
+
def estimate_runtime(current_time:, start_time:, finish_time:)
|
46
|
+
return nil if start_time.nil?
|
47
|
+
|
48
|
+
(finish_time || current_time) - start_time
|
49
|
+
end
|
50
|
+
|
51
|
+
# Convert CPU_USED string to seconds
|
52
|
+
#
|
53
|
+
# example strings of cpu_used in LSF 8.3:
|
54
|
+
#
|
55
|
+
# 060:24:00.00
|
56
|
+
# 046:19:37.00
|
57
|
+
# 1118:59:09.00
|
58
|
+
# 000:00:00.00
|
59
|
+
# 000:48:18.39
|
60
|
+
# 003:11:36.67
|
61
|
+
# 003:24:40.95
|
62
|
+
# 50769:48:00.-48
|
63
|
+
# 50835:48:48.-48
|
64
|
+
#
|
65
|
+
# my guess is: hours:minutes:seconds.????
|
66
|
+
#
|
67
|
+
# @return [Fixnum, nil] cpu used as seconds
|
68
|
+
def parse_cpu_used(cpu_used)
|
69
|
+
if cpu_used =~ /^(\d+):(\d+):(\d+)\..*$/
|
70
|
+
$1.to_i*3600 + $2.to_i*60 + $3.to_i
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def batch_submit_args(script, after: [], afterok: [], afternotok: [], afterany: [])
|
75
|
+
args = []
|
76
|
+
|
77
|
+
args += ["-P", script.accounting_id] unless script.accounting_id.nil?
|
78
|
+
args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
|
79
|
+
args += ["-J", script.job_name] unless script.job_name.nil?
|
80
|
+
args += ["-q", script.queue_name] unless script.queue_name.nil?
|
81
|
+
args += ["-U", script.reservation_id] unless script.reservation_id.nil?
|
82
|
+
args += ["-sp", script.priority] unless script.priority.nil?
|
83
|
+
args += ["-H"] if script.submit_as_hold
|
84
|
+
args += (script.rerunnable ? ["-r"] : ["-rn"]) unless script.rerunnable.nil?
|
85
|
+
args += ["-b", script.start_time.localtime.strftime("%Y:%m:%d:%H:%M")] unless script.start_time.nil?
|
86
|
+
args += ["-W", (script.wall_time / 60).to_i] unless script.wall_time.nil?
|
87
|
+
|
88
|
+
# input and output files
|
89
|
+
args += ["-i", script.input_path] unless script.input_path.nil?
|
90
|
+
args += ["-o", script.output_path] unless script.output_path.nil?
|
91
|
+
args += ["-e", script.error_path] unless script.error_path.nil?
|
92
|
+
|
93
|
+
# email
|
94
|
+
args += ["-B"] if script.email_on_started
|
95
|
+
args += ["-N"] if script.email_on_terminated
|
96
|
+
args += ["-u", script.email.join(",")] unless script.email.nil? || script.email.empty?
|
97
|
+
|
98
|
+
args += script.native unless script.native.nil?
|
99
|
+
|
100
|
+
# environment
|
101
|
+
env = script.job_environment || {}
|
102
|
+
|
103
|
+
{args: args, env: env}
|
104
|
+
end
|
26
105
|
end
|
@@ -0,0 +1,424 @@
|
|
1
|
+
require "time"
|
2
|
+
require "ood_core/refinements/hash_extensions"
|
3
|
+
|
4
|
+
module OodCore
|
5
|
+
module Job
|
6
|
+
class Factory
|
7
|
+
using Refinements::HashExtensions
|
8
|
+
|
9
|
+
# Build the PBS Pro adapter from a configuration
|
10
|
+
# @param config [#to_h] the configuration for job adapter
|
11
|
+
# @option config [Object] :host (nil) The batch server host
|
12
|
+
# @option config [Object] :exec (nil) Path to PBS Pro executables
|
13
|
+
# @option config [Object] :qstat_factor (nil) Deciding factor on how to
|
14
|
+
# call qstat for a user
|
15
|
+
def self.build_pbspro(config)
|
16
|
+
c = config.to_h.compact.symbolize_keys
|
17
|
+
host = c.fetch(:host, nil)
|
18
|
+
exec = c.fetch(:exec, nil)
|
19
|
+
qstat_factor = c.fetch(:qstat_factor, nil)
|
20
|
+
pbspro = Adapters::PBSPro::Batch.new(host: host, exec: exec)
|
21
|
+
Adapters::PBSPro.new(pbspro: pbspro, qstat_factor: qstat_factor)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
module Adapters
|
26
|
+
# An adapter object that describes the communication with a PBS Pro
|
27
|
+
# resource manager for job management.
|
28
|
+
class PBSPro < Adapter
|
29
|
+
using Refinements::ArrayExtensions
|
30
|
+
using Refinements::HashExtensions
|
31
|
+
|
32
|
+
# Object used for simplified communication with a PBS Pro batch server
|
33
|
+
# @api private
|
34
|
+
class Batch
|
35
|
+
# The host of the PBS Pro batch server
|
36
|
+
# @example
|
37
|
+
# my_batch.host #=> "my_batch.server.edu"
|
38
|
+
# @return [String, nil] the batch server host
|
39
|
+
attr_reader :host
|
40
|
+
|
41
|
+
# The path containing the PBS executables
|
42
|
+
# @example
|
43
|
+
# my_batch.exec.to_s #=> "/usr/local/pbspro/10.0.0"
|
44
|
+
# @return [Pathname, nil] path to pbs executables
|
45
|
+
attr_reader :exec
|
46
|
+
|
47
|
+
# The root exception class that all PBS Pro-specific exceptions
|
48
|
+
# inherit from
|
49
|
+
class Error < StandardError; end
|
50
|
+
|
51
|
+
# @param host [#to_s, nil] the batch server host
|
52
|
+
# @param exec [#to_s, nil] path to pbs executables
|
53
|
+
def initialize(host: nil, exec: nil)
|
54
|
+
@host = host && host.to_s
|
55
|
+
@exec = exec && Pathname.new(exec.to_s)
|
56
|
+
end
|
57
|
+
|
58
|
+
# Get a list of hashes detailing each of the jobs on the batch server
|
59
|
+
# @example Status info for all jobs
|
60
|
+
# my_batch.get_jobs
|
61
|
+
# #=>
|
62
|
+
# #[
|
63
|
+
# # {
|
64
|
+
# # :account => "account",
|
65
|
+
# # :job_id => "my_job",
|
66
|
+
# # ...
|
67
|
+
# # },
|
68
|
+
# # {
|
69
|
+
# # :account => "account",
|
70
|
+
# # :job_id => "my_other_job",
|
71
|
+
# # ...
|
72
|
+
# # },
|
73
|
+
# # ...
|
74
|
+
# #]
|
75
|
+
# @param id [#to_s] the id of the job
|
76
|
+
# @raise [Error] if `qstat` command exited unsuccessfully
|
77
|
+
# @return [Array<Hash>] list of details for jobs
|
78
|
+
def get_jobs(id: "")
|
79
|
+
args = ["-f"] # display all information
|
80
|
+
args += ["-t"] # list subjobs
|
81
|
+
args += [id.to_s] unless id.to_s.empty?
|
82
|
+
lines = call("qstat", *args).gsub("\n\t", "").split("\n").map(&:strip)
|
83
|
+
|
84
|
+
jobs = []
|
85
|
+
lines.each do |line|
|
86
|
+
if /^Job Id: (?<job_id>.+)$/ =~ line
|
87
|
+
jobs << { job_id: job_id }
|
88
|
+
elsif /^(?<key>[^\s]+) = (?<value>.+)$/ =~ line
|
89
|
+
hsh = jobs.last
|
90
|
+
k1, k2 = key.split(".").map(&:to_sym)
|
91
|
+
k2 ? ( hsh[k1] ||= {} and hsh[k1][k2] = value ) : ( hsh[k1] = value )
|
92
|
+
end
|
93
|
+
end
|
94
|
+
jobs.reject { |j| /\[\]/ =~ j[:job_id] } # drop main job array jobs
|
95
|
+
end
|
96
|
+
|
97
|
+
# Select batch jobs from the batch server
|
98
|
+
# @param args [Array<#to_s>] arguments passed to `qselect` command
|
99
|
+
# @raise [Error] if `qselect` command exited unsuccessfully
|
100
|
+
# @return [Array<String>] list of job ids that match selection
|
101
|
+
# criteria
|
102
|
+
def select_jobs(args: [])
|
103
|
+
call("qselect", *args).split("\n").map(&:strip)
|
104
|
+
end
|
105
|
+
|
106
|
+
# Put a specified job on hold
|
107
|
+
# @example Put job "1234" on hold
|
108
|
+
# my_batch.hold_job("1234")
|
109
|
+
# @param id [#to_s] the id of the job
|
110
|
+
# @raise [Error] if `qhold` command exited unsuccessfully
|
111
|
+
# @return [void]
|
112
|
+
def hold_job(id)
|
113
|
+
call("qhold", id.to_s)
|
114
|
+
end
|
115
|
+
|
116
|
+
# Release a specified job that is on hold
|
117
|
+
# @example Release job "1234" from on hold
|
118
|
+
# my_batch.release_job("1234")
|
119
|
+
# @param id [#to_s] the id of the job
|
120
|
+
# @raise [Error] if `qrls` command exited unsuccessfully
|
121
|
+
# @return [void]
|
122
|
+
def release_job(id)
|
123
|
+
call("qrls", id.to_s)
|
124
|
+
end
|
125
|
+
|
126
|
+
# Delete a specified job from batch server
|
127
|
+
# @example Delete job "1234"
|
128
|
+
# my_batch.delete_job("1234")
|
129
|
+
# @param id [#to_s] the id of the job
|
130
|
+
# @raise [Error] if `qdel` command exited unsuccessfully
|
131
|
+
# @return [void]
|
132
|
+
def delete_job(id)
|
133
|
+
call("qdel", id.to_s)
|
134
|
+
end
|
135
|
+
|
136
|
+
# Submit a script expanded as a string to the batch server
|
137
|
+
# @param str [#to_s] script as a string
|
138
|
+
# @param args [Array<#to_s>] arguments passed to `qsub` command
|
139
|
+
# @param chdir [#to_s, nil] working directory where `qsub` is called
|
140
|
+
# @raise [Error] if `qsub` command exited unsuccessfully
|
141
|
+
# @return [String] the id of the job that was created
|
142
|
+
def submit_string(str, args: [], chdir: nil)
|
143
|
+
call("qsub", *args, stdin: str.to_s, chdir: chdir).strip
|
144
|
+
end
|
145
|
+
|
146
|
+
private
|
147
|
+
# Call a forked PBS Pro command for a given batch server
|
148
|
+
def call(cmd, *args, env: {}, stdin: "", chdir: nil)
|
149
|
+
cmd = cmd.to_s
|
150
|
+
cmd = exec.join("bin", cmd).to_s if exec
|
151
|
+
args = args.map(&:to_s)
|
152
|
+
env = env.to_h.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
|
153
|
+
env["PBS_DEFAULT"] = host.to_s if host
|
154
|
+
env["PBS_EXEC"] = exec.to_s if exec
|
155
|
+
chdir ||= "."
|
156
|
+
o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s, chdir: chdir.to_s)
|
157
|
+
s.success? ? o : raise(Error, e)
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
# Mapping of state codes for PBSPro
|
162
|
+
STATE_MAP = {
|
163
|
+
'Q' => :queued,
|
164
|
+
'W' => :queued, # job is waiting for its submitter-assigned start time to be reached
|
165
|
+
'H' => :queued_held,
|
166
|
+
'T' => :queued_held, # job is being moved to a new location
|
167
|
+
'M' => :completed, # job was moved to another server
|
168
|
+
'R' => :running,
|
169
|
+
'S' => :suspended,
|
170
|
+
'U' => :suspended, # cycle-harvesting job is suspended due to keyboard activity
|
171
|
+
'E' => :running, # job is exiting after having run
|
172
|
+
'F' => :completed, # job is finished
|
173
|
+
'X' => :completed # subjob has completed execution or has been deleted
|
174
|
+
# ignore B as it signifies a job array
|
175
|
+
}
|
176
|
+
|
177
|
+
# What percentage of jobs a user owns out of all jobs, used to decide
|
178
|
+
# whether we filter the owner's jobs from a `qstat` of all jobs or call
|
179
|
+
# `qstat` on each of the owner's individual jobs
|
180
|
+
# @return [Float] ratio of owner's jobs to all jobs
|
181
|
+
attr_reader :qstat_factor
|
182
|
+
|
183
|
+
# @api private
|
184
|
+
# @param opts [#to_h] the options defining this adapter
|
185
|
+
# @option opts [Batch] :pbspro The PBS Pro batch object
|
186
|
+
# @option opts [#to_f] :qstat_factor (0.10) The qstat deciding factor
|
187
|
+
# @see Factory.build_pbspro
|
188
|
+
def initialize(opts = {})
|
189
|
+
o = opts.to_h.compact.symbolize_keys
|
190
|
+
|
191
|
+
@pbspro = o.fetch(:pbspro) { raise ArgumentError, "No pbspro object specified. Missing argument: pbspro" }
|
192
|
+
@qstat_factor = o.fetch(:qstat_factor, 0.10).to_f
|
193
|
+
end
|
194
|
+
|
195
|
+
# Submit a job with the attributes defined in the job template instance
|
196
|
+
# @param script [Script] script object that describes the script and
|
197
|
+
# attributes for the submitted job
|
198
|
+
# @param after [#to_s, Array<#to_s>] this job may be scheduled for
|
199
|
+
# execution at any point after dependent jobs have started execution
|
200
|
+
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
|
201
|
+
# execution only after dependent jobs have terminated with no errors
|
202
|
+
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
|
203
|
+
# execution only after dependent jobs have terminated with errors
|
204
|
+
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
|
205
|
+
# execution after dependent jobs have terminated
|
206
|
+
# @raise [JobAdapterError] if something goes wrong submitting a job
|
207
|
+
# @return [String] the job id returned after successfully submitting a
|
208
|
+
# job
|
209
|
+
# @see Adapter#submit
|
210
|
+
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
|
211
|
+
after = Array(after).map(&:to_s)
|
212
|
+
afterok = Array(afterok).map(&:to_s)
|
213
|
+
afternotok = Array(afternotok).map(&:to_s)
|
214
|
+
afterany = Array(afterany).map(&:to_s)
|
215
|
+
|
216
|
+
# Set qsub options
|
217
|
+
args = []
|
218
|
+
# ignore args, can't use these if submitting from STDIN
|
219
|
+
args += ["-h"] if script.submit_as_hold
|
220
|
+
args += ["-r", script.rerunnable ? "y" : "n"] unless script.rerunnable.nil?
|
221
|
+
args += ["-M", script.email.join(",")] unless script.email.nil?
|
222
|
+
if script.email_on_started && script.email_on_terminated
|
223
|
+
args += ["-m", "be"]
|
224
|
+
elsif script.email_on_started
|
225
|
+
args += ["-m", "b"]
|
226
|
+
elsif script.email_on_terminated
|
227
|
+
args += ["-m", "e"]
|
228
|
+
end
|
229
|
+
args += ["-N", script.job_name] unless script.job_name.nil?
|
230
|
+
# ignore input_path (not defined in PBS Pro)
|
231
|
+
args += ["-o", script.output_path] unless script.output_path.nil?
|
232
|
+
args += ["-e", script.error_path] unless script.error_path.nil?
|
233
|
+
# Reservations are actually just queues in PBS Pro
|
234
|
+
args += ["-q", script.reservation_id] if !script.reservation_id.nil? && script.queue_name.nil?
|
235
|
+
args += ["-q", script.queue_name] unless script.queue_name.nil?
|
236
|
+
args += ["-p", script.priority] unless script.priority.nil?
|
237
|
+
args += ["-a", script.start_time.localtime.strftime("%C%y%m%d%H%M.%S")] unless script.start_time.nil?
|
238
|
+
args += ["-A", script.accounting_id] unless script.accounting_id.nil?
|
239
|
+
args += ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
|
240
|
+
|
241
|
+
# Set dependencies
|
242
|
+
depend = []
|
243
|
+
depend << "after:#{after.join(":")}" unless after.empty?
|
244
|
+
depend << "afterok:#{afterok.join(":")}" unless afterok.empty?
|
245
|
+
depend << "afternotok:#{afternotok.join(":")}" unless afternotok.empty?
|
246
|
+
depend << "afterany:#{afterany.join(":")}" unless afterany.empty?
|
247
|
+
args += ["-W", "depend=#{depend.join(",")}"] unless depend.empty?
|
248
|
+
|
249
|
+
# Set environment variables
|
250
|
+
envvars = script.job_environment.to_h
|
251
|
+
args += ["-v", envvars.map{|k,v| "#{k}=#{v}"}.join(",")] unless envvars.empty?
|
252
|
+
|
253
|
+
# If error_path is not specified we join stdout & stderr (as this
|
254
|
+
# mimics what the other resource managers do)
|
255
|
+
args += ["-j", "oe"] if script.error_path.nil?
|
256
|
+
|
257
|
+
# Set native options
|
258
|
+
args += script.native if script.native
|
259
|
+
|
260
|
+
# Submit job
|
261
|
+
@pbspro.submit_string(script.content, args: args, chdir: script.workdir)
|
262
|
+
rescue Batch::Error => e
|
263
|
+
raise JobAdapterError, e.message
|
264
|
+
end
|
265
|
+
|
266
|
+
# Retrieve info for all jobs from the resource manager
|
267
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
268
|
+
# @return [Array<Info>] information describing submitted jobs
|
269
|
+
# @see Adapter#info_all
|
270
|
+
def info_all
|
271
|
+
@pbspro.get_jobs.map do |v|
|
272
|
+
parse_job_info(v)
|
273
|
+
end
|
274
|
+
rescue Batch::Error => e
|
275
|
+
raise JobAdapterError, e.message
|
276
|
+
end
|
277
|
+
|
278
|
+
# Retrieve info for all jobs for a given owner or owners from the
|
279
|
+
# resource manager
|
280
|
+
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
|
281
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
282
|
+
# @return [Array<Info>] information describing submitted jobs
|
283
|
+
def info_where_owner(owner)
|
284
|
+
owner = Array.wrap(owner).map(&:to_s)
|
285
|
+
|
286
|
+
usr_jobs = @pbspro.select_jobs(args: ["-u", owner.join(",")])
|
287
|
+
all_jobs = @pbspro.select_jobs(args: ["-T"])
|
288
|
+
|
289
|
+
# `qstat` all jobs if user has too many jobs, otherwise `qstat` each
|
290
|
+
# individual job (default factor is 10%)
|
291
|
+
if usr_jobs.size > (qstat_factor * all_jobs.size)
|
292
|
+
super
|
293
|
+
else
|
294
|
+
usr_jobs.map { |id| info(id) }
|
295
|
+
end
|
296
|
+
end
|
297
|
+
|
298
|
+
# Retrieve job info from the resource manager
|
299
|
+
# @param id [#to_s] the id of the job
|
300
|
+
# @raise [JobAdapterError] if something goes wrong getting job info
|
301
|
+
# @return [Info] information describing submitted job
|
302
|
+
# @see Adapter#info
|
303
|
+
def info(id)
|
304
|
+
id = id.to_s
|
305
|
+
@pbspro.get_jobs(id: id).map do |v|
|
306
|
+
parse_job_info(v)
|
307
|
+
end.first || Info.new(id: id, status: :completed)
|
308
|
+
rescue Batch::Error => e
|
309
|
+
# set completed status if can't find job id
|
310
|
+
if /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
|
311
|
+
Info.new(
|
312
|
+
id: id,
|
313
|
+
status: :completed
|
314
|
+
)
|
315
|
+
else
|
316
|
+
raise JobAdapterError, e.message
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
# Retrieve job status from resource manager
|
321
|
+
# @param id [#to_s] the id of the job
|
322
|
+
# @raise [JobAdapterError] if something goes wrong getting job status
|
323
|
+
# @return [Status] status of job
|
324
|
+
# @see Adapter#status
|
325
|
+
def status(id)
|
326
|
+
info(id.to_s).status
|
327
|
+
end
|
328
|
+
|
329
|
+
# Put the submitted job on hold
|
330
|
+
# @param id [#to_s] the id of the job
|
331
|
+
# @raise [JobAdapterError] if something goes wrong holding a job
|
332
|
+
# @return [void]
|
333
|
+
# @see Adapter#hold
|
334
|
+
def hold(id)
|
335
|
+
@pbspro.hold_job(id.to_s)
|
336
|
+
rescue Batch::Error => e
|
337
|
+
# assume successful job hold if can't find job id
|
338
|
+
raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
|
339
|
+
end
|
340
|
+
|
341
|
+
# Release the job that is on hold
|
342
|
+
# @param id [#to_s] the id of the job
|
343
|
+
# @raise [JobAdapterError] if something goes wrong releasing a job
|
344
|
+
# @return [void]
|
345
|
+
# @see Adapter#release
|
346
|
+
def release(id)
|
347
|
+
@pbspro.release_job(id.to_s)
|
348
|
+
rescue Batch::Error => e
|
349
|
+
# assume successful job release if can't find job id
|
350
|
+
raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
|
351
|
+
end
|
352
|
+
|
353
|
+
# Delete the submitted job
|
354
|
+
# @param id [#to_s] the id of the job
|
355
|
+
# @raise [JobAdapterError] if something goes wrong deleting a job
|
356
|
+
# @return [void]
|
357
|
+
# @see Adapter#delete
|
358
|
+
def delete(id)
|
359
|
+
@pbspro.delete_job(id.to_s)
|
360
|
+
rescue Batch::Error => e
|
361
|
+
# assume successful job deletion if can't find job id
|
362
|
+
raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
|
363
|
+
end
|
364
|
+
|
365
|
+
private
|
366
|
+
# Convert duration to seconds
|
367
|
+
def duration_in_seconds(time)
|
368
|
+
time.nil? ? nil : time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
|
369
|
+
end
|
370
|
+
|
371
|
+
# Convert seconds to duration
|
372
|
+
def seconds_to_duration(time)
|
373
|
+
"%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
|
374
|
+
end
|
375
|
+
|
376
|
+
# Convert host list string to individual nodes
|
377
|
+
# "hosta/J1+hostb/J2*P+..."
|
378
|
+
# where J1 and J2 are an index of the job on the named host and P is the number of
|
379
|
+
# processors allocated from that host to this job. P does not appear if it is 1.
|
380
|
+
# Example: "i5n14/2*7" uses 7 procs on node "i5n14"
|
381
|
+
def parse_nodes(node_list)
|
382
|
+
node_list.split('+').map do |n|
|
383
|
+
name, procs_list = n.split('/')
|
384
|
+
procs = (procs_list.split('*')[1] || 1).to_i
|
385
|
+
{name: name, procs: procs}
|
386
|
+
end
|
387
|
+
end
|
388
|
+
|
389
|
+
# Determine state from PBS Pro state code
|
390
|
+
def get_state(st)
|
391
|
+
STATE_MAP.fetch(st, :undetermined)
|
392
|
+
end
|
393
|
+
|
394
|
+
# Parse hash describing PBS Pro job status
|
395
|
+
def parse_job_info(v)
|
396
|
+
/^(?<job_owner>[\w-]+)@(?<submit_host>.+)$/ =~ v[:Job_Owner]
|
397
|
+
allocated_nodes = parse_nodes(v[:exec_host] || "")
|
398
|
+
procs = allocated_nodes.inject(0) { |sum, x| sum + x[:procs] }
|
399
|
+
if allocated_nodes.empty? # fill in with requested resources
|
400
|
+
allocated_nodes = [ { name: nil } ] * v.fetch(:Resource_List, {})[:nodect].to_i
|
401
|
+
procs = v.fetch(:Resource_List, {})[:ncpus].to_i
|
402
|
+
end
|
403
|
+
Info.new(
|
404
|
+
id: v[:job_id],
|
405
|
+
status: get_state(v[:job_state]),
|
406
|
+
allocated_nodes: allocated_nodes,
|
407
|
+
submit_host: submit_host,
|
408
|
+
job_name: v[:Job_Name],
|
409
|
+
job_owner: job_owner,
|
410
|
+
accounting_id: v[:Account_Name],
|
411
|
+
procs: procs,
|
412
|
+
queue_name: v[:queue],
|
413
|
+
wallclock_time: duration_in_seconds(v.fetch(:resources_used, {})[:walltime]),
|
414
|
+
wallclock_limit: duration_in_seconds(v.fetch(:Resource_List, {})[:walltime]),
|
415
|
+
cpu_time: duration_in_seconds(v.fetch(:resources_used, {})[:cput]),
|
416
|
+
submission_time: v[:ctime] ? Time.parse(v[:ctime]) : nil,
|
417
|
+
dispatch_time: v[:stime] ? Time.parse(v[:stime]) : nil,
|
418
|
+
native: v
|
419
|
+
)
|
420
|
+
end
|
421
|
+
end
|
422
|
+
end
|
423
|
+
end
|
424
|
+
end
|