ood_core 0.0.4 → 0.0.5
- checksums.yaml +4 -4
- data/CHANGELOG.md +63 -20
- data/lib/ood_core.rb +6 -0
- data/lib/ood_core/batch_connect/factory.rb +42 -0
- data/lib/ood_core/batch_connect/template.rb +207 -0
- data/lib/ood_core/batch_connect/templates/basic.rb +23 -0
- data/lib/ood_core/batch_connect/templates/vnc.rb +201 -0
- data/lib/ood_core/cluster.rb +33 -8
- data/lib/ood_core/errors.rb +6 -0
- data/lib/ood_core/job/adapter.rb +11 -0
- data/lib/ood_core/job/adapters/lsf.rb +16 -22
- data/lib/ood_core/job/adapters/lsf/batch.rb +28 -15
- data/lib/ood_core/job/adapters/lsf/helper.rb +79 -0
- data/lib/ood_core/job/adapters/pbspro.rb +424 -0
- data/lib/ood_core/job/adapters/slurm.rb +8 -0
- data/lib/ood_core/job/adapters/torque.rb +32 -2
- data/lib/ood_core/job/info.rb +9 -2
- data/lib/ood_core/refinements/hash_extensions.rb +9 -0
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +1 -1
- metadata +11 -6
data/lib/ood_core/cluster.rb
CHANGED
@@ -33,8 +33,12 @@ module OodCore
     # @option cluster [#to_h] :metadata ({}) The cluster's metadata
     # @option cluster [#to_h] :login ({}) The cluster's SSH host
     # @option cluster [#to_h] :job ({}) The job adapter for this cluster
-    # @option cluster [#to_h] :custom ({}) Any custom resources for this
-    #
+    # @option cluster [#to_h] :custom ({}) Any custom resources for this
+    #   cluster
+    # @option cluster [Array<#to_h>] :acls ([]) List of ACLs to validate
+    #   against
+    # @option cluster [#to_h] :batch_connect ({}) Configuration for batch
+    #   connect templates
     def initialize(cluster)
       c = cluster.to_h.symbolize_keys

@@ -47,6 +51,7 @@ module OodCore
       @job_config = c.fetch(:job, {}) .to_h.symbolize_keys
       @custom_config = c.fetch(:custom, {}) .to_h.symbolize_keys
       @acls_config = c.fetch(:acls, []) .map(&:to_h)
+      @batch_connect_config = c.fetch(:batch_connect, {}).to_h.symbolize_keys
     end

     # Metadata that provides extra information about this cluster
@@ -81,6 +86,25 @@ module OodCore
       build_acls(job_config.fetch(:acls, []).map(&:to_h)).all?(&:allow?)
     end

+    # The batch connect template configuration used for this cluster
+    # @param template [#to_sym, nil] the template type
+    # @return [Hash] the batch connect configuration
+    def batch_connect_config(template = nil)
+      if template
+        @batch_connect_config.fetch(template.to_sym, {}).to_h.symbolize_keys.merge(template: template.to_sym)
+      else
+        @batch_connect_config
+      end
+    end
+
+    # Build a batch connect template from the respective configuration
+    # @param context [#to_h] the context used for rendering the template
+    # @return [BatchConnect::Template] the batch connect template
+    def batch_connect_template(context = {})
+      context = context.to_h.symbolize_keys
+      BatchConnect::Factory.build batch_connect_config(context[:template] || :basic).merge(context)
+    end
+
     # The configuration for any custom features or resources for this cluster
     # @param feature [#to_sym, nil] the feature or resource
     # @return [Hash] configuration for custom feature or resource
@@ -125,12 +149,13 @@ module OodCore
     # @return [Hash] the hash describing this object
     def to_h
       {
-        id:
-        metadata:
-        login:
-        job:
-        custom:
-        acls:
+        id: id,
+        metadata: metadata_config,
+        login: login_config,
+        job: job_config,
+        custom: custom_config,
+        acls: acls_config,
+        batch_connect: batch_connect_config
       }
     end
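The new batch connect accessors above are easiest to see with a small usage sketch. This example is not part of the diff; the cluster id and the websockify_cmd option are hypothetical, and the available per-template keys depend on the templates shipped under data/lib/ood_core/batch_connect/templates.

require "ood_core"

# Hypothetical cluster configuration with a batch_connect section
cluster = OodCore::Cluster.new(
  id: "my_cluster",
  batch_connect: {
    vnc: { websockify_cmd: "/usr/bin/websockify" }
  }
)

cluster.batch_connect_config(:vnc)
#=> { websockify_cmd: "/usr/bin/websockify", template: :vnc }

# Merges the :vnc configuration with the supplied context and hands the
# result to BatchConnect::Factory.build
template = cluster.batch_connect_template(template: :vnc)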
data/lib/ood_core/errors.rb
CHANGED
@@ -16,4 +16,10 @@ module OodCore

   # Raised when a job state is set to an invalid option
   class UnknownStateAttribute < Error; end
+
+  # Raised when template not specified in configuration
+  class TemplateNotSpecified < Error; end
+
+  # Raised when cannot find template specified in configuration
+  class TemplateNotFound < Error; end
 end
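A hedged sketch (not taken from the gem itself) of where these two new errors would surface when resolving batch connect templates from a cluster's configuration; the exact raising behaviour lives in the new BatchConnect::Factory.

begin
  cluster.batch_connect_template(template: :no_such_template)
rescue OodCore::TemplateNotFound, OodCore::TemplateNotSpecified => e
  # the configured template could not be resolved to a known template,
  # or no template was specified at all
  warn e.message
end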
data/lib/ood_core/job/adapter.rb
CHANGED
@@ -4,6 +4,8 @@ module OodCore
   # submitting/statusing/holding/deleting jobs
   # @abstract
   class Adapter
+    using Refinements::ArrayExtensions
+
     # Submit a job with the attributes defined in the job template instance
     # @abstract Subclass is expected to implement {#submit}
     # @raise [NotImplementedError] if subclass did not define {#submit}
@@ -39,6 +41,15 @@
       raise NotImplementedError, "subclass did not define #info_all"
     end

+    # Retrieve info for all jobs for a given owner or owners from the
+    # resource manager
+    # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+    # @return [Array<Info>] information describing submitted jobs
+    def info_where_owner(owner)
+      owner = Array.wrap(owner).map(&:to_s)
+      info_all.select { |info| owner.include? info.job_owner }
+    end
+
     # Retrieve job info from the resource manager
     # @abstract Subclass is expected to implement {#info}
     # @raise [NotImplementedError] if subclass did not define {#info}
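To make the default behaviour concrete, here is a hypothetical usage sketch; the adapter configuration shown is illustrative, and adapters (such as the PBS Pro adapter added below) may override info_where_owner with a cheaper query.

adapter = OodCore::Job::Factory.build(adapter: "slurm", cluster: "my_cluster")

adapter.info_where_owner("alice")            # a single owner
adapter.info_where_owner(["alice", "bob"])   # or a list of owners
# both forms go through Array.wrap and then filter info_all by Info#job_owner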
data/lib/ood_core/job/adapters/lsf.rb
CHANGED
@@ -72,24 +72,9 @@
       afternotok = Array(afternotok).map(&:to_s)
       afterany = Array(afterany).map(&:to_s)

-
-      args += ["-P", script.accounting_id] unless script.accounting_id.nil?
-      args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
-      args += ["-J", script.job_name] unless script.job_name.nil?
-
-      # TODO: dependencies
-
-      env = {
-        #TODO:
-        #LSB_HOSTS?
-        #LSB_MCPU_HOSTS?
-        #SNDJOBS_TO?
-        #
-      }
-
-      # Submit job
-      batch.submit_string(script.content, args: args, env: env)
+      kwargs = helper.batch_submit_args(script, after: after, afterok: afterok, afternotok: afternotok, afterany: afterany)

+      batch.submit_string(script.content, **kwargs)
     rescue Batch::Error => e
       raise JobAdapterError, e.message
     end
@@ -170,20 +155,29 @@
     end

     def info_for_batch_hash(v)
+      nodes = helper.parse_exec_host(v[:exec_host]).map do |host|
+        NodeInfo.new(name: host[:host], procs: host[:slots])
+      end
+
+      # FIXME: estimated_runtime should be set by batch object instead of
+      dispatch_time = helper.parse_past_time(v[:start_time], ignore_errors: true)
+      finish_time = helper.parse_past_time(v[:finish_time], ignore_errors: true)
+
       Info.new(
         id: v[:id],
         status: get_state(v[:status]),
-        allocated_nodes:
+        allocated_nodes: nodes,
         submit_host: v[:from_host],
         job_name: v[:name],
         job_owner: v[:user],
         accounting_id: v[:project],
-        procs:
+        procs: nodes.any? ? nodes.map(&:procs).reduce(&:+) : 0,
         queue_name: v[:queue],
-        wallclock_time:
-        cpu_time:
+        wallclock_time: helper.estimate_runtime(current_time: Time.now, start_time: dispatch_time, finish_time: finish_time),
+        cpu_time: helper.parse_cpu_used(v[:cpu_used]),
+        # cpu_time: nil,
         submission_time: helper.parse_past_time(v[:submit_time], ignore_errors: true),
-        dispatch_time:
+        dispatch_time: dispatch_time,
         native: v
       )
     end
data/lib/ood_core/job/adapters/lsf/batch.rb
CHANGED
@@ -49,21 +49,24 @@ class OodCore::Job::Adapters::Lsf::Batch
     %w( -u all -a -w -W )
   end

-
-
-
-
-
+  # status fields available from bjobs
+  def fields
+    %i(id user status queue from_host exec_host name submit_time
+       project cpu_used mem swap pids start_time finish_time)
+  end

   # helper method
   def parse_bjobs_output(response)
     return [] if response =~ /No job found/ || response.nil?

     lines = response.split("\n")
-
+    columns = lines.shift.split
+
+    validate_bjobs_output_columns(columns)
+    jobname_column_idx = columns.find_index("JOB_NAME")

-    lines.
-    values = split_bjobs_output_line(job)
+    lines.map{ |job|
+      values = split_bjobs_output_line(job, num_columns: columns.count, jobname_column_idx: jobname_column_idx)

       # make a hash of { field: "value", etc. }
       Hash[fields.zip(values)].each_with_object({}) { |(k,v),o|
@@ -135,13 +138,21 @@ class OodCore::Job::Adapters::Lsf::Batch
   end

   # split a line of output from bjobs into field values
-  def split_bjobs_output_line(line)
+  def split_bjobs_output_line(line, num_columns:, jobname_column_idx:)
     values = line.strip.split

-    if(values.count >
-      #
-      #
-
+    if(values.count > num_columns)
+      # if the line has more fields than the number of columns, that means one
+      # field value has spaces, so it was erroneously split into
+      # multiple fields; we assume that is the jobname field, and we will
+      # collapse the fields into a single field
+      #
+      # FIXME: assumes jobname_column_idx is not first or last item
+      j = jobname_column_idx
+
+      # e.g. if 15 fields and jobname is 7th field
+      # values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
+      values = values[0..(j-1)] + [values[j..(j-num_columns)].join(" ")] + values[(j+1-num_columns)..-1]
     end

     values
@@ -151,9 +162,11 @@ class OodCore::Job::Adapters::Lsf::Batch
   def validate_bjobs_output_columns(columns)
     expected = %w(JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME
                   SUBMIT_TIME PROJ_NAME CPU_USED MEM SWAP PIDS START_TIME FINISH_TIME)
-
+    # (expected & columns) will return the columns that are the same
+    # so if there are extra columns we can just ignore those (like SLOTS in LSF 9.1)
+    if columns && ((expected & columns) != expected)
       raise Error, "bjobs output in different format than expected: " \
-        "#{columns.inspect}
+        "#{columns.inspect} did not include all columns: #{expected.inspect}"
     end
   end

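A small standalone check (hypothetical tokens, not part of the gem) of the index arithmetic used in split_bjobs_output_line to re-join a job name that contains spaces:

num_columns = 15  # expected bjobs columns
j           = 6   # JOB_NAME column index
# 6 leading fields, a 3-token job name, 8 trailing fields => 17 tokens
values = (1..6).map(&:to_s) + ["my", "long", "job"] + (1..8).map { |i| "t#{i}" }

merged = values[0..(j-1)] + [values[j..(j-num_columns)].join(" ")] + values[(j+1-num_columns)..-1]
merged.count #=> 15
merged[j]    #=> "my long job"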
data/lib/ood_core/job/adapters/lsf/helper.rb
CHANGED
@@ -23,4 +23,83 @@ class OodCore::Job::Adapters::Lsf::Helper

     nil
   end
+
+  # convert exec_host string format from bjobs to a hash
+  # i.e. "c012" => [{host: "c012", slots: 1}]
+  # i.e. "4*c012:8*c013" => [{host: "c012", slots: 4}, {host: "c013", slots: 8}]
+  def parse_exec_host(exec_host_str)
+    return [] if exec_host_str.nil? || exec_host_str.empty?
+
+    exec_host_str.scan(exec_host_regex).map do |match|
+      {host: match[2], slots: match[1] ? match[1].to_i : 1}
+    end
+  end
+
+  def exec_host_regex
+    @exec_host_regex ||= Regexp.new(/((\d+)\*)?([^:]+)/)
+  end
+
+  # given current time, dispatch time, and finish time values, estimate the
+  # runtime for a job; this estimate will be accurate if the job never enters a
+  # suspended state during its execution
+  def estimate_runtime(current_time:, start_time:, finish_time:)
+    return nil if start_time.nil?
+
+    (finish_time || current_time) - start_time
+  end
+
+  # Convert CPU_USED string to seconds
+  #
+  # example strings of cpu_used in LSF 8.3:
+  #
+  # 060:24:00.00
+  # 046:19:37.00
+  # 1118:59:09.00
+  # 000:00:00.00
+  # 000:48:18.39
+  # 003:11:36.67
+  # 003:24:40.95
+  # 50769:48:00.-48
+  # 50835:48:48.-48
+  #
+  # my guess is: hours:minutes:seconds.????
+  #
+  # @return [Fixnum, nil] cpu used as seconds
+  def parse_cpu_used(cpu_used)
+    if cpu_used =~ /^(\d+):(\d+):(\d+)\..*$/
+      $1.to_i*3600 + $2.to_i*60 + $3.to_i
+    end
+  end
+
+  def batch_submit_args(script, after: [], afterok: [], afternotok: [], afterany: [])
+    args = []
+
+    args += ["-P", script.accounting_id] unless script.accounting_id.nil?
+    args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
+    args += ["-J", script.job_name] unless script.job_name.nil?
+    args += ["-q", script.queue_name] unless script.queue_name.nil?
+    args += ["-U", script.reservation_id] unless script.reservation_id.nil?
+    args += ["-sp", script.priority] unless script.priority.nil?
+    args += ["-H"] if script.submit_as_hold
+    args += (script.rerunnable ? ["-r"] : ["-rn"]) unless script.rerunnable.nil?
+    args += ["-b", script.start_time.localtime.strftime("%Y:%m:%d:%H:%M")] unless script.start_time.nil?
+    args += ["-W", (script.wall_time / 60).to_i] unless script.wall_time.nil?
+
+    # input and output files
+    args += ["-i", script.input_path] unless script.input_path.nil?
+    args += ["-o", script.output_path] unless script.output_path.nil?
+    args += ["-e", script.error_path] unless script.error_path.nil?
+
+    # email
+    args += ["-B"] if script.email_on_started
+    args += ["-N"] if script.email_on_terminated
+    args += ["-u", script.email.join(",")] unless script.email.nil? || script.email.empty?
+
+    args += script.native unless script.native.nil?
+
+    # environment
+    env = script.job_environment || {}
+
+    {args: args, env: env}
+  end
 end
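A quick sanity check (illustrative values, not part of the diff) of the new helper parsing methods; it assumes OodCore::Job::Adapters::Lsf::Helper can be instantiated directly with no arguments.

helper = OodCore::Job::Adapters::Lsf::Helper.new

helper.parse_exec_host("4*c012:8*c013")
#=> [{host: "c012", slots: 4}, {host: "c013", slots: 8}]

helper.parse_cpu_used("003:11:36.67")
#=> 11496   # 3*3600 + 11*60 + 36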
data/lib/ood_core/job/adapters/pbspro.rb
ADDED
@@ -0,0 +1,424 @@
+require "time"
+require "ood_core/refinements/hash_extensions"
+
+module OodCore
+  module Job
+    class Factory
+      using Refinements::HashExtensions
+
+      # Build the PBS Pro adapter from a configuration
+      # @param config [#to_h] the configuration for job adapter
+      # @option config [Object] :host (nil) The batch server host
+      # @option config [Object] :exec (nil) Path to PBS Pro executables
+      # @option config [Object] :qstat_factor (nil) Deciding factor on how to
+      #   call qstat for a user
+      def self.build_pbspro(config)
+        c = config.to_h.compact.symbolize_keys
+        host = c.fetch(:host, nil)
+        exec = c.fetch(:exec, nil)
+        qstat_factor = c.fetch(:qstat_factor, nil)
+        pbspro = Adapters::PBSPro::Batch.new(host: host, exec: exec)
+        Adapters::PBSPro.new(pbspro: pbspro, qstat_factor: qstat_factor)
+      end
+    end
+
+    module Adapters
+      # An adapter object that describes the communication with a PBS Pro
+      # resource manager for job management.
+      class PBSPro < Adapter
+        using Refinements::ArrayExtensions
+        using Refinements::HashExtensions
+
+        # Object used for simplified communication with a PBS Pro batch server
+        # @api private
+        class Batch
+          # The host of the PBS Pro batch server
+          # @example
+          #   my_batch.host #=> "my_batch.server.edu"
+          # @return [String, nil] the batch server host
+          attr_reader :host
+
+          # The path containing the PBS executables
+          # @example
+          #   my_batch.exec.to_s #=> "/usr/local/pbspro/10.0.0
+          # @return [Pathname, nil] path to pbs executables
+          attr_reader :exec
+
+          # The root exception class that all PBS Pro-specific exceptions
+          # inherit from
+          class Error < StandardError; end
+
+          # @param host [#to_s, nil] the batch server host
+          # @param exec [#to_s, nil] path to pbs executables
+          def initialize(host: nil, exec: nil)
+            @host = host && host.to_s
+            @exec = exec && Pathname.new(exec.to_s)
+          end
+
+          # Get a list of hashes detailing each of the jobs on the batch server
+          # @example Status info for all jobs
+          #   my_batch.get_jobs
+          #   #=>
+          #   #[
+          #   #  {
+          #   #    :account => "account",
+          #   #    :job_id => "my_job",
+          #   #    ...
+          #   #  },
+          #   #  {
+          #   #    :account => "account",
+          #   #    :job_id => "my_other_job",
+          #   #    ...
+          #   #  },
+          #   #  ...
+          #   #]
+          # @param id [#to_s] the id of the job
+          # @raise [Error] if `qstat` command exited unsuccessfully
+          # @return [Array<Hash>] list of details for jobs
+          def get_jobs(id: "")
+            args = ["-f"] # display all information
+            args += ["-t"] # list subjobs
+            args += [id.to_s] unless id.to_s.empty?
+            lines = call("qstat", *args).gsub("\n\t", "").split("\n").map(&:strip)
+
+            jobs = []
+            lines.each do |line|
+              if /^Job Id: (?<job_id>.+)$/ =~ line
+                jobs << { job_id: job_id }
+              elsif /^(?<key>[^\s]+) = (?<value>.+)$/ =~ line
+                hsh = jobs.last
+                k1, k2 = key.split(".").map(&:to_sym)
+                k2 ? ( hsh[k1] ||= {} and hsh[k1][k2] = value ) : ( hsh[k1] = value )
+              end
+            end
+            jobs.reject { |j| /\[\]/ =~ j[:job_id] } # drop main job array jobs
+          end
+
+          # Select batch jobs from the batch server
+          # @param args [Array<#to_s>] arguments passed to `qselect` command
+          # @raise [Error] if `qselect` command exited unsuccessfully
+          # @return [Array<String>] list of job ids that match selection
+          #   criteria
+          def select_jobs(args: [])
+            call("qselect", *args).split("\n").map(&:strip)
+          end
+
+          # Put a specified job on hold
+          # @example Put job "1234" on hold
+          #   my_batch.hold_job("1234")
+          # @param id [#to_s] the id of the job
+          # @raise [Error] if `qhold` command exited unsuccessfully
+          # @return [void]
+          def hold_job(id)
+            call("qhold", id.to_s)
+          end
+
+          # Release a specified job that is on hold
+          # @example Release job "1234" from on hold
+          #   my_batch.release_job("1234")
+          # @param id [#to_s] the id of the job
+          # @raise [Error] if `qrls` command exited unsuccessfully
+          # @return [void]
+          def release_job(id)
+            call("qrls", id.to_s)
+          end
+
+          # Delete a specified job from batch server
+          # @example Delete job "1234"
+          #   my_batch.delete_job("1234")
+          # @param id [#to_s] the id of the job
+          # @raise [Error] if `qdel` command exited unsuccessfully
+          # @return [void]
+          def delete_job(id)
+            call("qdel", id.to_s)
+          end
+
+          # Submit a script expanded as a string to the batch server
+          # @param str [#to_s] script as a string
+          # @param args [Array<#to_s>] arguments passed to `qsub` command
+          # @param chdir [#to_s, nil] working directory where `qsub` is called
+          # @raise [Error] if `qsub` command exited unsuccessfully
+          # @return [String] the id of the job that was created
+          def submit_string(str, args: [], chdir: nil)
+            call("qsub", *args, stdin: str.to_s, chdir: chdir).strip
+          end
+
+          private
+            # Call a forked PBS Pro command for a given batch server
+            def call(cmd, *args, env: {}, stdin: "", chdir: nil)
+              cmd = cmd.to_s
+              cmd = exec.join("bin", cmd).to_s if exec
+              args = args.map(&:to_s)
+              env = env.to_h.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
+              env["PBS_DEFAULT"] = host.to_s if host
+              env["PBS_EXEC"] = exec.to_s if exec
+              chdir ||= "."
+              o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s, chdir: chdir.to_s)
+              s.success? ? o : raise(Error, e)
+            end
+        end
+
+        # Mapping of state codes for PBSPro
+        STATE_MAP = {
+          'Q' => :queued,
+          'W' => :queued,      # job is waiting for its submitter-assigned start time to be reached
+          'H' => :queued_held,
+          'T' => :queued_held, # job is being moved to a new location
+          'M' => :completed,   # job was moved to another server
+          'R' => :running,
+          'S' => :suspended,
+          'U' => :suspended,   # cycle-harvesting job is suspended due to keyboard activity
+          'E' => :running,     # job is exiting after having run
+          'F' => :completed,   # job is finished
+          'X' => :completed    # subjob has completed execution or has been deleted
+          # ignore B as it signifies a job array
+        }
+
+        # What percentage of jobs a user owns out of all jobs, used to decide
+        # whether we filter the owner's jobs from a `qstat` of all jobs or call
+        # `qstat` on each of the owner's individual jobs
+        # @return [Float] ratio of owner's jobs to all jobs
+        attr_reader :qstat_factor
+
+        # @api private
+        # @param opts [#to_h] the options defining this adapter
+        # @option opts [Batch] :pbspro The PBS Pro batch object
+        # @option opts [#to_f] :qstat_factor (0.10) The qstat deciding factor
+        # @see Factory.build_pbspro
+        def initialize(opts = {})
+          o = opts.to_h.compact.symbolize_keys
+
+          @pbspro = o.fetch(:pbspro) { raise ArgumentError, "No pbspro object specified. Missing argument: pbspro" }
+          @qstat_factor = o.fetch(:qstat_factor, 0.10).to_f
+        end
+
+        # Submit a job with the attributes defined in the job template instance
+        # @param script [Script] script object that describes the script and
+        #   attributes for the submitted job
+        # @param after [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution at any point after dependent jobs have started execution
+        # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution only after dependent jobs have terminated with no errors
+        # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution only after dependent jobs have terminated with errors
+        # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution after dependent jobs have terminated
+        # @raise [JobAdapterError] if something goes wrong submitting a job
+        # @return [String] the job id returned after successfully submitting a
+        #   job
+        # @see Adapter#submit
+        def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+          after = Array(after).map(&:to_s)
+          afterok = Array(afterok).map(&:to_s)
+          afternotok = Array(afternotok).map(&:to_s)
+          afterany = Array(afterany).map(&:to_s)
+
+          # Set qsub options
+          args = []
+          # ignore args, can't use these if submitting from STDIN
+          args += ["-h"] if script.submit_as_hold
+          args += ["-r", script.rerunnable ? "y" : "n"] unless script.rerunnable.nil?
+          args += ["-M", script.email.join(",")] unless script.email.nil?
+          if script.email_on_started && script.email_on_terminated
+            args += ["-m", "be"]
+          elsif script.email_on_started
+            args += ["-m", "b"]
+          elsif script.email_on_terminated
+            args += ["-m", "e"]
+          end
+          args += ["-N", script.job_name] unless script.job_name.nil?
+          # ignore input_path (not defined in PBS Pro)
+          args += ["-o", script.output_path] unless script.output_path.nil?
+          args += ["-e", script.error_path] unless script.error_path.nil?
+          # Reservations are actually just queues in PBS Pro
+          args += ["-q", script.reservation_id] if !script.reservation_id.nil? && script.queue_name.nil?
+          args += ["-q", script.queue_name] unless script.queue_name.nil?
+          args += ["-p", script.priority] unless script.priority.nil?
+          args += ["-a", script.start_time.localtime.strftime("%C%y%m%d%H%M.%S")] unless script.start_time.nil?
+          args += ["-A", script.accounting_id] unless script.accounting_id.nil?
+          args += ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
+
+          # Set dependencies
+          depend = []
+          depend << "after:#{after.join(":")}" unless after.empty?
+          depend << "afterok:#{afterok.join(":")}" unless afterok.empty?
+          depend << "afternotok:#{afternotok.join(":")}" unless afternotok.empty?
+          depend << "afterany:#{afterany.join(":")}" unless afterany.empty?
+          args += ["-W", "depend=#{depend.join(",")}"] unless depend.empty?
+
+          # Set environment variables
+          envvars = script.job_environment.to_h
+          args += ["-v", envvars.map{|k,v| "#{k}=#{v}"}.join(",")] unless envvars.empty?
+
+          # If error_path is not specified we join stdout & stderr (as this
+          # mimics what the other resource managers do)
+          args += ["-j", "oe"] if script.error_path.nil?
+
+          # Set native options
+          args += script.native if script.native
+
+          # Submit job
+          @pbspro.submit_string(script.content, args: args, chdir: script.workdir)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve info for all jobs from the resource manager
+        # @raise [JobAdapterError] if something goes wrong getting job info
+        # @return [Array<Info>] information describing submitted jobs
+        # @see Adapter#info_all
+        def info_all
+          @pbspro.get_jobs.map do |v|
+            parse_job_info(v)
+          end
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve info for all jobs for a given owner or owners from the
+        # resource manager
+        # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+        # @raise [JobAdapterError] if something goes wrong getting job info
+        # @return [Array<Info>] information describing submitted jobs
+        def info_where_owner(owner)
+          owner = Array.wrap(owner).map(&:to_s)
+
+          usr_jobs = @pbspro.select_jobs(args: ["-u", owner.join(",")])
+          all_jobs = @pbspro.select_jobs(args: ["-T"])
+
+          # `qstat` all jobs if user has too many jobs, otherwise `qstat` each
+          # individual job (default factor is 10%)
+          if usr_jobs.size > (qstat_factor * all_jobs.size)
+            super
+          else
+            usr_jobs.map { |id| info(id) }
+          end
+        end
+
+        # Retrieve job info from the resource manager
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong getting job info
+        # @return [Info] information describing submitted job
+        # @see Adapter#info
+        def info(id)
+          id = id.to_s
+          @pbspro.get_jobs(id: id).map do |v|
+            parse_job_info(v)
+          end.first || Info.new(id: id, status: :completed)
+        rescue Batch::Error => e
+          # set completed status if can't find job id
+          if /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
+            Info.new(
+              id: id,
+              status: :completed
+            )
+          else
+            raise JobAdapterError, e.message
+          end
+        end
+
+        # Retrieve job status from resource manager
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong getting job status
+        # @return [Status] status of job
+        # @see Adapter#status
+        def status(id)
+          info(id.to_s).status
+        end
+
+        # Put the submitted job on hold
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong holding a job
+        # @return [void]
+        # @see Adapter#hold
+        def hold(id)
+          @pbspro.hold_job(id.to_s)
+        rescue Batch::Error => e
+          # assume successful job hold if can't find job id
+          raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
+        end
+
+        # Release the job that is on hold
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong releasing a job
+        # @return [void]
+        # @see Adapter#release
+        def release(id)
+          @pbspro.release_job(id.to_s)
+        rescue Batch::Error => e
+          # assume successful job release if can't find job id
+          raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
+        end
+
+        # Delete the submitted job
+        # @param id [#to_s] the id of the job
+        # @raise [JobAdapterError] if something goes wrong deleting a job
+        # @return [void]
+        # @see Adapter#delete
+        def delete(id)
+          @pbspro.delete_job(id.to_s)
+        rescue Batch::Error => e
+          # assume successful job deletion if can't find job id
+          raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
+        end
+
+        private
+          # Convert duration to seconds
+          def duration_in_seconds(time)
+            time.nil? ? nil : time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
+          end
+
+          # Convert seconds to duration
+          def seconds_to_duration(time)
+            "%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
+          end
+
+          # Convert host list string to individual nodes
+          # "hosta/J1+hostb/J2*P+..."
+          # where J1 and J2 are an index of the job on the named host and P is the number of
+          # processors allocated from that host to this job. P does not appear if it is 1.
+          # Example: "i5n14/2*7" uses 7 procs on node "i5n14"
+          def parse_nodes(node_list)
+            node_list.split('+').map do |n|
+              name, procs_list = n.split('/')
+              procs = (procs_list.split('*')[1] || 1).to_i
+              {name: name, procs: procs}
+            end
+          end
+
+          # Determine state from PBS Pro state code
+          def get_state(st)
+            STATE_MAP.fetch(st, :undetermined)
+          end
+
+          # Parse hash describing PBS Pro job status
+          def parse_job_info(v)
+            /^(?<job_owner>[\w-]+)@(?<submit_host>.+)$/ =~ v[:Job_Owner]
+            allocated_nodes = parse_nodes(v[:exec_host] || "")
+            procs = allocated_nodes.inject(0) { |sum, x| sum + x[:procs] }
+            if allocated_nodes.empty? # fill in with requested resources
+              allocated_nodes = [ { name: nil } ] * v.fetch(:Resource_List, {})[:nodect].to_i
+              procs = v.fetch(:Resource_List, {})[:ncpus].to_i
+            end
+            Info.new(
+              id: v[:job_id],
+              status: get_state(v[:job_state]),
+              allocated_nodes: allocated_nodes,
+              submit_host: submit_host,
+              job_name: v[:Job_Name],
+              job_owner: job_owner,
+              accounting_id: v[:Account_Name],
+              procs: procs,
+              queue_name: v[:queue],
+              wallclock_time: duration_in_seconds(v.fetch(:resources_used, {})[:walltime]),
+              wallclock_limit: duration_in_seconds(v.fetch(:Resource_List, {})[:walltime]),
+              cpu_time: duration_in_seconds(v.fetch(:resources_used, {})[:cput]),
+              submission_time: v[:ctime] ? Time.parse(v[:ctime]) : nil,
+              dispatch_time: v[:stime] ? Time.parse(v[:stime]) : nil,
+              native: v
+            )
+          end
+      end
+    end
+  end
+end