ood_core 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,8 +33,12 @@ module OodCore
33
33
  # @option cluster [#to_h] :metadata ({}) The cluster's metadata
34
34
  # @option cluster [#to_h] :login ({}) The cluster's SSH host
35
35
  # @option cluster [#to_h] :job ({}) The job adapter for this cluster
36
- # @option cluster [#to_h] :custom ({}) Any custom resources for this cluster
37
- # @option cluster [Array<#to_h>] :acls ([]) List of ACLs to validate against
36
+ # @option cluster [#to_h] :custom ({}) Any custom resources for this
37
+ # cluster
38
+ # @option cluster [Array<#to_h>] :acls ([]) List of ACLs to validate
39
+ # against
40
+ # @option cluster [#to_h] :batch_connect ({}) Configuration for batch
41
+ # connect templates
38
42
  def initialize(cluster)
39
43
  c = cluster.to_h.symbolize_keys
40
44
 
@@ -47,6 +51,7 @@ module OodCore
47
51
  @job_config = c.fetch(:job, {}) .to_h.symbolize_keys
48
52
  @custom_config = c.fetch(:custom, {}) .to_h.symbolize_keys
49
53
  @acls_config = c.fetch(:acls, []) .map(&:to_h)
54
+ @batch_connect_config = c.fetch(:batch_connect, {}).to_h.symbolize_keys
50
55
  end
51
56
 
52
57
  # Metadata that provides extra information about this cluster
@@ -81,6 +86,25 @@ module OodCore
81
86
  build_acls(job_config.fetch(:acls, []).map(&:to_h)).all?(&:allow?)
82
87
  end
83
88
 
89
+ # The batch connect template configuration used for this cluster
90
+ # @param template [#to_sym, nil] the template type
91
+ # @return [Hash] the batch connect configuration
92
+ def batch_connect_config(template = nil)
93
+ if template
94
+ @batch_connect_config.fetch(template.to_sym, {}).to_h.symbolize_keys.merge(template: template.to_sym)
95
+ else
96
+ @batch_connect_config
97
+ end
98
+ end
99
+
100
+ # Build a batch connect template from the respective configuration
101
+ # @param context [#to_h] the context used for rendering the template
102
+ # @return [BatchConnect::Template] the batch connect template
103
+ def batch_connect_template(context = {})
104
+ context = context.to_h.symbolize_keys
105
+ BatchConnect::Factory.build batch_connect_config(context[:template] || :basic).merge(context)
106
+ end
107
+
84
108
  # The configuration for any custom features or resources for this cluster
85
109
  # @param feature [#to_sym, nil] the feature or resource
86
110
  # @return [Hash] configuration for custom feature or resource
@@ -125,12 +149,13 @@ module OodCore
125
149
  # @return [Hash] the hash describing this object
126
150
  def to_h
127
151
  {
128
- id: id,
129
- metadata: metadata_config,
130
- login: login_config,
131
- job: job_config,
132
- custom: custom_config,
133
- acls: acls_config
152
+ id: id,
153
+ metadata: metadata_config,
154
+ login: login_config,
155
+ job: job_config,
156
+ custom: custom_config,
157
+ acls: acls_config,
158
+ batch_connect: batch_connect_config
134
159
  }
135
160
  end
136
161
 
@@ -16,4 +16,10 @@ module OodCore
16
16
 
17
17
  # Raised when a job state is set to an invalid option
18
18
  class UnknownStateAttribute < Error; end
19
+
20
+ # Raised when template not specified in configuration
21
+ class TemplateNotSpecified < Error; end
22
+
23
+ # Raised when cannot find template specified in configuration
24
+ class TemplateNotFound < Error; end
19
25
  end
@@ -4,6 +4,8 @@ module OodCore
4
4
  # submitting/statusing/holding/deleting jobs
5
5
  # @abstract
6
6
  class Adapter
7
+ using Refinements::ArrayExtensions
8
+
7
9
  # Submit a job with the attributes defined in the job template instance
8
10
  # @abstract Subclass is expected to implement {#submit}
9
11
  # @raise [NotImplementedError] if subclass did not define {#submit}
@@ -39,6 +41,15 @@ module OodCore
39
41
  raise NotImplementedError, "subclass did not define #info_all"
40
42
  end
41
43
 
44
+ # Retrieve info for all jobs for a given owner or owners from the
45
+ # resource manager
46
+ # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
47
+ # @return [Array<Info>] information describing submitted jobs
48
+ def info_where_owner(owner)
49
+ owner = Array.wrap(owner).map(&:to_s)
50
+ info_all.select { |info| owner.include? info.job_owner }
51
+ end
52
+
42
53
  # Retrieve job info from the resource manager
43
54
  # @abstract Subclass is expected to implement {#info}
44
55
  # @raise [NotImplementedError] if subclass did not define {#info}
@@ -72,24 +72,9 @@ module OodCore
72
72
  afternotok = Array(afternotok).map(&:to_s)
73
73
  afterany = Array(afterany).map(&:to_s)
74
74
 
75
- args = []
76
- args += ["-P", script.accounting_id] unless script.accounting_id.nil?
77
- args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
78
- args += ["-J", script.job_name] unless script.job_name.nil?
79
-
80
- # TODO: dependencies
81
-
82
- env = {
83
- #TODO:
84
- #LSB_HOSTS?
85
- #LSB_MCPU_HOSTS?
86
- #SNDJOBS_TO?
87
- #
88
- }
89
-
90
- # Submit job
91
- batch.submit_string(script.content, args: args, env: env)
75
+ kwargs = helper.batch_submit_args(script, after: after, afterok: afterok, afternotok: afternotok, afterany: afterany)
92
76
 
77
+ batch.submit_string(script.content, **kwargs)
93
78
  rescue Batch::Error => e
94
79
  raise JobAdapterError, e.message
95
80
  end
@@ -170,20 +155,29 @@ module OodCore
170
155
  end
171
156
 
172
157
  def info_for_batch_hash(v)
158
+ nodes = helper.parse_exec_host(v[:exec_host]).map do |host|
159
+ NodeInfo.new(name: host[:host], procs: host[:slots])
160
+ end
161
+
162
+ # FIXME: estimated_runtime should be set by the batch object instead of being computed here in the adapter
163
+ dispatch_time = helper.parse_past_time(v[:start_time], ignore_errors: true)
164
+ finish_time = helper.parse_past_time(v[:finish_time], ignore_errors: true)
165
+
173
166
  Info.new(
174
167
  id: v[:id],
175
168
  status: get_state(v[:status]),
176
- allocated_nodes: [],
169
+ allocated_nodes: nodes,
177
170
  submit_host: v[:from_host],
178
171
  job_name: v[:name],
179
172
  job_owner: v[:user],
180
173
  accounting_id: v[:project],
181
- procs: nil,
174
+ procs: nodes.any? ? nodes.map(&:procs).reduce(&:+) : 0,
182
175
  queue_name: v[:queue],
183
- wallclock_time: nil,
184
- cpu_time: nil,
176
+ wallclock_time: helper.estimate_runtime(current_time: Time.now, start_time: dispatch_time, finish_time: finish_time),
177
+ cpu_time: helper.parse_cpu_used(v[:cpu_used]),
178
+ # cpu_time: nil,
185
179
  submission_time: helper.parse_past_time(v[:submit_time], ignore_errors: true),
186
- dispatch_time: helper.parse_past_time(v[:start_time], ignore_errors: true),
180
+ dispatch_time: dispatch_time,
187
181
  native: v
188
182
  )
189
183
  end
@@ -49,21 +49,24 @@ class OodCore::Job::Adapters::Lsf::Batch
49
49
  %w( -u all -a -w -W )
50
50
  end
51
51
 
52
- # status fields available from bjobs
53
- def fields
54
- %i(id user status queue from_host exec_host name submit_time
55
- project cpu_used mem swap pids start_time finish_time)
56
- end
52
+ # status fields available from bjobs
53
+ def fields
54
+ %i(id user status queue from_host exec_host name submit_time
55
+ project cpu_used mem swap pids start_time finish_time)
56
+ end
57
57
 
58
58
  # helper method
59
59
  def parse_bjobs_output(response)
60
60
  return [] if response =~ /No job found/ || response.nil?
61
61
 
62
62
  lines = response.split("\n")
63
- validate_bjobs_output_columns(lines.first.split)
63
+ columns = lines.shift.split
64
+
65
+ validate_bjobs_output_columns(columns)
66
+ jobname_column_idx = columns.find_index("JOB_NAME")
64
67
 
65
- lines.drop(1).map{ |job|
66
- values = split_bjobs_output_line(job)
68
+ lines.map{ |job|
69
+ values = split_bjobs_output_line(job, num_columns: columns.count, jobname_column_idx: jobname_column_idx)
67
70
 
68
71
  # make a hash of { field: "value", etc. }
69
72
  Hash[fields.zip(values)].each_with_object({}) { |(k,v),o|
@@ -135,13 +138,21 @@ class OodCore::Job::Adapters::Lsf::Batch
135
138
  end
136
139
 
137
140
  # split a line of output from bjobs into field values
138
- def split_bjobs_output_line(line)
141
+ def split_bjobs_output_line(line, num_columns:, jobname_column_idx:)
139
142
  values = line.strip.split
140
143
 
141
- if(values.count > 15)
142
- # FIXME: hack assumes 15 fields & only job name may have spaces
143
- # collapse >15 fields into 15, assumes 7th field is JOB_NAME
144
- values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
144
+ if(values.count > num_columns)
145
+ # if the line has more fields than the number of columns, that means one
146
+ # field value has spaces, so it was erroneously split into
147
+ # multiple fields; we assume that is the jobname field, and we will
148
+ # collapse the fields into a single field
149
+ #
150
+ # FIXME: assumes jobname_column_idx is not first or last item
151
+ j = jobname_column_idx
152
+
153
+ # e.g. if 15 fields and jobname is 7th field
154
+ # values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
155
+ values = values[0..(j-1)] + [values[j..(j-num_columns)].join(" ")] + values[(j+1-num_columns)..-1]
145
156
  end
146
157
 
147
158
  values
@@ -151,9 +162,11 @@ class OodCore::Job::Adapters::Lsf::Batch
151
162
  def validate_bjobs_output_columns(columns)
152
163
  expected = %w(JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME
153
164
  SUBMIT_TIME PROJ_NAME CPU_USED MEM SWAP PIDS START_TIME FINISH_TIME)
154
- if columns != expected
165
+ # (expected & columns) will return the columns that are the same
166
+ # so if there are extra columns we can just ignore those (like SLOTS in LSF 9.1)
167
+ if columns && ((expected & columns) != expected)
155
168
  raise Error, "bjobs output in different format than expected: " \
156
- "#{columns.inspect} instead of #{expected.inspect}"
169
+ "#{columns.inspect} did not include all columns: #{expected.inspect}"
157
170
  end
158
171
  end
159
172
 
@@ -23,4 +23,83 @@ class OodCore::Job::Adapters::Lsf::Helper
23
23
 
24
24
  nil
25
25
  end
26
+
27
+ # convert exec_host string format from bjobs to a hash
28
+ # i.e. "c012" => [{host: "c012", slots: 1}]
29
+ # i.e. "4*c012:8*c013" => [{host: "c012", slots: 4}, {host: "c013", slots: 8}]
30
# Break a bjobs EXEC_HOST value into one entry per host.
#
# @example
#   parse_exec_host("c012")          #=> [{host: "c012", slots: 1}]
#   parse_exec_host("4*c012:8*c013") #=> [{host: "c012", slots: 4}, {host: "c013", slots: 8}]
# @param exec_host_str [String, nil] colon separated "[slots*]host" entries
# @return [Array<Hash>] one `{host:, slots:}` hash per entry; empty when the
#   input is nil or empty
def parse_exec_host(exec_host_str)
  if exec_host_str.nil? || exec_host_str.empty?
    []
  else
    # each scan result is [slots-with-star, slots, host]
    exec_host_str.scan(exec_host_regex).map do |_prefix, slot_count, hostname|
      slots = slot_count.nil? ? 1 : slot_count.to_i
      {host: hostname, slots: slots}
    end
  end
end

# Pattern for a single "[slots*]host" entry; built once and then reused.
#
# @return [Regexp] pattern with captures: "N*" prefix, N, host name
def exec_host_regex
  return @exec_host_regex if @exec_host_regex

  @exec_host_regex = Regexp.new(/((\d+)\*)?([^:]+)/)
end
41
+
42
+ # given current time, dispatch time, and finish time values, estimate the
43
+ # runtime for a job; this estimate will be accurate if the job never enters a
44
+ # suspended state during its execution
45
# Estimate how long a job has been (or was) running.
#
# The result is the span from start_time to finish_time, or to current_time
# when no finish_time is given. Any time the job spent suspended is not
# subtracted, so the value is only an estimate.
#
# @param current_time [Time] the time to measure against for unfinished jobs
# @param start_time [Time, nil] when the job was dispatched
# @param finish_time [Time, nil] when the job finished, if it has
# @return [Numeric, nil] difference between the two times, or nil when
#   start_time is nil
def estimate_runtime(current_time:, start_time:, finish_time:)
  unless start_time.nil?
    end_time = finish_time ? finish_time : current_time
    end_time - start_time
  end
end
50
+
51
+ # Convert CPU_USED string to seconds
52
+ #
53
+ # example strings of cpu_used in LSF 8.3:
54
+ #
55
+ # 060:24:00.00
56
+ # 046:19:37.00
57
+ # 1118:59:09.00
58
+ # 000:00:00.00
59
+ # 000:48:18.39
60
+ # 003:11:36.67
61
+ # 003:24:40.95
62
+ # 50769:48:00.-48
63
+ # 50835:48:48.-48
64
+ #
65
+ # my guess is: hours:minutes:seconds.????
66
+ #
67
+ # @return [Fixnum, nil] cpu used as seconds
68
# Convert a bjobs CPU_USED value ("hours:minutes:seconds[.fraction]") to
# whole seconds.
#
# The whole (whitespace-stripped) value must match; the fractional suffix is
# optional and is ignored, since values such as "50769:48:00.-48" show it is
# not a reliable number.
#
# @example
#   parse_cpu_used("000:48:18.39") #=> 2898
#   parse_cpu_used("-")            #=> nil
# @param cpu_used [#to_s, nil] the CPU_USED column value
# @return [Integer, nil] cpu time in seconds, or nil when the value does not
#   have the expected format
def parse_cpu_used(cpu_used)
  # \A / \z anchor the entire string; ^ / $ would also match a single line
  # buried inside multi-line text
  pattern = /\A(?<hours>\d+):(?<minutes>\d+):(?<seconds>\d+)(?:\..*)?\z/
  match = pattern.match(cpu_used.to_s.strip)
  return nil unless match

  match[:hours].to_i * 3600 + match[:minutes].to_i * 60 + match[:seconds].to_i
end
73
+
74
# Translate a job script description into the arguments and environment
# used for submission.
#
# Every option is only added when the matching script attribute is set.
# Dependencies are combined into a single `-w` expression joined with "&&".
# The mapping (after -> started, afterok -> done, afternotok -> exit,
# afterany -> ended) follows the bsub dependency conditions.
#
# @param script [#accounting_id, #workdir, #job_name, ...] object describing
#   the job to submit
# @param after [#to_s, Array<#to_s>] ids of jobs that must have started
# @param afterok [#to_s, Array<#to_s>] ids of jobs that must have finished
#   without errors
# @param afternotok [#to_s, Array<#to_s>] ids of jobs that must have finished
#   with errors
# @param afterany [#to_s, Array<#to_s>] ids of jobs that must have finished
# @return [Hash] `{args: Array, env: Hash}`
def batch_submit_args(script, after: [], afterok: [], afternotok: [], afterany: [])
  args = []

  args += ["-P", script.accounting_id] unless script.accounting_id.nil?
  args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
  args += ["-J", script.job_name] unless script.job_name.nil?
  args += ["-q", script.queue_name] unless script.queue_name.nil?
  args += ["-U", script.reservation_id] unless script.reservation_id.nil?
  args += ["-sp", script.priority] unless script.priority.nil?
  args += ["-H"] if script.submit_as_hold
  args += (script.rerunnable ? ["-r"] : ["-rn"]) unless script.rerunnable.nil?
  args += ["-b", script.start_time.localtime.strftime("%Y:%m:%d:%H:%M")] unless script.start_time.nil?
  # presumably wall_time is in seconds and -W expects minutes -- confirm
  args += ["-W", (script.wall_time / 60).to_i] unless script.wall_time.nil?

  # input and output files
  args += ["-i", script.input_path] unless script.input_path.nil?
  args += ["-o", script.output_path] unless script.output_path.nil?
  args += ["-e", script.error_path] unless script.error_path.nil?

  # email
  args += ["-B"] if script.email_on_started
  args += ["-N"] if script.email_on_terminated
  args += ["-u", script.email.join(",")] unless script.email.nil? || script.email.empty?

  # dependencies: previously accepted but silently dropped
  conditions = []
  conditions += Array(after).map { |id| "started(#{id})" }
  conditions += Array(afterok).map { |id| "done(#{id})" }
  conditions += Array(afternotok).map { |id| "exit(#{id})" }
  conditions += Array(afterany).map { |id| "ended(#{id})" }
  args += ["-w", conditions.join(" && ")] unless conditions.empty?

  # native options go last
  args += script.native unless script.native.nil?

  # environment
  env = script.job_environment || {}

  {args: args, env: env}
end
26
105
  end
@@ -0,0 +1,424 @@
1
+ require "time"
2
+ require "ood_core/refinements/hash_extensions"
3
+
4
+ module OodCore
5
+ module Job
6
+ class Factory
7
+ using Refinements::HashExtensions
8
+
9
+ # Build the PBS Pro adapter from a configuration
10
+ # @param config [#to_h] the configuration for job adapter
11
+ # @option config [Object] :host (nil) The batch server host
12
+ # @option config [Object] :exec (nil) Path to PBS Pro executables
13
+ # @option config [Object] :qstat_factor (nil) Deciding factor on how to
14
+ # call qstat for a user
15
+ def self.build_pbspro(config)
16
+ c = config.to_h.compact.symbolize_keys
17
+ host = c.fetch(:host, nil)
18
+ exec = c.fetch(:exec, nil)
19
+ qstat_factor = c.fetch(:qstat_factor, nil)
20
+ pbspro = Adapters::PBSPro::Batch.new(host: host, exec: exec)
21
+ Adapters::PBSPro.new(pbspro: pbspro, qstat_factor: qstat_factor)
22
+ end
23
+ end
24
+
25
+ module Adapters
26
+ # An adapter object that describes the communication with a PBS Pro
27
+ # resource manager for job management.
28
+ class PBSPro < Adapter
29
+ using Refinements::ArrayExtensions
30
+ using Refinements::HashExtensions
31
+
32
+ # Object used for simplified communication with a PBS Pro batch server
33
+ # @api private
34
+ class Batch
35
+ # The host of the PBS Pro batch server
36
+ # @example
37
+ # my_batch.host #=> "my_batch.server.edu"
38
+ # @return [String, nil] the batch server host
39
+ attr_reader :host
40
+
41
+ # The path containing the PBS executables
42
+ # @example
43
+ # my_batch.exec.to_s #=> "/usr/local/pbspro/10.0.0"
44
+ # @return [Pathname, nil] path to pbs executables
45
+ attr_reader :exec
46
+
47
+ # The root exception class that all PBS Pro-specific exceptions
48
+ # inherit from
49
+ class Error < StandardError; end
50
+
51
+ # @param host [#to_s, nil] the batch server host
52
+ # @param exec [#to_s, nil] path to pbs executables
53
+ def initialize(host: nil, exec: nil)
54
+ @host = host && host.to_s
55
+ @exec = exec && Pathname.new(exec.to_s)
56
+ end
57
+
58
+ # Get a list of hashes detailing each of the jobs on the batch server
59
+ # @example Status info for all jobs
60
+ # my_batch.get_jobs
61
+ # #=>
62
+ # #[
63
+ # # {
64
+ # # :account => "account",
65
+ # # :job_id => "my_job",
66
+ # # ...
67
+ # # },
68
+ # # {
69
+ # # :account => "account",
70
+ # # :job_id => "my_other_job",
71
+ # # ...
72
+ # # },
73
+ # # ...
74
+ # #]
75
+ # @param id [#to_s] the id of the job
76
+ # @raise [Error] if `qstat` command exited unsuccessfully
77
+ # @return [Array<Hash>] list of details for jobs
78
# Get a list of hashes detailing each of the jobs on the batch server.
#
# Runs `qstat -f -t [id]` and parses its output: a "Job Id: ..." line starts
# a new job hash and every following "key = value" line is stored in it.
# A dotted key such as "Resource_List.ncpus" is stored as a nested hash
# (`hash[:Resource_List][:ncpus]`). Entries whose job id contains "[]"
# (the parent entry of a job array) are dropped.
#
# @param id [#to_s] the id of the job; all jobs are listed when empty
# @raise [Error] if `qstat` command exited unsuccessfully
# @return [Array<Hash>] list of details for jobs
def get_jobs(id: "")
  args = ["-f", "-t"]   # display all information, list subjobs
  args << id.to_s unless id.to_s.empty?
  # a continuation line starts with a tab; glue it back onto its attribute
  lines = call("qstat", *args).gsub("\n\t", "").split("\n").map(&:strip)

  jobs = []
  lines.each do |line|
    if /^Job Id: (?<job_id>.+)$/ =~ line
      jobs << { job_id: job_id }
    elsif /^(?<key>[^\s]+) = (?<value>.+)$/ =~ line
      hsh = jobs.last
      next if hsh.nil?   # attribute seen before any "Job Id:" header

      k1, k2 = key.split(".").map(&:to_sym)
      if k2
        (hsh[k1] ||= {})[k2] = value
      else
        hsh[k1] = value
      end
    end
  end
  jobs.reject { |j| /\[\]/ =~ j[:job_id] }   # drop main job array jobs
end
96
+
97
+ # Select batch jobs from the batch server
98
+ # @param args [Array<#to_s>] arguments passed to `qselect` command
99
+ # @raise [Error] if `qselect` command exited unsuccessfully
100
+ # @return [Array<String>] list of job ids that match selection
101
+ # criteria
102
+ def select_jobs(args: [])
103
+ call("qselect", *args).split("\n").map(&:strip)
104
+ end
105
+
106
+ # Put a specified job on hold
107
+ # @example Put job "1234" on hold
108
+ # my_batch.hold_job("1234")
109
+ # @param id [#to_s] the id of the job
110
+ # @raise [Error] if `qhold` command exited unsuccessfully
111
+ # @return [void]
112
+ def hold_job(id)
113
+ call("qhold", id.to_s)
114
+ end
115
+
116
+ # Release a specified job that is on hold
117
+ # @example Release job "1234" from on hold
118
+ # my_batch.release_job("1234")
119
+ # @param id [#to_s] the id of the job
120
+ # @raise [Error] if `qrls` command exited unsuccessfully
121
+ # @return [void]
122
+ def release_job(id)
123
+ call("qrls", id.to_s)
124
+ end
125
+
126
+ # Delete a specified job from batch server
127
+ # @example Delete job "1234"
128
+ # my_batch.delete_job("1234")
129
+ # @param id [#to_s] the id of the job
130
+ # @raise [Error] if `qdel` command exited unsuccessfully
131
+ # @return [void]
132
+ def delete_job(id)
133
+ call("qdel", id.to_s)
134
+ end
135
+
136
+ # Submit a script expanded as a string to the batch server
137
+ # @param str [#to_s] script as a string
138
+ # @param args [Array<#to_s>] arguments passed to `qsub` command
139
+ # @param chdir [#to_s, nil] working directory where `qsub` is called
140
+ # @raise [Error] if `qsub` command exited unsuccessfully
141
+ # @return [String] the id of the job that was created
142
+ def submit_string(str, args: [], chdir: nil)
143
+ call("qsub", *args, stdin: str.to_s, chdir: chdir).strip
144
+ end
145
+
146
+ private
147
+ # Call a forked PBS Pro command for a given batch server
148
+ def call(cmd, *args, env: {}, stdin: "", chdir: nil)
149
+ cmd = cmd.to_s
150
+ cmd = exec.join("bin", cmd).to_s if exec
151
+ args = args.map(&:to_s)
152
+ env = env.to_h.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
153
+ env["PBS_DEFAULT"] = host.to_s if host
154
+ env["PBS_EXEC"] = exec.to_s if exec
155
+ chdir ||= "."
156
+ o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s, chdir: chdir.to_s)
157
+ s.success? ? o : raise(Error, e)
158
+ end
159
+ end
160
+
161
+ # Mapping of state codes for PBSPro
162
+ STATE_MAP = {
163
+ 'Q' => :queued,
164
+ 'W' => :queued, # job is waiting for its submitter-assigned start time to be reached
165
+ 'H' => :queued_held,
166
+ 'T' => :queued_held, # job is being moved to a new location
167
+ 'M' => :completed, # job was moved to another server
168
+ 'R' => :running,
169
+ 'S' => :suspended,
170
+ 'U' => :suspended, # cycle-harvesting job is suspended due to keyboard activity
171
+ 'E' => :running, # job is exiting after having run
172
+ 'F' => :completed, # job is finished
173
+ 'X' => :completed # subjob has completed execution or has been deleted
174
+ # ignore B as it signifies a job array
175
+ }
176
+
177
+ # What percentage of jobs a user owns out of all jobs, used to decide
178
+ # whether we filter the owner's jobs from a `qstat` of all jobs or call
179
+ # `qstat` on each of the owner's individual jobs
180
+ # @return [Float] ratio of owner's jobs to all jobs
181
+ attr_reader :qstat_factor
182
+
183
+ # @api private
184
+ # @param opts [#to_h] the options defining this adapter
185
+ # @option opts [Batch] :pbspro The PBS Pro batch object
186
+ # @option opts [#to_f] :qstat_factor (0.10) The qstat deciding factor
187
+ # @see Factory.build_pbspro
188
+ def initialize(opts = {})
189
+ o = opts.to_h.compact.symbolize_keys
190
+
191
+ @pbspro = o.fetch(:pbspro) { raise ArgumentError, "No pbspro object specified. Missing argument: pbspro" }
192
+ @qstat_factor = o.fetch(:qstat_factor, 0.10).to_f
193
+ end
194
+
195
+ # Submit a job with the attributes defined in the job template instance
196
+ # @param script [Script] script object that describes the script and
197
+ # attributes for the submitted job
198
+ # @param after [#to_s, Array<#to_s>] this job may be scheduled for
199
+ # execution at any point after dependent jobs have started execution
200
+ # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
201
+ # execution only after dependent jobs have terminated with no errors
202
+ # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
203
+ # execution only after dependent jobs have terminated with errors
204
+ # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
205
+ # execution after dependent jobs have terminated
206
+ # @raise [JobAdapterError] if something goes wrong submitting a job
207
+ # @return [String] the job id returned after successfully submitting a
208
+ # job
209
+ # @see Adapter#submit
210
# Submit a job with the attributes defined in the job template instance
#
# Builds the `qsub` argument list from the script attributes that are set and
# passes the script content to the batch object. No email recipient option is
# emitted when the recipient list is nil or empty.
#
# @param script [Script] script object that describes the script and
#   attributes for the submitted job
# @param after [#to_s, Array<#to_s>] this job may be scheduled for
#   execution at any point after dependent jobs have started execution
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with no errors
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with errors
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
#   execution after dependent jobs have terminated
# @raise [JobAdapterError] if something goes wrong submitting a job
# @return [String] the job id returned after successfully submitting a job
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  after      = Array(after).map(&:to_s)
  afterok    = Array(afterok).map(&:to_s)
  afternotok = Array(afternotok).map(&:to_s)
  afterany   = Array(afterany).map(&:to_s)

  # Set qsub options
  args = []
  # ignore args, can't use these if submitting from STDIN
  args += ["-h"] if script.submit_as_hold
  args += ["-r", script.rerunnable ? "y" : "n"] unless script.rerunnable.nil?
  # an empty recipient list would otherwise produce `-M ""`
  args += ["-M", script.email.join(",")] unless script.email.nil? || script.email.empty?
  mail_events = [
    ("b" if script.email_on_started),
    ("e" if script.email_on_terminated)
  ].compact.join
  args += ["-m", mail_events] unless mail_events.empty?
  args += ["-N", script.job_name] unless script.job_name.nil?
  # ignore input_path (not defined in PBS Pro)
  args += ["-o", script.output_path] unless script.output_path.nil?
  args += ["-e", script.error_path] unless script.error_path.nil?
  # Reservations are actually just queues in PBS Pro; an explicit queue
  # name takes precedence over the reservation id
  args += ["-q", script.reservation_id] if !script.reservation_id.nil? && script.queue_name.nil?
  args += ["-q", script.queue_name] unless script.queue_name.nil?
  args += ["-p", script.priority] unless script.priority.nil?
  args += ["-a", script.start_time.localtime.strftime("%C%y%m%d%H%M.%S")] unless script.start_time.nil?
  args += ["-A", script.accounting_id] unless script.accounting_id.nil?
  args += ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?

  # Set dependencies
  depend = []
  depend << "after:#{after.join(":")}"           unless after.empty?
  depend << "afterok:#{afterok.join(":")}"       unless afterok.empty?
  depend << "afternotok:#{afternotok.join(":")}" unless afternotok.empty?
  depend << "afterany:#{afterany.join(":")}"     unless afterany.empty?
  args += ["-W", "depend=#{depend.join(",")}"] unless depend.empty?

  # Set environment variables
  envvars = script.job_environment.to_h
  args += ["-v", envvars.map { |k, v| "#{k}=#{v}" }.join(",")] unless envvars.empty?

  # If error_path is not specified we join stdout & stderr (as this
  # mimics what the other resource managers do)
  args += ["-j", "oe"] if script.error_path.nil?

  # Set native options
  args += script.native if script.native

  # Submit job
  @pbspro.submit_string(script.content, args: args, chdir: script.workdir)
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
265
+
266
+ # Retrieve info for all jobs from the resource manager
267
+ # @raise [JobAdapterError] if something goes wrong getting job info
268
+ # @return [Array<Info>] information describing submitted jobs
269
+ # @see Adapter#info_all
270
+ def info_all
271
+ @pbspro.get_jobs.map do |v|
272
+ parse_job_info(v)
273
+ end
274
+ rescue Batch::Error => e
275
+ raise JobAdapterError, e.message
276
+ end
277
+
278
+ # Retrieve info for all jobs for a given owner or owners from the
279
+ # resource manager
280
+ # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
281
+ # @raise [JobAdapterError] if something goes wrong getting job info
282
+ # @return [Array<Info>] information describing submitted jobs
283
# Retrieve info for all jobs for a given owner or owners from the
# resource manager
#
# Two `qselect` calls count the owner's jobs and all jobs. When the owner's
# share exceeds {#qstat_factor} the parent implementation is used;
# otherwise each of the owner's jobs is looked up individually.
#
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
def info_where_owner(owner)
  owner = Array.wrap(owner).map(&:to_s)

  usr_jobs = @pbspro.select_jobs(args: ["-u", owner.join(",")])
  all_jobs = @pbspro.select_jobs(args: ["-T"])

  # `qstat` all jobs if user has too many jobs, otherwise `qstat` each
  # individual job (default factor is 10%)
  if usr_jobs.size > (qstat_factor * all_jobs.size)
    super
  else
    usr_jobs.map { |id| info(id) }
  end
rescue Batch::Error => e
  # a failing `qselect` must surface as the documented adapter error
  raise JobAdapterError, e.message
end
297
+
298
+ # Retrieve job info from the resource manager
299
+ # @param id [#to_s] the id of the job
300
+ # @raise [JobAdapterError] if something goes wrong getting job info
301
+ # @return [Info] information describing submitted job
302
+ # @see Adapter#info
303
+ def info(id)
304
+ id = id.to_s
305
+ @pbspro.get_jobs(id: id).map do |v|
306
+ parse_job_info(v)
307
+ end.first || Info.new(id: id, status: :completed)
308
+ rescue Batch::Error => e
309
+ # set completed status if can't find job id
310
+ if /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
311
+ Info.new(
312
+ id: id,
313
+ status: :completed
314
+ )
315
+ else
316
+ raise JobAdapterError, e.message
317
+ end
318
+ end
319
+
320
+ # Retrieve job status from resource manager
321
+ # @param id [#to_s] the id of the job
322
+ # @raise [JobAdapterError] if something goes wrong getting job status
323
+ # @return [Status] status of job
324
+ # @see Adapter#status
325
+ def status(id)
326
+ info(id.to_s).status
327
+ end
328
+
329
+ # Put the submitted job on hold
330
+ # @param id [#to_s] the id of the job
331
+ # @raise [JobAdapterError] if something goes wrong holding a job
332
+ # @return [void]
333
+ # @see Adapter#hold
334
+ def hold(id)
335
+ @pbspro.hold_job(id.to_s)
336
+ rescue Batch::Error => e
337
+ # assume successful job hold if can't find job id
338
+ raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
339
+ end
340
+
341
+ # Release the job that is on hold
342
+ # @param id [#to_s] the id of the job
343
+ # @raise [JobAdapterError] if something goes wrong releasing a job
344
+ # @return [void]
345
+ # @see Adapter#release
346
+ def release(id)
347
+ @pbspro.release_job(id.to_s)
348
+ rescue Batch::Error => e
349
+ # assume successful job release if can't find job id
350
+ raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
351
+ end
352
+
353
+ # Delete the submitted job
354
+ # @param id [#to_s] the id of the job
355
+ # @raise [JobAdapterError] if something goes wrong deleting a job
356
+ # @return [void]
357
+ # @see Adapter#delete
358
+ def delete(id)
359
+ @pbspro.delete_job(id.to_s)
360
+ rescue Batch::Error => e
361
+ # assume successful job deletion if can't find job id
362
+ raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
363
+ end
364
+
365
+ private
366
+ # Convert duration to seconds
367
# Convert a colon separated duration (e.g. "HH:MM:SS") into seconds.
# Each field is weighted 60 times the field to its right.
#
# @param time [String, nil] the duration string
# @return [Integer, nil] total seconds, or nil when no duration was given
def duration_in_seconds(time)
  return nil if time.nil?

  time.split(':').reduce(0) { |acc, field| acc * 60 + field.to_i }
end
370
+
371
+ # Convert seconds to duration
372
# Format a number of seconds as "HH:MM:SS". Each field is zero padded to at
# least two digits; the hours field grows beyond two digits when needed.
#
# @param time [Numeric] the number of seconds
# @return [String] the formatted duration
def seconds_to_duration(time)
  hours, remainder = time.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
375
+
376
+ # Convert host list string to individual nodes
377
+ # "hosta/J1+hostb/J2*P+..."
378
+ # where J1 and J2 are an index of the job on the named host and P is the number of
379
+ # processors allocated from that host to this job. P does not appear if it is 1.
380
+ # Example: "i5n14/2*7" uses 7 procs on node "i5n14"
381
# Convert a host list string into individual node hashes.
#
# The expected format is "hosta/J1+hostb/J2*P+...": entries are separated
# by "+", and the number after "*" is the processor count P, which defaults
# to 1 when absent. An entry with no "/" part is accepted and also counts
# as 1 processor.
#
# @example
#   parse_nodes("i5n14/2*7") #=> [{name: "i5n14", procs: 7}]
# @param node_list [#to_s, nil] the host list string
# @return [Array<Hash>] one `{name:, procs:}` hash per entry; empty for a
#   nil or empty list
def parse_nodes(node_list)
  node_list.to_s.split('+').map do |n|
    name, procs_list = n.split('/')
    # procs_list is nil when the entry has no "/" part
    procs = (procs_list.to_s.split('*')[1] || 1).to_i
    {name: name, procs: procs}
  end
end
388
+
389
+ # Determine state from PBS Pro state code
390
+ def get_state(st)
391
+ STATE_MAP.fetch(st, :undetermined)
392
+ end
393
+
394
+ # Parse hash describing PBS Pro job status
395
# Build an Info object from a hash describing a PBS Pro job's status.
#
# The `Job_Owner` value is split at the "@" into the owner and the submit
# host. The owner part may contain any non-blank characters other than "@",
# so user names with dots and the like are accepted. When no execution hosts
# are listed, the node list and processor count are filled in from the
# requested resources (`Resource_List` `nodect` / `ncpus`).
#
# @param v [Hash] job attributes keyed by symbol, with nested hashes for
#   dotted attributes such as `:Resource_List`
# @return [Info] information describing the job
def parse_job_info(v)
  # both locals stay nil when Job_Owner is missing or has no "@"
  /^(?<job_owner>[^@\s]+)@(?<submit_host>.+)$/ =~ v[:Job_Owner]

  resource_list  = v.fetch(:Resource_List, {})
  resources_used = v.fetch(:resources_used, {})

  allocated_nodes = parse_nodes(v[:exec_host] || "")
  procs = allocated_nodes.inject(0) { |sum, x| sum + x[:procs] }
  if allocated_nodes.empty? # fill in with requested resources
    allocated_nodes = [ { name: nil } ] * resource_list[:nodect].to_i
    procs = resource_list[:ncpus].to_i
  end

  Info.new(
    id: v[:job_id],
    status: get_state(v[:job_state]),
    allocated_nodes: allocated_nodes,
    submit_host: submit_host,
    job_name: v[:Job_Name],
    job_owner: job_owner,
    accounting_id: v[:Account_Name],
    procs: procs,
    queue_name: v[:queue],
    wallclock_time: duration_in_seconds(resources_used[:walltime]),
    wallclock_limit: duration_in_seconds(resource_list[:walltime]),
    cpu_time: duration_in_seconds(resources_used[:cput]),
    submission_time: v[:ctime] ? Time.parse(v[:ctime]) : nil,
    dispatch_time: v[:stime] ? Time.parse(v[:stime]) : nil,
    native: v
  )
end
421
+ end
422
+ end
423
+ end
424
+ end