ood_core 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -33,8 +33,12 @@ module OodCore
33
33
  # @option cluster [#to_h] :metadata ({}) The cluster's metadata
34
34
  # @option cluster [#to_h] :login ({}) The cluster's SSH host
35
35
  # @option cluster [#to_h] :job ({}) The job adapter for this cluster
36
- # @option cluster [#to_h] :custom ({}) Any custom resources for this cluster
37
- # @option cluster [Array<#to_h>] :acls ([]) List of ACLs to validate against
36
+ # @option cluster [#to_h] :custom ({}) Any custom resources for this
37
+ # cluster
38
+ # @option cluster [Array<#to_h>] :acls ([]) List of ACLs to validate
39
+ # against
40
+ # @option cluster [#to_h] :batch_connect ({}) Configuration for batch
41
+ # connect templates
38
42
  def initialize(cluster)
39
43
  c = cluster.to_h.symbolize_keys
40
44
 
@@ -47,6 +51,7 @@ module OodCore
47
51
  @job_config = c.fetch(:job, {}) .to_h.symbolize_keys
48
52
  @custom_config = c.fetch(:custom, {}) .to_h.symbolize_keys
49
53
  @acls_config = c.fetch(:acls, []) .map(&:to_h)
54
+ @batch_connect_config = c.fetch(:batch_connect, {}).to_h.symbolize_keys
50
55
  end
51
56
 
52
57
  # Metadata that provides extra information about this cluster
@@ -81,6 +86,25 @@ module OodCore
81
86
  build_acls(job_config.fetch(:acls, []).map(&:to_h)).all?(&:allow?)
82
87
  end
83
88
 
89
+ # The batch connect template configuration used for this cluster
90
+ # @param template [#to_sym, nil] the template type
91
+ # @return [Hash] the batch connect configuration
92
+ def batch_connect_config(template = nil)
93
+ if template
94
+ @batch_connect_config.fetch(template.to_sym, {}).to_h.symbolize_keys.merge(template: template.to_sym)
95
+ else
96
+ @batch_connect_config
97
+ end
98
+ end
99
+
100
    # Build a batch connect template from the respective configuration
    # @param context [#to_h] the context used for rendering the template;
    #   the :template key selects which template configuration to use
    #   (defaults to :basic), and the remaining keys override that config
    # @return [BatchConnect::Template] the batch connect template
    # NOTE(review): BatchConnect::Factory presumably raises
    # TemplateNotSpecified/TemplateNotFound for bad configs — confirm
    def batch_connect_template(context = {})
      context = context.to_h.symbolize_keys
      BatchConnect::Factory.build batch_connect_config(context[:template] || :basic).merge(context)
    end
107
+
84
108
  # The configuration for any custom features or resources for this cluster
85
109
  # @param feature [#to_sym, nil] the feature or resource
86
110
  # @return [Hash] configuration for custom feature or resource
@@ -125,12 +149,13 @@ module OodCore
125
149
    # Convert this object to a plain hash of its configuration, suitable for
    # round-tripping back through {#initialize}
    # @return [Hash] the hash describing this object
    def to_h
      {
        id:            id,
        metadata:      metadata_config,
        login:         login_config,
        job:           job_config,
        custom:        custom_config,
        acls:          acls_config,
        batch_connect: batch_connect_config
      }
    end
136
161
 
@@ -16,4 +16,10 @@ module OodCore
16
16
 
17
17
  # Raised when a job state is set to an invalid option
18
18
  class UnknownStateAttribute < Error; end
19
+
20
+ # Raised when template not specified in configuration
21
+ class TemplateNotSpecified < Error; end
22
+
23
+ # Raised when cannot find template specified in configuration
24
+ class TemplateNotFound < Error; end
19
25
  end
@@ -4,6 +4,8 @@ module OodCore
4
4
  # submitting/statusing/holding/deleting jobs
5
5
  # @abstract
6
6
  class Adapter
7
+ using Refinements::ArrayExtensions
8
+
7
9
  # Submit a job with the attributes defined in the job template instance
8
10
  # @abstract Subclass is expected to implement {#submit}
9
11
  # @raise [NotImplementedError] if subclass did not define {#submit}
@@ -39,6 +41,15 @@ module OodCore
39
41
  raise NotImplementedError, "subclass did not define #info_all"
40
42
  end
41
43
 
44
      # Retrieve info for all jobs for a given owner or owners from the
      # resource manager
      # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
      # @return [Array<Info>] information describing submitted jobs
      def info_where_owner(owner)
        # Array.wrap comes from Refinements::ArrayExtensions (`using` above)
        owner = Array.wrap(owner).map(&:to_s)
        # Default implementation filters client-side over #info_all; an
        # adapter may override this with a server-side query (PBSPro does)
        info_all.select { |info| owner.include? info.job_owner }
      end
52
+
42
53
  # Retrieve job info from the resource manager
43
54
  # @abstract Subclass is expected to implement {#info}
44
55
  # @raise [NotImplementedError] if subclass did not define {#info}
@@ -72,24 +72,9 @@ module OodCore
72
72
  afternotok = Array(afternotok).map(&:to_s)
73
73
  afterany = Array(afterany).map(&:to_s)
74
74
 
75
- args = []
76
- args += ["-P", script.accounting_id] unless script.accounting_id.nil?
77
- args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
78
- args += ["-J", script.job_name] unless script.job_name.nil?
79
-
80
- # TODO: dependencies
81
-
82
- env = {
83
- #TODO:
84
- #LSB_HOSTS?
85
- #LSB_MCPU_HOSTS?
86
- #SNDJOBS_TO?
87
- #
88
- }
89
-
90
- # Submit job
91
- batch.submit_string(script.content, args: args, env: env)
75
+ kwargs = helper.batch_submit_args(script, after: after, afterok: afterok, afternotok: afternotok, afterany: afterany)
92
76
 
77
+ batch.submit_string(script.content, **kwargs)
93
78
  rescue Batch::Error => e
94
79
  raise JobAdapterError, e.message
95
80
  end
@@ -170,20 +155,29 @@ module OodCore
170
155
  end
171
156
 
172
157
    # Build an Info object from a single parsed bjobs record
    # @param v [Hash] field/value pairs for one job as parsed from bjobs
    # @return [Info] information describing the submitted job
    def info_for_batch_hash(v)
      # EXEC_HOST entries become one NodeInfo per host with its slot count
      nodes = helper.parse_exec_host(v[:exec_host]).map do |host|
        NodeInfo.new(name: host[:host], procs: host[:slots])
      end

      # FIXME: estimated_runtime should be set by the batch object instead of
      # being derived here from the parsed start/finish times
      dispatch_time = helper.parse_past_time(v[:start_time], ignore_errors: true)
      finish_time = helper.parse_past_time(v[:finish_time], ignore_errors: true)

      Info.new(
        id: v[:id],
        status: get_state(v[:status]),
        allocated_nodes: nodes,
        submit_host: v[:from_host],
        job_name: v[:name],
        job_owner: v[:user],
        accounting_id: v[:project],
        # total procs across all allocated nodes; 0 when none allocated yet
        procs: nodes.any? ? nodes.map(&:procs).reduce(&:+) : 0,
        queue_name: v[:queue],
        # accurate only if the job was never suspended (see Helper#estimate_runtime)
        wallclock_time: helper.estimate_runtime(current_time: Time.now, start_time: dispatch_time, finish_time: finish_time),
        cpu_time: helper.parse_cpu_used(v[:cpu_used]),
        submission_time: helper.parse_past_time(v[:submit_time], ignore_errors: true),
        dispatch_time: dispatch_time,
        native: v
      )
    end
@@ -49,21 +49,24 @@ class OodCore::Job::Adapters::Lsf::Batch
49
49
  %w( -u all -a -w -W )
50
50
  end
51
51
 
52
- # status fields available from bjobs
53
- def fields
54
- %i(id user status queue from_host exec_host name submit_time
55
- project cpu_used mem swap pids start_time finish_time)
56
- end
52
+ # status fields available from bjobs
53
+ def fields
54
+ %i(id user status queue from_host exec_host name submit_time
55
+ project cpu_used mem swap pids start_time finish_time)
56
+ end
57
57
 
58
58
  # helper method
59
59
  def parse_bjobs_output(response)
60
60
  return [] if response =~ /No job found/ || response.nil?
61
61
 
62
62
  lines = response.split("\n")
63
- validate_bjobs_output_columns(lines.first.split)
63
+ columns = lines.shift.split
64
+
65
+ validate_bjobs_output_columns(columns)
66
+ jobname_column_idx = columns.find_index("JOB_NAME")
64
67
 
65
- lines.drop(1).map{ |job|
66
- values = split_bjobs_output_line(job)
68
+ lines.map{ |job|
69
+ values = split_bjobs_output_line(job, num_columns: columns.count, jobname_column_idx: jobname_column_idx)
67
70
 
68
71
  # make a hash of { field: "value", etc. }
69
72
  Hash[fields.zip(values)].each_with_object({}) { |(k,v),o|
@@ -135,13 +138,21 @@ class OodCore::Job::Adapters::Lsf::Batch
135
138
  end
136
139
 
137
140
  # split a line of output from bjobs into field values
138
- def split_bjobs_output_line(line)
141
+ def split_bjobs_output_line(line, num_columns:, jobname_column_idx:)
139
142
  values = line.strip.split
140
143
 
141
- if(values.count > 15)
142
- # FIXME: hack assumes 15 fields & only job name may have spaces
143
- # collapse >15 fields into 15, assumes 7th field is JOB_NAME
144
- values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
144
+ if(values.count > num_columns)
145
+ # if the line has more fields than the number of columns, that means one
146
+ # field value has spaces, so it was erroneously split into
147
+ # multiple fields; we assume that is the jobname field, and we will
148
+ # collapse the fields into a single field
149
+ #
150
+ # FIXME: assumes jobname_column_idx is not first or last item
151
+ j = jobname_column_idx
152
+
153
+ # e.g. if 15 fields and jobname is 7th field
154
+ # values = values[0..5] + [values[6..-9].join(" ")] + values[-8..-1]
155
+ values = values[0..(j-1)] + [values[j..(j-num_columns)].join(" ")] + values[(j+1-num_columns)..-1]
145
156
  end
146
157
 
147
158
  values
@@ -151,9 +162,11 @@ class OodCore::Job::Adapters::Lsf::Batch
151
162
  def validate_bjobs_output_columns(columns)
152
163
  expected = %w(JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME
153
164
  SUBMIT_TIME PROJ_NAME CPU_USED MEM SWAP PIDS START_TIME FINISH_TIME)
154
- if columns != expected
165
+ # (expected & columns) will return the columns that are the same
166
+ # so if there are extra columns we can just ignore those (like SLOTS in LSF 9.1)
167
+ if columns && ((expected & columns) != expected)
155
168
  raise Error, "bjobs output in different format than expected: " \
156
- "#{columns.inspect} instead of #{expected.inspect}"
169
+ "#{columns.inspect} did not include all columns: #{expected.inspect}"
157
170
  end
158
171
  end
159
172
 
@@ -23,4 +23,83 @@ class OodCore::Job::Adapters::Lsf::Helper
23
23
 
24
24
  nil
25
25
  end
26
+
27
+ # convert exec_host string format from bjobs to a hash
28
+ # i.e. "c012" => [{host: "c012", slots: 1}]
29
+ # i.e. "4*c012:8*c013" => [{host: "c012", slots: 4}, {host: "c013", slots: 8}]
30
+ def parse_exec_host(exec_host_str)
31
+ return [] if exec_host_str.nil? || exec_host_str.empty?
32
+
33
+ exec_host_str.scan(exec_host_regex).map do |match|
34
+ {host: match[2], slots: match[1] ? match[1].to_i : 1}
35
+ end
36
+ end
37
+
38
+ def exec_host_regex
39
+ @exec_host_regex ||= Regexp.new(/((\d+)\*)?([^:]+)/)
40
+ end
41
+
42
+ # given current time, dispatch time, and finish time values, estimate the
43
+ # runtime for a job; this estimate will be accurate if the job never enters a
44
+ # suspended state during its execution
45
+ def estimate_runtime(current_time:, start_time:, finish_time:)
46
+ return nil if start_time.nil?
47
+
48
+ (finish_time || current_time) - start_time
49
+ end
50
+
51
+ # Convert CPU_USED string to seconds
52
+ #
53
+ # example strings of cpu_used in LSF 8.3:
54
+ #
55
+ # 060:24:00.00
56
+ # 046:19:37.00
57
+ # 1118:59:09.00
58
+ # 000:00:00.00
59
+ # 000:48:18.39
60
+ # 003:11:36.67
61
+ # 003:24:40.95
62
+ # 50769:48:00.-48
63
+ # 50835:48:48.-48
64
+ #
65
+ # my guess is: hours:minutes:seconds.????
66
+ #
67
+ # @return [Fixnum, nil] cpu used as seconds
68
+ def parse_cpu_used(cpu_used)
69
+ if cpu_used =~ /^(\d+):(\d+):(\d+)\..*$/
70
+ $1.to_i*3600 + $2.to_i*60 + $3.to_i
71
+ end
72
+ end
73
+
74
  # Build the argument list and environment used to submit a job via `bsub`
  # @param script [Script] the script object describing the job
  # @param after [Array<#to_s>] dependency job ids (started)
  # @param afterok [Array<#to_s>] dependency job ids (exited ok)
  # @param afternotok [Array<#to_s>] dependency job ids (exited with error)
  # @param afterany [Array<#to_s>] dependency job ids (terminated)
  # @return [Hash] :args array for the bsub command and :env hash
  # NOTE(review): the four dependency params are accepted but never used in
  # this body — dependencies are not translated to bsub options yet; confirm
  def batch_submit_args(script, after: [], afterok: [], afternotok: [], afterany: [])
    args = []

    args += ["-P", script.accounting_id] unless script.accounting_id.nil?
    args += ["-cwd", script.workdir.to_s] unless script.workdir.nil?
    args += ["-J", script.job_name] unless script.job_name.nil?
    args += ["-q", script.queue_name] unless script.queue_name.nil?
    args += ["-U", script.reservation_id] unless script.reservation_id.nil?
    args += ["-sp", script.priority] unless script.priority.nil?
    args += ["-H"] if script.submit_as_hold
    args += (script.rerunnable ? ["-r"] : ["-rn"]) unless script.rerunnable.nil?
    args += ["-b", script.start_time.localtime.strftime("%Y:%m:%d:%H:%M")] unless script.start_time.nil?
    # NOTE(review): wall_time is passed to -W in minutes as an Integer, not a
    # String like the other values — confirm the caller stringifies args
    args += ["-W", (script.wall_time / 60).to_i] unless script.wall_time.nil?

    # input and output files
    args += ["-i", script.input_path] unless script.input_path.nil?
    args += ["-o", script.output_path] unless script.output_path.nil?
    args += ["-e", script.error_path] unless script.error_path.nil?

    # email notifications
    args += ["-B"] if script.email_on_started
    args += ["-N"] if script.email_on_terminated
    args += ["-u", script.email.join(",")] unless script.email.nil? || script.email.empty?

    # raw scheduler-specific options are appended last
    args += script.native unless script.native.nil?

    # environment variables passed through to the job
    env = script.job_environment || {}

    {args: args, env: env}
  end
26
105
  end
@@ -0,0 +1,424 @@
1
+ require "time"
2
+ require "ood_core/refinements/hash_extensions"
3
+
4
+ module OodCore
5
+ module Job
6
+ class Factory
7
+ using Refinements::HashExtensions
8
+
9
+ # Build the PBS Pro adapter from a configuration
10
+ # @param config [#to_h] the configuration for job adapter
11
+ # @option config [Object] :host (nil) The batch server host
12
+ # @option config [Object] :exec (nil) Path to PBS Pro executables
13
+ # @option config [Object] :qstat_factor (nil) Deciding factor on how to
14
+ # call qstat for a user
15
+ def self.build_pbspro(config)
16
+ c = config.to_h.compact.symbolize_keys
17
+ host = c.fetch(:host, nil)
18
+ exec = c.fetch(:exec, nil)
19
+ qstat_factor = c.fetch(:qstat_factor, nil)
20
+ pbspro = Adapters::PBSPro::Batch.new(host: host, exec: exec)
21
+ Adapters::PBSPro.new(pbspro: pbspro, qstat_factor: qstat_factor)
22
+ end
23
+ end
24
+
25
+ module Adapters
26
+ # An adapter object that describes the communication with a PBS Pro
27
+ # resource manager for job management.
28
+ class PBSPro < Adapter
29
+ using Refinements::ArrayExtensions
30
+ using Refinements::HashExtensions
31
+
32
+ # Object used for simplified communication with a PBS Pro batch server
33
+ # @api private
34
+ class Batch
35
+ # The host of the PBS Pro batch server
36
+ # @example
37
+ # my_batch.host #=> "my_batch.server.edu"
38
+ # @return [String, nil] the batch server host
39
+ attr_reader :host
40
+
41
+ # The path containing the PBS executables
42
+ # @example
43
+ # my_batch.exec.to_s #=> "/usr/local/pbspro/10.0.0
44
+ # @return [Pathname, nil] path to pbs executables
45
+ attr_reader :exec
46
+
47
+ # The root exception class that all PBS Pro-specific exceptions
48
+ # inherit from
49
+ class Error < StandardError; end
50
+
51
+ # @param host [#to_s, nil] the batch server host
52
+ # @param exec [#to_s, nil] path to pbs executables
53
+ def initialize(host: nil, exec: nil)
54
+ @host = host && host.to_s
55
+ @exec = exec && Pathname.new(exec.to_s)
56
+ end
57
+
58
+ # Get a list of hashes detailing each of the jobs on the batch server
59
+ # @example Status info for all jobs
60
+ # my_batch.get_jobs
61
+ # #=>
62
+ # #[
63
+ # # {
64
+ # # :account => "account",
65
+ # # :job_id => "my_job",
66
+ # # ...
67
+ # # },
68
+ # # {
69
+ # # :account => "account",
70
+ # # :job_id => "my_other_job",
71
+ # # ...
72
+ # # },
73
+ # # ...
74
+ # #]
75
+ # @param id [#to_s] the id of the job
76
+ # @raise [Error] if `qstat` command exited unsuccessfully
77
+ # @return [Array<Hash>] list of details for jobs
78
+ def get_jobs(id: "")
79
+ args = ["-f"] # display all information
80
+ args += ["-t"] # list subjobs
81
+ args += [id.to_s] unless id.to_s.empty?
82
+ lines = call("qstat", *args).gsub("\n\t", "").split("\n").map(&:strip)
83
+
84
+ jobs = []
85
+ lines.each do |line|
86
+ if /^Job Id: (?<job_id>.+)$/ =~ line
87
+ jobs << { job_id: job_id }
88
+ elsif /^(?<key>[^\s]+) = (?<value>.+)$/ =~ line
89
+ hsh = jobs.last
90
+ k1, k2 = key.split(".").map(&:to_sym)
91
+ k2 ? ( hsh[k1] ||= {} and hsh[k1][k2] = value ) : ( hsh[k1] = value )
92
+ end
93
+ end
94
+ jobs.reject { |j| /\[\]/ =~ j[:job_id] } # drop main job array jobs
95
+ end
96
+
97
+ # Select batch jobs from the batch server
98
+ # @param args [Array<#to_s>] arguments passed to `qselect` command
99
+ # @raise [Error] if `qselect` command exited unsuccessfully
100
+ # @return [Array<String>] list of job ids that match selection
101
+ # criteria
102
+ def select_jobs(args: [])
103
+ call("qselect", *args).split("\n").map(&:strip)
104
+ end
105
+
106
+ # Put a specified job on hold
107
+ # @example Put job "1234" on hold
108
+ # my_batch.hold_job("1234")
109
+ # @param id [#to_s] the id of the job
110
+ # @raise [Error] if `qhold` command exited unsuccessfully
111
+ # @return [void]
112
+ def hold_job(id)
113
+ call("qhold", id.to_s)
114
+ end
115
+
116
+ # Release a specified job that is on hold
117
+ # @example Release job "1234" from on hold
118
+ # my_batch.release_job("1234")
119
+ # @param id [#to_s] the id of the job
120
+ # @raise [Error] if `qrls` command exited unsuccessfully
121
+ # @return [void]
122
+ def release_job(id)
123
+ call("qrls", id.to_s)
124
+ end
125
+
126
+ # Delete a specified job from batch server
127
+ # @example Delete job "1234"
128
+ # my_batch.delete_job("1234")
129
+ # @param id [#to_s] the id of the job
130
+ # @raise [Error] if `qdel` command exited unsuccessfully
131
+ # @return [void]
132
+ def delete_job(id)
133
+ call("qdel", id.to_s)
134
+ end
135
+
136
+ # Submit a script expanded as a string to the batch server
137
+ # @param str [#to_s] script as a string
138
+ # @param args [Array<#to_s>] arguments passed to `qsub` command
139
+ # @param chdir [#to_s, nil] working directory where `qsub` is called
140
+ # @raise [Error] if `qsub` command exited unsuccessfully
141
+ # @return [String] the id of the job that was created
142
+ def submit_string(str, args: [], chdir: nil)
143
+ call("qsub", *args, stdin: str.to_s, chdir: chdir).strip
144
+ end
145
+
146
+ private
147
+ # Call a forked PBS Pro command for a given batch server
148
+ def call(cmd, *args, env: {}, stdin: "", chdir: nil)
149
+ cmd = cmd.to_s
150
+ cmd = exec.join("bin", cmd).to_s if exec
151
+ args = args.map(&:to_s)
152
+ env = env.to_h.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }
153
+ env["PBS_DEFAULT"] = host.to_s if host
154
+ env["PBS_EXEC"] = exec.to_s if exec
155
+ chdir ||= "."
156
+ o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s, chdir: chdir.to_s)
157
+ s.success? ? o : raise(Error, e)
158
+ end
159
+ end
160
+
161
+ # Mapping of state codes for PBSPro
162
+ STATE_MAP = {
163
+ 'Q' => :queued,
164
+ 'W' => :queued, # job is waiting for its submitter-assigned start time to be reached
165
+ 'H' => :queued_held,
166
+ 'T' => :queued_held, # job is being moved to a new location
167
+ 'M' => :completed, # job was moved to another server
168
+ 'R' => :running,
169
+ 'S' => :suspended,
170
+ 'U' => :suspended, # cycle-harvesting job is suspended due to keyboard activity
171
+ 'E' => :running, # job is exiting after having run
172
+ 'F' => :completed, # job is finished
173
+ 'X' => :completed # subjob has completed execution or has been deleted
174
+ # ignore B as it signifies a job array
175
+ }
176
+
177
+ # What percentage of jobs a user owns out of all jobs, used to decide
178
+ # whether we filter the owner's jobs from a `qstat` of all jobs or call
179
+ # `qstat` on each of the owner's individual jobs
180
+ # @return [Float] ratio of owner's jobs to all jobs
181
+ attr_reader :qstat_factor
182
+
183
+ # @api private
184
+ # @param opts [#to_h] the options defining this adapter
185
+ # @option opts [Batch] :pbspro The PBS Pro batch object
186
+ # @option opts [#to_f] :qstat_factor (0.10) The qstat deciding factor
187
+ # @see Factory.build_pbspro
188
+ def initialize(opts = {})
189
+ o = opts.to_h.compact.symbolize_keys
190
+
191
+ @pbspro = o.fetch(:pbspro) { raise ArgumentError, "No pbspro object specified. Missing argument: pbspro" }
192
+ @qstat_factor = o.fetch(:qstat_factor, 0.10).to_f
193
+ end
194
+
195
+ # Submit a job with the attributes defined in the job template instance
196
+ # @param script [Script] script object that describes the script and
197
+ # attributes for the submitted job
198
+ # @param after [#to_s, Array<#to_s>] this job may be scheduled for
199
+ # execution at any point after dependent jobs have started execution
200
+ # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
201
+ # execution only after dependent jobs have terminated with no errors
202
+ # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
203
+ # execution only after dependent jobs have terminated with errors
204
+ # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
205
+ # execution after dependent jobs have terminated
206
+ # @raise [JobAdapterError] if something goes wrong submitting a job
207
+ # @return [String] the job id returned after successfully submitting a
208
+ # job
209
+ # @see Adapter#submit
210
+ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
211
+ after = Array(after).map(&:to_s)
212
+ afterok = Array(afterok).map(&:to_s)
213
+ afternotok = Array(afternotok).map(&:to_s)
214
+ afterany = Array(afterany).map(&:to_s)
215
+
216
+ # Set qsub options
217
+ args = []
218
+ # ignore args, can't use these if submitting from STDIN
219
+ args += ["-h"] if script.submit_as_hold
220
+ args += ["-r", script.rerunnable ? "y" : "n"] unless script.rerunnable.nil?
221
+ args += ["-M", script.email.join(",")] unless script.email.nil?
222
+ if script.email_on_started && script.email_on_terminated
223
+ args += ["-m", "be"]
224
+ elsif script.email_on_started
225
+ args += ["-m", "b"]
226
+ elsif script.email_on_terminated
227
+ args += ["-m", "e"]
228
+ end
229
+ args += ["-N", script.job_name] unless script.job_name.nil?
230
+ # ignore input_path (not defined in PBS Pro)
231
+ args += ["-o", script.output_path] unless script.output_path.nil?
232
+ args += ["-e", script.error_path] unless script.error_path.nil?
233
+ # Reservations are actually just queues in PBS Pro
234
+ args += ["-q", script.reservation_id] if !script.reservation_id.nil? && script.queue_name.nil?
235
+ args += ["-q", script.queue_name] unless script.queue_name.nil?
236
+ args += ["-p", script.priority] unless script.priority.nil?
237
+ args += ["-a", script.start_time.localtime.strftime("%C%y%m%d%H%M.%S")] unless script.start_time.nil?
238
+ args += ["-A", script.accounting_id] unless script.accounting_id.nil?
239
+ args += ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
240
+
241
+ # Set dependencies
242
+ depend = []
243
+ depend << "after:#{after.join(":")}" unless after.empty?
244
+ depend << "afterok:#{afterok.join(":")}" unless afterok.empty?
245
+ depend << "afternotok:#{afternotok.join(":")}" unless afternotok.empty?
246
+ depend << "afterany:#{afterany.join(":")}" unless afterany.empty?
247
+ args += ["-W", "depend=#{depend.join(",")}"] unless depend.empty?
248
+
249
+ # Set environment variables
250
+ envvars = script.job_environment.to_h
251
+ args += ["-v", envvars.map{|k,v| "#{k}=#{v}"}.join(",")] unless envvars.empty?
252
+
253
+ # If error_path is not specified we join stdout & stderr (as this
254
+ # mimics what the other resource managers do)
255
+ args += ["-j", "oe"] if script.error_path.nil?
256
+
257
+ # Set native options
258
+ args += script.native if script.native
259
+
260
+ # Submit job
261
+ @pbspro.submit_string(script.content, args: args, chdir: script.workdir)
262
+ rescue Batch::Error => e
263
+ raise JobAdapterError, e.message
264
+ end
265
+
266
+ # Retrieve info for all jobs from the resource manager
267
+ # @raise [JobAdapterError] if something goes wrong getting job info
268
+ # @return [Array<Info>] information describing submitted jobs
269
+ # @see Adapter#info_all
270
+ def info_all
271
+ @pbspro.get_jobs.map do |v|
272
+ parse_job_info(v)
273
+ end
274
+ rescue Batch::Error => e
275
+ raise JobAdapterError, e.message
276
+ end
277
+
278
+ # Retrieve info for all jobs for a given owner or owners from the
279
+ # resource manager
280
+ # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
281
+ # @raise [JobAdapterError] if something goes wrong getting job info
282
+ # @return [Array<Info>] information describing submitted jobs
283
+ def info_where_owner(owner)
284
+ owner = Array.wrap(owner).map(&:to_s)
285
+
286
+ usr_jobs = @pbspro.select_jobs(args: ["-u", owner.join(",")])
287
+ all_jobs = @pbspro.select_jobs(args: ["-T"])
288
+
289
+ # `qstat` all jobs if user has too many jobs, otherwise `qstat` each
290
+ # individual job (default factor is 10%)
291
+ if usr_jobs.size > (qstat_factor * all_jobs.size)
292
+ super
293
+ else
294
+ usr_jobs.map { |id| info(id) }
295
+ end
296
+ end
297
+
298
+ # Retrieve job info from the resource manager
299
+ # @param id [#to_s] the id of the job
300
+ # @raise [JobAdapterError] if something goes wrong getting job info
301
+ # @return [Info] information describing submitted job
302
+ # @see Adapter#info
303
+ def info(id)
304
+ id = id.to_s
305
+ @pbspro.get_jobs(id: id).map do |v|
306
+ parse_job_info(v)
307
+ end.first || Info.new(id: id, status: :completed)
308
+ rescue Batch::Error => e
309
+ # set completed status if can't find job id
310
+ if /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
311
+ Info.new(
312
+ id: id,
313
+ status: :completed
314
+ )
315
+ else
316
+ raise JobAdapterError, e.message
317
+ end
318
+ end
319
+
320
+ # Retrieve job status from resource manager
321
+ # @param id [#to_s] the id of the job
322
+ # @raise [JobAdapterError] if something goes wrong getting job status
323
+ # @return [Status] status of job
324
+ # @see Adapter#status
325
+ def status(id)
326
+ info(id.to_s).status
327
+ end
328
+
329
+ # Put the submitted job on hold
330
+ # @param id [#to_s] the id of the job
331
+ # @raise [JobAdapterError] if something goes wrong holding a job
332
+ # @return [void]
333
+ # @see Adapter#hold
334
+ def hold(id)
335
+ @pbspro.hold_job(id.to_s)
336
+ rescue Batch::Error => e
337
+ # assume successful job hold if can't find job id
338
+ raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
339
+ end
340
+
341
+ # Release the job that is on hold
342
+ # @param id [#to_s] the id of the job
343
+ # @raise [JobAdapterError] if something goes wrong releasing a job
344
+ # @return [void]
345
+ # @see Adapter#release
346
+ def release(id)
347
+ @pbspro.release_job(id.to_s)
348
+ rescue Batch::Error => e
349
+ # assume successful job release if can't find job id
350
+ raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
351
+ end
352
+
353
+ # Delete the submitted job
354
+ # @param id [#to_s] the id of the job
355
+ # @raise [JobAdapterError] if something goes wrong deleting a job
356
+ # @return [void]
357
+ # @see Adapter#delete
358
+ def delete(id)
359
+ @pbspro.delete_job(id.to_s)
360
+ rescue Batch::Error => e
361
+ # assume successful job deletion if can't find job id
362
+ raise JobAdapterError, e.message unless /Unknown Job Id/ =~ e.message || /Job has finished/ =~ e.message
363
+ end
364
+
365
+ private
366
+ # Convert duration to seconds
367
+ def duration_in_seconds(time)
368
+ time.nil? ? nil : time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
369
+ end
370
+
371
+ # Convert seconds to duration
372
+ def seconds_to_duration(time)
373
+ "%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
374
+ end
375
+
376
+ # Convert host list string to individual nodes
377
+ # "hosta/J1+hostb/J2*P+..."
378
+ # where J1 and J2 are an index of the job on the named host and P is the number of
379
+ # processors allocated from that host to this job. P does not appear if it is 1.
380
+ # Example: "i5n14/2*7" uses 7 procs on node "i5n14"
381
+ def parse_nodes(node_list)
382
+ node_list.split('+').map do |n|
383
+ name, procs_list = n.split('/')
384
+ procs = (procs_list.split('*')[1] || 1).to_i
385
+ {name: name, procs: procs}
386
+ end
387
+ end
388
+
389
        # Determine adapter state symbol from a PBS Pro state code
        # @param st [String] single-letter PBS Pro job state (see STATE_MAP)
        # @return [Symbol] mapped state, :undetermined for unknown codes
        def get_state(st)
          STATE_MAP.fetch(st, :undetermined)
        end
393
+
394
        # Parse a hash describing PBS Pro job status into an Info object
        # @param v [Hash] attributes for one job as returned by Batch#get_jobs
        # @return [Info] information describing the submitted job
        def parse_job_info(v)
          # Job_Owner is formatted as "user@host"; the named captures become
          # local variables job_owner and submit_host
          /^(?<job_owner>[\w-]+)@(?<submit_host>.+)$/ =~ v[:Job_Owner]
          allocated_nodes = parse_nodes(v[:exec_host] || "")
          procs = allocated_nodes.inject(0) { |sum, x| sum + x[:procs] }
          if allocated_nodes.empty? # fill in with requested resources
            allocated_nodes = [ { name: nil } ] * v.fetch(:Resource_List, {})[:nodect].to_i
            procs = v.fetch(:Resource_List, {})[:ncpus].to_i
          end
          Info.new(
            id: v[:job_id],
            status: get_state(v[:job_state]),
            allocated_nodes: allocated_nodes,
            submit_host: submit_host,
            job_name: v[:Job_Name],
            job_owner: job_owner,
            accounting_id: v[:Account_Name],
            procs: procs,
            queue_name: v[:queue],
            wallclock_time: duration_in_seconds(v.fetch(:resources_used, {})[:walltime]),
            wallclock_limit: duration_in_seconds(v.fetch(:Resource_List, {})[:walltime]),
            cpu_time: duration_in_seconds(v.fetch(:resources_used, {})[:cput]),
            submission_time: v[:ctime] ? Time.parse(v[:ctime]) : nil,
            dispatch_time: v[:stime] ? Time.parse(v[:stime]) : nil,
            native: v
          )
        end
421
+ end
422
+ end
423
+ end
424
+ end