ood_core 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,274 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+
3
+ gem "pbs", "~> 2.0"
4
+ require "pbs"
5
+
6
+ module OodCore
7
+ module Job
8
+ class Factory
9
+ using Refinements::HashExtensions
10
+
11
# Build the Torque adapter from a configuration
# @param config [#to_h] the configuration for job adapter
# @option config [#to_s] :host The batch server host
# @option config [#to_s] :lib ('') Path to torque client libraries
# @option config [#to_s] :bin ('') Path to torque client binaries
# @raise [ArgumentError] if no host is specified in the configuration
# @return [Adapters::Torque] adapter wrapping a PBS::Batch built from the
#   configured host, lib and bin values
def self.build_torque(config)
  c = config.to_h.symbolize_keys
  # host is mandatory; lib and bin fall back to empty strings
  host = c.fetch(:host) { raise ArgumentError, "No host specified. Missing argument: host" }.to_s
  lib = c.fetch(:lib, "").to_s
  bin = c.fetch(:bin, "").to_s
  pbs = PBS::Batch.new(host: host, lib: lib, bin: bin)
  Adapters::Torque.new(pbs: pbs)
end
24
+ end
25
+
26
+ module Adapters
27
+ # An adapter object that describes the communication with a Torque resource
28
+ # manager for job management.
29
+ class Torque < Adapter
30
+ using Refinements::HashExtensions
31
+
32
# Mapping of PBS job state characters to the job state symbols used by this
# adapter. Frozen so the shared lookup table cannot be modified at runtime.
STATE_MAP = {
  'Q' => :queued,
  'H' => :queued_held,
  'T' => :queued_held,    # transiting, most like a held job
  'R' => :running,
  'S' => :suspended,
  'E' => :running,        # exiting, but still running
  'C' => :completed
}.freeze
42
+
43
# @api private
# @param opts [#to_h] the options defining this adapter
# @option opts [PBS::Batch] :pbs The PBS batch object
# @raise [ArgumentError] if no pbs object is given
# @see Factory.build_torque
def initialize(opts = {})
  o = opts.to_h.symbolize_keys

  # The batch object is mandatory; every other method talks to it
  @pbs = o.fetch(:pbs) { raise ArgumentError, "No pbs object specified. Missing argument: pbs" }
end
52
+
53
# Submit a job with the attributes defined in the job template instance
# @param script [Script] script object that describes the
#   script and attributes for the submitted job
# @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
#   at any point after dependent jobs have started execution
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with no errors
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with errors
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
#   execution after dependent jobs have terminated
# @raise [JobAdapterError] if something goes wrong submitting a job
# @return [String] the job id returned after successfully submitting a job
# @see Adapter#submit
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  # Accept a single id or a list of ids for every dependency kind
  after      = Array(after).map(&:to_s)
  afterok    = Array(afterok).map(&:to_s)
  afternotok = Array(afternotok).map(&:to_s)
  afterany   = Array(afterany).map(&:to_s)

  # Set headers (only attributes the script actually defines are sent)
  headers = {}
  headers[:job_arguments] = script.args.join(' ') unless script.args.nil?
  headers[:Hold_Types] = :u if script.submit_as_hold
  headers[:Rerunable] = (script.rerunnable ? 'y' : 'n') unless script.rerunnable.nil?
  headers[:init_work_dir] = script.workdir unless script.workdir.nil?
  headers[:Mail_Users] = script.email.join(',') unless script.email.nil?
  mail_points = ''
  mail_points += 'b' if script.email_on_started
  mail_points += 'e' if script.email_on_terminated
  headers[:Mail_Points] = mail_points unless mail_points.empty?
  headers[:Job_Name] = script.job_name unless script.job_name.nil?
  # ignore input_path (not defined in Torque)
  headers[:Output_Path] = script.output_path unless script.output_path.nil?
  headers[:Error_Path] = script.error_path unless script.error_path.nil?
  headers[:Join_Path] = 'oe' if script.join_files
  headers[:reservation_id] = script.reservation_id unless script.reservation_id.nil?
  headers[:Priority] = script.priority unless script.priority.nil?
  # Start time is sent in local time, formatted as CCYYMMDDhhmm.SS
  headers[:Execution_Time] = script.start_time.localtime.strftime("%C%y%m%d%H%M.%S") unless script.start_time.nil?
  headers[:Account_Name] = script.accounting_id unless script.accounting_id.nil?

  # Set dependencies, e.g. "afterok:1:2,afterany:3"
  depend = []
  depend << "after:#{after.join(':')}" unless after.empty?
  depend << "afterok:#{afterok.join(':')}" unless afterok.empty?
  depend << "afternotok:#{afternotok.join(':')}" unless afternotok.empty?
  depend << "afterany:#{afterany.join(':')}" unless afterany.empty?
  headers[:depend] = depend.join(',') unless depend.empty?

  # Set resources
  resources = {}
  resources[:mem] = "#{script.min_phys_memory}KB" unless script.min_phys_memory.nil?
  resources[:walltime] = seconds_to_duration(script.wall_time) unless script.wall_time.nil?
  if script.nodes && !script.nodes.empty?
    # Reduce an array to unique objects with count
    # ["a", "a", "b"] #=> {"a" => 2, "b" => 1}
    nodes = script.nodes.group_by { |v| v }.each_with_object({}) { |(k, v), h| h[k] = v.size }
    # NodeRequest objects are rendered with their count; anything else is used verbatim
    resources[:nodes] = nodes.map { |k, v| k.is_a?(NodeRequest) ? node_request_to_str(k, v) : k }.join('+')
  end

  # Set environment variables. Work on a copy so that merging the native
  # envvars below never modifies the hash owned by the script object.
  envvars = (script.job_environment || {}).dup

  # Set native options (these override the generic values computed above)
  if script.native
    headers.merge! script.native.fetch(:headers, {})
    resources.merge! script.native.fetch(:resources, {})
    envvars.merge! script.native.fetch(:envvars, {})
  end

  # Submit job
  @pbs.submit_string(script.content, queue: script.queue_name, headers: headers, resources: resources, envvars: envvars)
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
128
+
129
# Retrieve info for all jobs from the resource manager
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
# @see Adapter#info_all
def info_all
  # Each entry yielded by get_jobs is a (job id, attributes) pair
  @pbs.get_jobs.map { |job_id, attrs| parse_job_info(job_id, attrs) }
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
140
+
141
# Retrieve job info from the resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Info] information describing submitted job
# @see Adapter#info
def info(id)
  id = id.to_s
  # Flattening the reply and splatting it supplies the two arguments of
  # parse_job_info; presumably the reply is a single-entry hash of
  # job id => attributes (confirm against PBS::Batch#get_job)
  parse_job_info(*@pbs.get_job(id).flatten)
rescue PBS::UnkjobidError
  # set completed status if can't find job id
  Info.new(
    id: id,
    status: :completed
  )
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
158
+
159
# Retrieve job status from resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job status
# @return [Status] status of job
# @see Adapter#status
def status(id)
  id = id.to_s
  # Only the job_state attribute is requested. A reply without an entry for
  # this id is reported as :undetermined rather than failing on nil.
  job = @pbs.get_job(id, filters: [:job_state])[id] || {}
  Status.new(state: STATE_MAP.fetch(job[:job_state], :undetermined))
rescue PBS::UnkjobidError
  # set completed status if can't find job id
  Status.new(state: :completed)
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
174
+
175
# Put the submitted job on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong holding a job
# @return [void]
# @see Adapter#hold
def hold(id)
  @pbs.hold_job(id.to_s)
rescue PBS::UnkjobidError
  # assume successful job hold if can't find job id
  nil
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
188
+
189
# Release the job that is on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong releasing a job
# @return [void]
# @see Adapter#release
def release(id)
  @pbs.release_job(id.to_s)
rescue PBS::UnkjobidError
  # assume successful job release if can't find job id
  nil
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
202
+
203
# Delete the submitted job
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong deleting a job
# @return [void]
# @see Adapter#delete
def delete(id)
  @pbs.delete_job(id.to_s)
rescue PBS::UnkjobidError, PBS::BadstateError
  # assume successful job deletion if can't find job id
  # assume successful job deletion if job is exiting or completed
  nil
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
217
+
218
+ private
219
# Convert a colon-separated duration string (e.g. "01:02:03") to seconds
# @param time [String, nil] the duration; nil counts as zero
# @return [Integer] number of seconds
def duration_in_seconds(time)
  return 0 if time.nil?

  # Fold the fields left to right, each step promoting the running total by
  # a factor of 60, so "HH:MM:SS", "MM:SS" and "SS" are all handled
  time.split(':').map(&:to_i).inject(0) { |total, part| total * 60 + part }
end
223
+
224
# Convert a number of seconds to an "HH:MM:SS" duration string
# @param time [Integer] number of seconds
# @return [String] zero-padded duration; the hours field grows past two
#   digits when needed
def seconds_to_duration(time)
  format('%02d:%02d:%02d', time / 3600, time / 60 % 60, time % 60)
end
228
+
229
# Convert host list string to individual nodes
#   "n0163/2,7,10-11+n0205/0-11+n0156/0-11"
# @param node_list [String] '+'-separated entries of the form "name/slots",
#   where slots is a comma-separated list of indices and "first-last" ranges
# @return [Array<Hash>] one `{name:, procs:}` hash per entry
def parse_nodes(node_list)
  node_list.split('+').map do |entry|
    name, procs_list = entry.split('/')
    # count procs used in range expression; an entry without a "/slots"
    # part lists no procs and is counted as 0 instead of failing on nil
    procs = procs_list.to_s.split(',').inject(0) do |sum, slots|
      range = /\A(\d+)-(\d+)\z/.match(slots)
      # a range "a-b" covers b - a + 1 procs, a single index covers 1
      sum + (range ? range[2].to_i - range[1].to_i : 0) + 1
    end
    { name: name, procs: procs }
  end
end
241
+
242
# Convert a NodeRequest object to a valid Torque string
# @param node [#procs, #properties] the node request
# @param cnt [#to_s] how many nodes of this kind are requested
# @return [String] colon-separated spec, e.g. "2:ppn=4:gpu"
def node_request_to_str(node, cnt)
  parts = [cnt.to_s]
  parts << "ppn=#{node.procs}" if node.procs
  # Array() turns nil into []; an empty property list must not add a
  # dangling ':' to the end of the spec
  parts.concat(Array(node.properties))
  parts.join(':')
end
249
+
250
# Parse hash describing PBS job status
# @param k [#to_s] the job id
# @param v [Hash] the job attributes keyed by symbol
# @return [Info] generic job information built from the attributes
def parse_job_info(k, v)
  # A regexp literal with a named group on the left of =~ assigns the local
  # variable job_owner (nil when the owner is missing or does not match)
  /^(?<job_owner>[\w-]+)@/ =~ v[:Job_Owner]
  allocated_nodes = parse_nodes(v[:exec_host] || "")
  resources_used = v.fetch(:resources_used, {})
  Info.new(
    id: k,
    status: STATE_MAP.fetch(v[:job_state], :undetermined),
    allocated_nodes: allocated_nodes,
    submit_host: v[:submit_host],
    job_name: v[:Job_Name],
    job_owner: job_owner,
    accounting_id: v[:Account_Name],
    # total procs is the sum over all allocated nodes
    procs: allocated_nodes.inject(0) { |sum, node| sum + node[:procs] },
    queue_name: v[:queue],
    wallclock_time: duration_in_seconds(resources_used[:walltime]),
    cpu_time: duration_in_seconds(resources_used[:cput]),
    submission_time: v[:ctime],
    dispatch_time: v[:start_time],
    native: v
  )
end
271
+ end
272
+ end
273
+ end
274
+ end
@@ -0,0 +1,41 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+
3
+ module OodCore
4
+ module Job
5
+ # A factory that builds job adapter objects from a configuration.
6
+ class Factory
7
+ using Refinements::HashExtensions
8
+
9
+ class << self
10
# Build a job adapter from a configuration
# @param config [#to_h] configuration describing job adapter
# @option config [#to_s] :adapter The job adapter to use
# @raise [AdapterNotSpecified] if no adapter is specified
# @raise [AdapterNotFound] if the specified adapter does not exist
# @raise [Gem::LoadError] if a gem needed by the adapter is not loaded
# @raise [LoadError] if the adapter file cannot be loaded
# @return [Adapter] the job adapter object
def build(config)
  c = config.to_h.symbolize_keys

  adapter = c.fetch(:adapter) { raise AdapterNotSpecified, "job configuration does not specify adapter" }.to_s

  # Loading the adapter file is what is expected to define the matching
  # build_<adapter> method checked for below
  path_to_adapter = "ood_core/job/adapters/#{adapter}"
  begin
    require path_to_adapter
  rescue Gem::LoadError
    # Gem::LoadError is a LoadError, so it has to be rescued first
    raise Gem::LoadError, "Specified '#{adapter}' for job adapter, but the gem is not loaded."
  rescue LoadError
    raise LoadError, "Could not load '#{adapter}'. Make sure that the job adapter in the configuration file is valid."
  end

  adapter_method = "build_#{adapter}"

  unless respond_to?(adapter_method)
    raise AdapterNotFound, "job configuration specifies nonexistent #{adapter} adapter"
  end

  # The symbolized configuration is handed to the adapter builder as is
  send(adapter_method, c)
end
38
+ end
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,141 @@
1
+ require 'time'
2
+
3
+ module OodCore
4
+ module Job
5
# An object that describes a submitted job
class Info
  # The identifier of the job
  # @return [String] job id
  attr_reader :id

  # The status of the job
  # @return [Status] job state
  attr_reader :status

  # Set of machines that is utilized for job execution
  # @return [Array<NodeInfo>] allocated nodes
  attr_reader :allocated_nodes

  # Name of the submission host for this job
  # @return [String, nil] submit host
  attr_reader :submit_host

  # Name of the job
  # @return [String, nil] job name
  attr_reader :job_name

  # Owner of job
  # @return [String, nil] job owner
  attr_reader :job_owner

  # The account the job is charged against
  # @return [String, nil] accounting id
  attr_reader :accounting_id

  # Number of procs allocated for job
  # @return [Integer, nil] allocated total number of procs
  attr_reader :procs

  # Name of the queue in which the job was queued or started
  # @return [String, nil] queue name
  attr_reader :queue_name

  # The accumulated wall clock time in seconds
  # @return [Integer, nil] wallclock time
  attr_reader :wallclock_time

  # The accumulated CPU time in seconds
  # @return [Integer, nil] cpu time
  attr_reader :cpu_time

  # The time at which the job was submitted
  # @return [Time, nil] submission time
  attr_reader :submission_time

  # The time the job first entered a "Started" state
  # @return [Time, nil] dispatch time
  attr_reader :dispatch_time

  # Native resource manager output for job info
  # @note Should not be used by generic apps
  # @return [Object] native info
  attr_reader :native

  # Optional values that are nil stay nil; everything else is coerced to
  # the documented type. Unknown keyword arguments are ignored.
  # @param id [#to_s] job id
  # @param status [#to_sym] job state
  # @param allocated_nodes [Array<#to_h>] allocated nodes
  # @param submit_host [#to_s, nil] submit host
  # @param job_name [#to_s, nil] job name
  # @param job_owner [#to_s, nil] job owner
  # @param accounting_id [#to_s, nil] accounting id
  # @param procs [#to_i, nil] allocated total number of procs
  # @param queue_name [#to_s, nil] queue name
  # @param wallclock_time [#to_i, nil] wallclock time
  # @param cpu_time [#to_i, nil] cpu time
  # @param submission_time [#to_i, nil] submission time (seconds since the Epoch)
  # @param dispatch_time [#to_i, nil] dispatch time (seconds since the Epoch)
  # @param native [Object] native info
  def initialize(id:, status:, allocated_nodes: [], submit_host: nil,
                 job_name: nil, job_owner: nil, accounting_id: nil,
                 procs: nil, queue_name: nil, wallclock_time: nil,
                 cpu_time: nil, submission_time: nil, dispatch_time: nil,
                 native: nil, **_)
    @id              = id.to_s
    @status          = Status.new(state: status.to_sym)
    @allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(n.to_h) }
    @submit_host     = submit_host     && submit_host.to_s
    @job_name        = job_name        && job_name.to_s
    @job_owner       = job_owner       && job_owner.to_s
    @accounting_id   = accounting_id   && accounting_id.to_s
    @procs           = procs           && procs.to_i
    @queue_name      = queue_name      && queue_name.to_s
    @wallclock_time  = wallclock_time  && wallclock_time.to_i
    @cpu_time        = cpu_time        && cpu_time.to_i
    @submission_time = submission_time && Time.at(submission_time.to_i)
    @dispatch_time   = dispatch_time   && Time.at(dispatch_time.to_i)
    @native          = native
  end

  # Convert object to hash
  # @return [Hash] object as hash
  def to_h
    {
      id:              id,
      status:          status,
      allocated_nodes: allocated_nodes,
      submit_host:     submit_host,
      job_name:        job_name,
      job_owner:       job_owner,
      accounting_id:   accounting_id,
      procs:           procs,
      queue_name:      queue_name,
      wallclock_time:  wallclock_time,
      cpu_time:        cpu_time,
      submission_time: submission_time,
      dispatch_time:   dispatch_time,
      native:          native
    }
  end

  # The comparison operator
  # @param other [#to_h] object to compare against
  # @return [Boolean] whether objects are equivalent; false when the other
  #   object cannot be converted to a hash
  def ==(other)
    other.respond_to?(:to_h) && to_h == other.to_h
  end

  # Whether objects are identical to each other
  # @param other [#to_h] object to compare against
  # @return [Boolean] whether objects are identical
  def eql?(other)
    self.class == other.class && self == other
  end

  # Generate a hash value for this object
  # @return [Integer] hash value of object
  def hash
    [self.class, to_h].hash
  end
end
140
+ end
141
+ end