ood_core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+
3
+ gem "pbs", "~> 2.0"
4
+ require "pbs"
5
+
6
+ module OodCore
7
+ module Job
8
+ class Factory
9
+ using Refinements::HashExtensions
10
+
11
# Construct a Torque job adapter out of a configuration object
# @param config [#to_h] the configuration for job adapter
# @option config [#to_s] :host The batch server host
# @option config [#to_s] :lib ('') Path to torque client libraries
# @option config [#to_s] :bin ('') Path to torque client binaries
# @raise [ArgumentError] if the configuration has no host entry
# @return [Adapters::Torque] adapter wrapping a PBS::Batch built from config
def self.build_torque(config)
  settings = config.to_h.symbolize_keys
  unless settings.key?(:host)
    raise ArgumentError, "No host specified. Missing argument: host"
  end

  batch = PBS::Batch.new(
    host: settings[:host].to_s,
    lib:  settings.fetch(:lib, "").to_s,
    bin:  settings.fetch(:bin, "").to_s
  )
  Adapters::Torque.new(pbs: batch)
end
24
+ end
25
+
26
+ module Adapters
27
+ # An adapter object that describes the communication with a Torque resource
28
+ # manager for job management.
29
+ class Torque < Adapter
30
+ using Refinements::HashExtensions
31
+
32
# Mapping of single-character PBS job states to the status symbols used by
# this adapter. Frozen so the shared lookup table cannot be altered at
# runtime.
STATE_MAP = {
  'Q' => :queued,
  'H' => :queued_held,
  'T' => :queued_held,    # transiting, most like a held job
  'R' => :running,
  'S' => :suspended,
  'E' => :running,        # exiting, but still running
  'C' => :completed
}.freeze
42
+
43
# @api private
# @param opts [#to_h] the options defining this adapter
# @option opts [PBS::Batch] :pbs The PBS batch object
# @raise [ArgumentError] if no pbs option is supplied
# @see Factory.build_torque
def initialize(opts = {})
  options = opts.to_h.symbolize_keys
  unless options.key?(:pbs)
    raise ArgumentError, "No pbs object specified. Missing argument: pbs"
  end

  @pbs = options[:pbs]
end
52
+
53
# Submit a job with the attributes defined in the job template instance
# @param script [Script] script object that describes the
#   script and attributes for the submitted job
# @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
#   at any point after dependent jobs have started execution
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with no errors
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with errors
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
#   execution after dependent jobs have terminated
# @raise [JobAdapterError] if something goes wrong submitting a job
# @return [String] the job id returned after successfully submitting a job
# @see Adapter#submit
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  after      = Array(after).map(&:to_s)
  afterok    = Array(afterok).map(&:to_s)
  afternotok = Array(afternotok).map(&:to_s)
  afterany   = Array(afterany).map(&:to_s)

  # Set headers
  headers = {}
  headers.merge!(job_arguments: script.args.join(' ')) unless script.args.nil?
  headers.merge!(Hold_Types: :u) if script.submit_as_hold
  headers.merge!(Rerunable: script.rerunnable ? 'y' : 'n') unless script.rerunnable.nil?
  headers.merge!(init_work_dir: script.workdir) unless script.workdir.nil?
  headers.merge!(Mail_Users: script.email.join(',')) unless script.email.nil?
  mail_points = ''
  mail_points += 'b' if script.email_on_started
  mail_points += 'e' if script.email_on_terminated
  headers.merge!(Mail_Points: mail_points) unless mail_points.empty?
  headers.merge!(Job_Name: script.job_name) unless script.job_name.nil?
  # ignore input_path (not defined in Torque)
  headers.merge!(Output_Path: script.output_path) unless script.output_path.nil?
  headers.merge!(Error_Path: script.error_path) unless script.error_path.nil?
  headers.merge!(Join_Path: 'oe') if script.join_files
  headers.merge!(reservation_id: script.reservation_id) unless script.reservation_id.nil?
  headers.merge!(Priority: script.priority) unless script.priority.nil?
  headers.merge!(Execution_Time: script.start_time.localtime.strftime("%C%y%m%d%H%M.%S")) unless script.start_time.nil?
  headers.merge!(Account_Name: script.accounting_id) unless script.accounting_id.nil?

  # Set dependencies
  depend = []
  depend << "after:#{after.join(':')}"           unless after.empty?
  depend << "afterok:#{afterok.join(':')}"       unless afterok.empty?
  depend << "afternotok:#{afternotok.join(':')}" unless afternotok.empty?
  depend << "afterany:#{afterany.join(':')}"     unless afterany.empty?
  headers.merge!(depend: depend.join(',')) unless depend.empty?

  # Set resources
  resources = {}
  resources.merge!(mem: "#{script.min_phys_memory}KB") unless script.min_phys_memory.nil?
  resources.merge!(walltime: seconds_to_duration(script.wall_time)) unless script.wall_time.nil?
  if script.nodes && !script.nodes.empty?
    # Reduce an array to unique objects with count
    #   ["a", "a", "b"] #=> {"a" => 2, "b" => 1}
    nodes = script.nodes.group_by { |v| v }.each_with_object({}) { |(k, v), h| h[k] = v.size }
    resources.merge!(nodes: nodes.map { |k, v| k.is_a?(NodeRequest) ? node_request_to_str(k, v) : k }.join('+'))
  end

  # Set environment variables
  # Work on a copy: the native envvars merged below must not leak back
  # into the hash owned by the caller's script object
  envvars = (script.job_environment || {}).dup

  # Set native options (these take precedence over the generic attributes)
  if script.native
    headers.merge!   script.native.fetch(:headers, {})
    resources.merge! script.native.fetch(:resources, {})
    envvars.merge!   script.native.fetch(:envvars, {})
  end

  # Submit job
  @pbs.submit_string(script.content, queue: script.queue_name, headers: headers, resources: resources, envvars: envvars)
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
128
+
129
# Fetch information about every job known to the resource manager
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
# @see Adapter#info_all
def info_all
  jobs = @pbs.get_jobs
  jobs.map { |job_id, attributes| parse_job_info(job_id, attributes) }
rescue PBS::Error => err
  raise JobAdapterError, err.message
end
140
+
141
# Look up a single job in the resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Info] information describing submitted job
# @see Adapter#info
def info(id)
  id = id.to_s
  # get_job yields a one-entry hash; flattening it gives [job_id, attributes]
  job_details = @pbs.get_job(id).flatten
  parse_job_info(*job_details)
rescue PBS::UnkjobidError
  # an unknown job id is reported as a completed job
  Info.new(id: id, status: :completed)
rescue PBS::Error => err
  raise JobAdapterError, err.message
end
158
+
159
# Retrieve job status from resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job status
# @return [Status] status of job
# @see Adapter#status
def status(id)
  id = id.to_s
  jobs = @pbs.get_job(id, filters: [:job_state])
  # Prefer the entry keyed by the requested id, but fall back to the first
  # returned entry so that a result keyed under a different spelling of the
  # id (e.g. a fully-qualified one) does not blow up with NoMethodError
  attrs = jobs.fetch(id) { jobs.values.first } || {}
  Status.new(state: STATE_MAP.fetch(attrs[:job_state], :undetermined))
rescue PBS::UnkjobidError
  # set completed status if can't find job id
  Status.new(state: :completed)
rescue PBS::Error => e
  raise JobAdapterError, e.message
end
174
+
175
# Place a hold on a submitted job
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong holding a job
# @return [void]
# @see Adapter#hold
def hold(id)
  job_id = id.to_s
  @pbs.hold_job(job_id)
rescue PBS::UnkjobidError
  # a job id that can't be found is treated as a successful hold
  nil
rescue PBS::Error => err
  raise JobAdapterError, err.message
end
188
+
189
# Lift the hold from a job
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong releasing a job
# @return [void]
# @see Adapter#release
def release(id)
  job_id = id.to_s
  @pbs.release_job(job_id)
rescue PBS::UnkjobidError
  # a job id that can't be found is treated as a successful release
  nil
rescue PBS::Error => err
  raise JobAdapterError, err.message
end
202
+
203
# Remove a submitted job from the resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong deleting a job
# @return [void]
# @see Adapter#delete
def delete(id)
  job_id = id.to_s
  @pbs.delete_job(job_id)
rescue PBS::UnkjobidError, PBS::BadstateError
  # treated as a successful deletion: either the job id can't be found,
  # or the job is already exiting or completed
  nil
rescue PBS::Error => err
  raise JobAdapterError, err.message
end
217
+
218
+ private
219
# Turn a colon-separated duration string (e.g. "HH:MM:SS") into a number of
# seconds; a nil duration counts as 0
def duration_in_seconds(time)
  return 0 if time.nil?

  # each field is worth 60 times the field to its right
  time.split(':').reduce(0) { |seconds, field| seconds * 60 + field.to_i }
end
223
+
224
# Render a number of seconds as an "HH:MM:SS" duration string
def seconds_to_duration(time)
  hours   = time / 3600
  minutes = time / 60 % 60
  seconds = time % 60
  format('%02d:%02d:%02d', hours, minutes, seconds)
end
228
+
229
# Convert host list string to individual nodes
#   "n0163/2,7,10-11+n0205/0-11+n0156/0-11"
# Each "+"-separated entry is "name/procs", where procs is a comma-separated
# list of proc indices and inclusive "first-last" ranges. An entry that
# carries no proc list is reported with 0 procs instead of raising.
# @param node_list [String] host list string
# @return [Array<Hash>] one {name:, procs:} hash per entry
def parse_nodes(node_list)
  node_list.split('+').map do |entry|
    name, procs_list = entry.split('/')
    # count procs used in range expression
    procs = procs_list.to_s.split(',').inject(0) do |sum, item|
      range = /\A(\d+)-(\d+)\z/.match(item)
      # a range contributes its full width, a single index contributes 1
      sum + (range ? range[2].to_i - range[1].to_i : 0) + 1
    end
    {name: name, procs: procs}
  end
end
241
+
242
# Convert a NodeRequest object to a valid Torque string
# The result is "<count>[:ppn=<procs>][:<property>...]"; a nil or empty
# property list adds nothing, so the string never ends in a dangling ":".
# @param node [#procs, #properties] the node request
# @param cnt [#to_s] how many nodes of this kind are requested
# @return [String] the node specification
def node_request_to_str(node, cnt)
  parts = [cnt.to_s]
  parts << "ppn=#{node.procs}" if node.procs
  parts.concat(Array(node.properties))
  parts.join(':')
end
249
+
250
# Parse hash describing PBS job status
# @param k [#to_s] the job id
# @param v [Hash] job attributes keyed by symbol
# @return [Info] information describing the job
def parse_job_info(k, v)
  # The owner is reported as "user@host". Keep everything before the "@" so
  # that user names containing characters such as "." are not discarded.
  # The named capture assigns the local variable job_owner (nil if no match).
  /\A(?<job_owner>[^@]+)@/ =~ v[:Job_Owner]
  allocated_nodes = parse_nodes(v[:exec_host] || "")
  resources_used = v.fetch(:resources_used, {})
  Info.new(
    id: k,
    status: STATE_MAP.fetch(v[:job_state], :undetermined),
    allocated_nodes: allocated_nodes,
    submit_host: v[:submit_host],
    job_name: v[:Job_Name],
    job_owner: job_owner,
    accounting_id: v[:Account_Name],
    procs: allocated_nodes.inject(0) { |sum, x| sum + x[:procs] },
    queue_name: v[:queue],
    wallclock_time: duration_in_seconds(resources_used[:walltime]),
    cpu_time: duration_in_seconds(resources_used[:cput]),
    submission_time: v[:ctime],
    dispatch_time: v[:start_time],
    native: v
  )
end
271
+ end
272
+ end
273
+ end
274
+ end
@@ -0,0 +1,41 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+
3
+ module OodCore
4
+ module Job
5
+ # A factory that builds job adapter objects from a configuration.
6
+ class Factory
7
+ using Refinements::HashExtensions
8
+
9
+ class << self
10
# Create the job adapter named by a configuration
# @param config [#to_h] configuration describing job adapter
# @option config [#to_s] :adapter The job adapter to use
# @raise [AdapterNotSpecified] if no adapter is specified
# @raise [LoadError] if the adapter's file cannot be required
# @raise [AdapterNotFound] if no build method exists for the adapter
# @return [Adapter] the job adapter object
def build(config)
  settings = config.to_h.symbolize_keys
  unless settings.key?(:adapter)
    raise AdapterNotSpecified, "job configuration does not specify adapter"
  end
  adapter = settings[:adapter].to_s

  # Load the file that defines the adapter and its build method
  begin
    require "ood_core/job/adapters/#{adapter}"
  rescue Gem::LoadError
    raise Gem::LoadError, "Specified '#{adapter}' for job adapter, but the gem is not loaded."
  rescue LoadError
    raise LoadError, "Could not load '#{adapter}'. Make sure that the job adapter in the configuration file is valid."
  end

  builder = "build_#{adapter}"
  return send(builder, settings) if respond_to?(builder)

  raise AdapterNotFound, "job configuration specifies nonexistent #{adapter} adapter"
end
38
+ end
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,141 @@
1
+ require 'time'
2
+
3
+ module OodCore
4
+ module Job
5
# An object that describes a submitted job
class Info
  # The identifier of the job
  # @return [String] job id
  attr_reader :id

  # The status of the job
  # @return [Status] job state
  attr_reader :status

  # Set of machines that is utilized for job execution
  # @return [Array<NodeInfo>] allocated nodes
  attr_reader :allocated_nodes

  # Name of the submission host for this job
  # @return [String, nil] submit host
  attr_reader :submit_host

  # Name of the job
  # @return [String, nil] job name
  attr_reader :job_name

  # Owner of job
  # @return [String, nil] job owner
  attr_reader :job_owner

  # The account the job is charged against
  # @return [String, nil] accounting id
  attr_reader :accounting_id

  # Number of procs allocated for job
  # @return [Integer, nil] allocated total number of procs
  attr_reader :procs

  # Name of the queue in which the job was queued or started
  # @return [String, nil] queue name
  attr_reader :queue_name

  # The accumulated wall clock time in seconds
  # @return [Integer, nil] wallclock time
  attr_reader :wallclock_time

  # The accumulated CPU time in seconds
  # @return [Integer, nil] cpu time
  attr_reader :cpu_time

  # The time at which the job was submitted
  # @return [Time, nil] submission time
  attr_reader :submission_time

  # The time the job first entered a "Started" state
  # @return [Time, nil] dispatch time
  attr_reader :dispatch_time

  # Native resource manager output for job info
  # @note Should not be used by generic apps
  # @return [Object] native info
  attr_reader :native

  # Unrecognized keyword arguments are accepted and ignored.
  # @param id [#to_s] job id
  # @param status [#to_sym] job state
  # @param allocated_nodes [Array<#to_h>] allocated nodes
  # @param submit_host [#to_s, nil] submit host
  # @param job_name [#to_s, nil] job name
  # @param job_owner [#to_s, nil] job owner
  # @param accounting_id [#to_s, nil] accounting id
  # @param procs [#to_i, nil] allocated total number of procs
  # @param queue_name [#to_s, nil] queue name
  # @param wallclock_time [#to_i, nil] wallclock time
  # @param cpu_time [#to_i, nil] cpu time
  # @param submission_time [#to_i, nil] submission time
  # @param dispatch_time [#to_i, nil] dispatch time
  # @param native [Object] native info
  def initialize(id:, status:, allocated_nodes: [], submit_host: nil,
                 job_name: nil, job_owner: nil, accounting_id: nil,
                 procs: nil, queue_name: nil, wallclock_time: nil,
                 cpu_time: nil, submission_time: nil, dispatch_time: nil,
                 native: nil, **_)
    @id              = id.to_s
    @status          = Status.new(state: status.to_sym)
    @allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(n.to_h) }
    # optional attributes stay nil (or false) when not supplied
    @submit_host     = submit_host     && submit_host.to_s
    @job_name        = job_name        && job_name.to_s
    @job_owner       = job_owner       && job_owner.to_s
    @accounting_id   = accounting_id   && accounting_id.to_s
    @procs           = procs           && procs.to_i
    @queue_name      = queue_name      && queue_name.to_s
    @wallclock_time  = wallclock_time  && wallclock_time.to_i
    @cpu_time        = cpu_time        && cpu_time.to_i
    # times are given as a number of seconds and stored as Time objects
    @submission_time = submission_time && Time.at(submission_time.to_i)
    @dispatch_time   = dispatch_time   && Time.at(dispatch_time.to_i)
    @native          = native
  end

  # Convert object to hash
  # @return [Hash] object as hash
  def to_h
    {
      id:              id,
      status:          status,
      allocated_nodes: allocated_nodes,
      submit_host:     submit_host,
      job_name:        job_name,
      job_owner:       job_owner,
      accounting_id:   accounting_id,
      procs:           procs,
      queue_name:      queue_name,
      wallclock_time:  wallclock_time,
      cpu_time:        cpu_time,
      submission_time: submission_time,
      dispatch_time:   dispatch_time,
      native:          native
    }
  end

  # The comparison operator
  # An object that cannot be converted with #to_h is never equal; it does
  # not raise.
  # @param other [#to_h] object to compare against
  # @return [Boolean] whether objects are equivalent
  def ==(other)
    other.respond_to?(:to_h) && to_h == other.to_h
  end

  # Whether objects are identical to each other
  # @param other [#to_h] object to compare against
  # @return [Boolean] whether objects are identical
  def eql?(other)
    self.class == other.class && self == other
  end

  # Generate a hash value for this object
  # @return [Integer] hash value of object
  def hash
    [self.class, to_h].hash
  end
end
140
+ end
141
+ end