pbs 1.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,403 @@
1
+ module PBS
2
+ # The root exception class that all PBS-specific exceptions inherit from
3
+ class Error < StandardError; end
4
+
5
+ # Unknown job ID error
6
+ class UnkjobidError < Error; end
7
+
8
+ # Undefined attribute
9
+ class NoattrError < Error; end
10
+
11
+ # Cannot set attribute, read only or insufficient permission
12
+ class AttrroError < Error; end
13
+
14
+ # Invalid request
15
+ class IvalreqError < Error; end
16
+
17
+ # Unknown request
18
+ class UnkreqError < Error; end
19
+
20
+ # Too many submit retries
21
+ class ToomanyError < Error; end
22
+
23
+ # Unauthorized Request
24
+ class PermError < Error; end
25
+
26
+ # trqauthd unable to authenticate
27
+ class IffNotFoundError < Error; end
28
+
29
+ # Munge executable not found, unable to authenticate
30
+ class MungeNotFoundError < Error; end
31
+
32
+ # Access from host not allowed, or unknown host
33
+ class BadhostError < Error; end
34
+
35
+ # Job with requested ID already exists
36
+ class JobexistError < Error; end
37
+
38
+ # System error
39
+ class SystemError < Error; end
40
+
41
+ # PBS server internal error
42
+ class InternalError < Error; end
43
+
44
+ # Dependent parent job currently in routing queue
45
+ class RegrouteError < Error; end
46
+
47
+ # Unknown/illegal signal name
48
+ class UnksigError < Error; end
49
+
50
+ # Illegal attribute or resource value for
51
+ class BadatvalError < Error; end
52
+
53
+ # Cannot modify attribute while job running
54
+ class ModatrrunError < Error; end
55
+
56
+ # Request invalid for state of job
57
+ class BadstateError < Error; end
58
+
59
+ # Unknown queue
60
+ class UnkqueError < Error; end
61
+
62
+ # Invalid credential
63
+ class BadcredError < Error; end
64
+
65
+ # Expired credential
66
+ class ExpiredError < Error; end
67
+
68
+ # Queue is not enabled
69
+ class QunoenbError < Error; end
70
+
71
+ # Access to queue is denied
72
+ class QacessError < Error; end
73
+
74
+ # Bad UID for job execution
75
+ class BaduserError < Error; end
76
+
77
+ # Job routing over too many hops
78
+ class HopcountError < Error; end
79
+
80
+ # Queue already exists
81
+ class QueexistError < Error; end
82
+
83
+ # Incompatible type
84
+ class AttrtypeError < Error; end
85
+
86
+ # Cannot delete busy queue
87
+ class QuebusyError < Error; end
88
+
89
+ # Queue name too long
90
+ class QuenbigError < Error; end
91
+
92
+ # No support for requested service
93
+ class NosupError < Error; end
94
+
95
+ # Cannot enable queue, incomplete definition
96
+ class QuenoenError < Error; end
97
+
98
+ # Batch protocol error
99
+ class ProtocolError < Error; end
100
+
101
+ # Bad attribute list structure
102
+ class BadatlstError < Error; end
103
+
104
+ # No free connections
105
+ class NoconnectsError < Error; end
106
+
107
+ # No server specified
108
+ class NoserverError < Error; end
109
+
110
+ # Unknown resource type
111
+ class UnkrescError < Error; end
112
+
113
+ # Job exceeds queue resource limits
114
+ class ExcqrescError < Error; end
115
+
116
+ # No default queue specified
117
+ class QuenodfltError < Error; end
118
+
119
+ # Job is not rerunnable
120
+ class NorerunError < Error; end
121
+
122
+ # Job rejected by all possible destinations (check syntax, queue resources, …)
123
+ class RouterejError < Error; end
124
+
125
+ # Time in Route Queue Expired
126
+ class RouteexpdError < Error; end
127
+
128
+ # Execution server rejected request
129
+ class MomrejectError < Error; end
130
+
131
+ # (qsub) cannot access script file
132
+ class BadscriptError < Error; end
133
+
134
+ # Stage-in of files failed
135
+ class StageinError < Error; end
136
+
137
+ # Resource temporarily unavailable
138
+ class RescunavError < Error; end
139
+
140
+ # Bad GID for job execution
141
+ class BadgrpError < Error; end
142
+
143
+ # Maximum number of jobs already in queue
144
+ class MaxquedError < Error; end
145
+
146
+ # Checkpoint busy, may retry
147
+ class CkpbsyError < Error; end
148
+
149
+ # Resource limit exceeds allowable
150
+ class ExlimitError < Error; end
151
+
152
+ # Invalid Account
153
+ class BadacctError < Error; end
154
+
155
+ # Job already in exit state
156
+ class AlrdyexitError < Error; end
157
+
158
+ # Job files not copied
159
+ class NocopyfileError < Error; end
160
+
161
+ # Unknown job id after clean init
162
+ class CleanedoutError < Error; end
163
+
164
+ # No master found for sync job set
165
+ class NosyncmstrError < Error; end
166
+
167
+ # Invalid Job Dependency
168
+ class BaddependError < Error; end
169
+
170
+ # Duplicate entry in list
171
+ class DuplistError < Error; end
172
+
173
+ # Bad DIS based Request Protocol
174
+ class DisprotoError < Error; end
175
+
176
+ # Cannot execute at specified host because of checkpoint or stagein files
177
+ class ExecthereError < Error; end
178
+
179
+ # Sister rejected
180
+ class SisrejectError < Error; end
181
+
182
+ # Sister could not communicate
183
+ class SiscommError < Error; end
184
+
185
+ # Request not allowed: Server shutting down
186
+ class SvrdownError < Error; end
187
+
188
+ # Not all tasks could checkpoint
189
+ class CkpshortError < Error; end
190
+
191
+ # Unknown node
192
+ class UnknodeError < Error; end
193
+
194
+ # Unknown node-attribute
195
+ class UnknodeatrError < Error; end
196
+
197
+ # Server has no node list
198
+ class NonodesError < Error; end
199
+
200
+ # Node name is too big
201
+ class NodenbigError < Error; end
202
+
203
+ # Node name already exists
204
+ class NodeexistError < Error; end
205
+
206
+ # Illegal value for
207
+ class BadndatvalError < Error; end
208
+
209
+ # Mutually exclusive values for
210
+ class MutualexError < Error; end
211
+
212
+ # Modification failed for
213
+ class GmoderrError < Error; end
214
+
215
+ # Server could not connect to MOM
216
+ class NorelymomError < Error; end
217
+
218
+ # No time-share node available
219
+ class NotsnodeError < Error; end
220
+
221
+ # Wrong job type
222
+ class JobtypeError < Error; end
223
+
224
+ # Bad ACL entry in host list
225
+ class BadaclhostError < Error; end
226
+
227
+ # Maximum number of jobs already in queue for user
228
+ class MaxuserquedError < Error; end
229
+
230
+ # Bad type in disallowedTypes list
231
+ class BaddisallowtypeError < Error; end
232
+
233
+ # Queue does not allow interactive jobs
234
+ class NointeractiveError < Error; end
235
+
236
+ # Queue does not allow batch jobs
237
+ class NobatchError < Error; end
238
+
239
+ # Queue does not allow rerunable jobs
240
+ class NorerunableError < Error; end
241
+
242
+ # Queue does not allow nonrerunable jobs
243
+ class NononrerunableError < Error; end
244
+
245
+ # Unknown Array ID
246
+ class UnkarrayidError < Error; end
247
+
248
+ # Bad Job Array Request
249
+ class BadArrayReqError < Error; end
250
+
251
+ # Bad data reading job array from file
252
+ class BadArrayDataError < Error; end
253
+
254
+ # Time out
255
+ class TimeoutError < Error; end
256
+
257
+ # Job not found
258
+ class JobnotfoundError < Error; end
259
+
260
+ # Queue does not allow fault tolerant jobs
261
+ class NofaulttolerantError < Error; end
262
+
263
+ # Queue does not allow fault intolerant jobs
264
+ class NofaultintolerantError < Error; end
265
+
266
+ # Queue does not allow job arrays
267
+ class NojobarraysError < Error; end
268
+
269
+ # Request was relayed to a MOM
270
+ class RelayedToMomError < Error; end
271
+
272
+ # Error allocating memory - out of memory
273
+ class MemMallocError < Error; end
274
+
275
+ # Error allocating controling mutex (lock/unlock)
276
+ class MutexError < Error; end
277
+
278
+ # Error setting thread attributes
279
+ class ThreadattrError < Error; end
280
+
281
+ # Error creating thread
282
+ class ThreadError < Error; end
283
+
284
+ # Error in socket select
285
+ class SelectError < Error; end
286
+
287
+ # Unable to get connection to socket
288
+ class SocketFaultError < Error; end
289
+
290
+ # Error writing data to socket
291
+ class SocketWriteError < Error; end
292
+
293
+ # Error reading data from socket
294
+ class SocketReadError < Error; end
295
+
296
+ # Socket close detected
297
+ class SocketCloseError < Error; end
298
+
299
+ # Error listening on socket
300
+ class SocketListenError < Error; end
301
+
302
+ # Invalid auth type in request
303
+ class AuthInvalidError < Error; end
304
+
305
+ # This functionality is not yet implemented
306
+ class NotImplementedError < Error; end
307
+
308
+ # Queue is currently not available
309
+ class QuenotavailableError < Error; end
310
+
311
+ # tmpdir owned by another user
312
+ class TmpdiffownerError < Error; end
313
+
314
+ # tmpdir exists but is not a directory
315
+ class TmpnotdirError < Error; end
316
+
317
+ # tmpdir cannot be named for job
318
+ class TmpnonameError < Error; end
319
+
320
+ # Cannot open demux sockets
321
+ class CantopensocketError < Error; end
322
+
323
+ # Cannot send join job to all sisters
324
+ class CantcontactsistersError < Error; end
325
+
326
+ # Cannot create tmpdir for job
327
+ class CantcreatetmpdirError < Error; end
328
+
329
+ # Mom is down, cannot run job
330
+ class BadmomstateError < Error; end
331
+
332
+ # Socket information is not accessible
333
+ class SocketInformationError < Error; end
334
+
335
+ # Data on socket does not process correctly
336
+ class SocketDataError < Error; end
337
+
338
+ # Client is not allowed/trusted
339
+ class ClientInvalidError < Error; end
340
+
341
+ # Premature End of File
342
+ class PrematureEofError < Error; end
343
+
344
+ # Error saving file
345
+ class CanNotSaveFileError < Error; end
346
+
347
+ # Error opening file
348
+ class CanNotOpenFileError < Error; end
349
+
350
+ # Error writing file
351
+ class CanNotWriteFileError < Error; end
352
+
353
+ # Job file corrupt
354
+ class JobFileCorruptError < Error; end
355
+
356
+ # Job can not be rerun
357
+ class JobRerunError < Error; end
358
+
359
+ # Can not establish connection
360
+ class ConnectError < Error; end
361
+
362
+ # Job function must be temporarily delayed
363
+ class JobworkdelayError < Error; end
364
+
365
+ # Parameter of function was invalid
366
+ class BadParameterError < Error; end
367
+
368
+ # Continue processing on job. (Not an error)
369
+ class ContinueError < Error; end
370
+
371
+ # Current sub state does not allow trasaction.
372
+ class JobsubstateError < Error; end
373
+
374
+ # Error moving file
375
+ class CanNotMoveFileError < Error; end
376
+
377
+ # Job is being recycled
378
+ class JobRecycledError < Error; end
379
+
380
+ # Job is already in destination queue.
381
+ class JobAlreadyInQueueError < Error; end
382
+
383
+ # Mutex is NULL or otherwise invalid
384
+ class InvalidMutexError < Error; end
385
+
386
+ # The mutex is already locked by this object
387
+ class MutexAlreadyLockedError < Error; end
388
+
389
+ # The mutex has already been unlocked by this object
390
+ class MutexAlreadyUnlockedError < Error; end
391
+
392
+ # Command syntax invalid
393
+ class InvalidSyntaxError < Error; end
394
+
395
+ # A node is down. Check the MOM and host
396
+ class NodeDownError < Error; end
397
+
398
+ # Could not connect to batch server
399
+ class ServerNotFoundError < Error; end
400
+
401
+ # Server busy. Currently no available threads
402
+ class ServerBusyError < Error; end
403
+ end
@@ -0,0 +1,189 @@
1
require "open3"
require "shellwords"
require "socket"
require "tempfile"
4
+
5
module PBS
  # Handle for a single batch job. A job can be submitted, put on hold,
  # released, deleted, or queried for its status through a connection object.
  class Job
    # Name of the local host; prefixed to the output and error paths when a
    # job is submitted through the Torque library.
    HOSTNAME = Socket.gethostname

    # @return [String, nil] the batch job id (set by {#submit} when not supplied)
    attr_accessor :id

    # @return [Conn] the connection object used to talk to the batch server
    attr_reader :conn

    # Needs a connection object and headers.
    # Examples of headers are found in 'headers.rb'.
    # @param args [Hash] job specific options
    # @option args [String] :id id of an already existing job
    # @option args [Conn] :conn connection object (defaults to <tt>Conn.new</tt>)
    def initialize(args = {})
      @id = args[:id]
      @conn = args[:conn] || Conn.new
    end

    # Put job on hold.
    # @param args [Hash]
    # @option args [String] :hold_type ("u") the type of hold to be applied:
    #   "u" : Available to the owner of the job, the batch operator and the batch administrator.
    #   "o" : Available to the batch operator and the batch administrator.
    #   "s" : Available only to the batch administrator.
    # @return [Job] self
    def hold(args = {})
      _pbs_hold(args[:hold_type] || "u")
      self
    end

    # Release job from hold.
    # @param args [Hash]
    # @option args [String] :hold_type ("u") the type of hold to be released:
    #   "u" : Available to the owner of the job, the batch operator and the batch administrator.
    #   "o" : Available to the batch operator and the batch administrator.
    #   "s" : Available only to the batch administrator.
    # @return [Job] self
    def release(args = {})
      _pbs_release(args[:hold_type] || "u")
      self
    end

    # Delete job.
    # @param args [Hash] currently unused
    # @return the result of <tt>Torque.check_for_error</tt>
    def delete(args = {})
      _pbs_delete
    end

    # Get status of job by creating a Query object.
    # @param args [Hash] extra options merged into the query (the job id is added)
    # @return the first entry returned by the query
    def status(args = {})
      q = Query.new(type: :job, conn: conn)
      q.find(args.merge(id: id))[0]
    end

    # Can submit a script as a file or string
    # @param args [Hash] The options when submitting a job.
    # @option args [String] :string The batch script as a string.
    # @option args [String] :file The batch script file if a string is not supplied.
    # @option args [String] :queue (nil) The queue the job is submitted to.
    # @option args [Boolean] :qsub (true) Whether the <tt>qsub</tt> command is used from command line.
    # @option args [Hash] :headers ({}) PBS headers.
    # @option args [Hash] :resources ({}) PBS resources.
    # @option args [Hash] :envvars ({}) PBS environment variables.
    # @raise [Error] if fail to submit batch job.
    # @return [Job] self
    def submit(args)
      # File.read closes the handle; File.open(...).read left it open
      string    = args.fetch(:string) { File.read(args[:file]) }
      queue     = args.fetch(:queue, nil)
      qsub      = args.fetch(:qsub, true)

      headers   = args.fetch(:headers, {})
      resources = args.fetch(:resources, {})
      envvars   = args.fetch(:envvars, {})

      # Create batch script in tmp file, submit, remove tmp file
      script = Tempfile.new('qsub.')
      begin
        script.write string
        script.close
        if qsub
          _qsub_submit(script.path, queue, headers, resources, envvars)
        else
          _pbs_submit(script.path, queue, headers, resources, envvars)
        end
      ensure
        script.unlink # deletes the temp file
      end

      self
    end

    private

    # Connect to the batch server if needed, yield the connection id,
    # disconnect, and finally check for errors. The disconnect happens in an
    # +ensure+ so the connection is not left open when the block raises.
    # @yieldparam conn_id the id of the open connection
    # @return the result of <tt>Torque.check_for_error</tt>
    def _with_connection
      conn.connect unless conn.connected?
      begin
        yield conn.conn_id
      ensure
        conn.disconnect
      end
      Torque.check_for_error
    end

    # Connect to batch server, put job on hold,
    # disconnect, and finally check for errors
    def _pbs_hold(hold_type)
      _with_connection { |conn_id| Torque.pbs_holdjob(conn_id, id, hold_type, nil) }
    end

    # Connect to batch server, release job from hold,
    # disconnect, and finally check for errors
    def _pbs_release(hold_type)
      _with_connection { |conn_id| Torque.pbs_rlsjob(conn_id, id, hold_type, nil) }
    end

    # Connect to batch server, delete job,
    # disconnect, and finally check for errors
    def _pbs_delete
      _with_connection { |conn_id| Torque.pbs_deljob(conn_id, id, nil) }
    end

    # Connect to server, submit job with headers,
    # disconnect, and finally check for errors
    def _pbs_submit(script, queue, headers, resources, envvars)
      # Generate attribute hash for this job
      attribs = _default_headers.merge(headers)
      attribs[ATTR[:l]] = _default_resources.merge(resources)
      attribs[ATTR[:v]] = _default_envvars.merge(envvars).map { |k, v| "#{k}=#{v}" }.join(",")

      # Prefix output/error paths with the local host name. New strings are
      # built on purpose: prepending in place would mutate strings owned by the
      # caller's headers hash and fail on frozen strings.
      [ATTR[:o], ATTR[:e]].each do |key|
        attribs[key] = "#{HOSTNAME}:#{attribs[key]}"
      end

      # Submit job
      _with_connection do |conn_id|
        attropl = Torque::Attropl.from_hash(attribs)
        self.id = Torque.pbs_submit(conn_id, attropl, script, queue, nil)
      end
    end

    # Submit using system call `qsub`
    # Note: Do not need to filter as OSC has personal torque filter
    # @raise [Error] with the command's stderr if the command exits unsuccessfully
    def _qsub_submit(script, queue, headers, resources, envvars)
      argv = ["-q", "#{queue}@#{conn.server}"]
      resources.each { |k, v| argv.push("-l", "#{k}=#{v}") }
      argv.push("-v", envvars.map { |k, v| "#{k}=#{v}" }.join(",")) unless envvars.empty?
      headers.each do |k, v|
        # Headers that map to a one-letter attribute become that option,
        # everything else is passed through -W
        flag = ATTR.key(k)
        if flag && flag.length == 1
          argv.push("-#{flag}", v.to_s)
        else
          argv.push("-W", "#{k}=#{v}")
        end
      end
      argv << script

      # Escape every argument so values containing quotes or spaces cannot
      # break (or inject into) the shell command line
      cmd = "#{conn.qsub} #{Shellwords.join(argv)}"

      # capture3 closes stdin and drains both pipes before waiting, avoiding
      # the deadlock possible when waiting on the process before reading
      stdout, stderr, status = Open3.capture3(cmd)
      raise PBS::Error, stderr unless status.success?

      self.id = stdout.chomp # newline char at end of job id
    end

    # Hash representing the job headers
    def _default_headers
      {
        ATTR[:N] => "Jobname",
        ATTR[:o] => "#{Dir.pwd}/",
        ATTR[:e] => "#{Dir.pwd}/",
        ATTR[:S] => "/bin/bash",
      }
    end

    # Hash representing the resources used
    def _default_resources
      {
        walltime: "01:00:00",
      }
    end

    # Hash representing the PBS working directory
    def _default_envvars
      {
        PBS_O_WORKDIR: "#{Dir.pwd}",
      }
    end
  end
end