ood_core 0.5.1 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/lib/ood_core/batch_connect/template.rb +17 -6
- data/lib/ood_core/batch_connect/templates/vnc.rb +2 -2
- data/lib/ood_core/job/adapters/drmaa.rb +1002 -0
- data/lib/ood_core/job/adapters/helper.rb +18 -0
- data/lib/ood_core/job/adapters/lsf/batch.rb +4 -3
- data/lib/ood_core/job/adapters/lsf.rb +4 -2
- data/lib/ood_core/job/adapters/pbspro.rb +19 -8
- data/lib/ood_core/job/adapters/sge/batch.rb +203 -0
- data/lib/ood_core/job/adapters/sge/helper.rb +65 -0
- data/lib/ood_core/job/adapters/sge/qstat_xml_j_r_listener.rb +116 -0
- data/lib/ood_core/job/adapters/sge/qstat_xml_r_listener.rb +138 -0
- data/lib/ood_core/job/adapters/sge.rb +163 -0
- data/lib/ood_core/job/adapters/slurm.rb +16 -5
- data/lib/ood_core/job/adapters/torque/attributes.rb +109 -0
- data/lib/ood_core/job/adapters/torque/batch.rb +470 -0
- data/lib/ood_core/job/adapters/torque/error.rb +403 -0
- data/lib/ood_core/job/adapters/torque/ffi.rb +430 -0
- data/lib/ood_core/job/adapters/torque.rb +23 -18
- data/lib/ood_core/job/status.rb +3 -13
- data/lib/ood_core/refinements/drmaa_extensions.rb +21 -0
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +3 -3
- metadata +23 -9
@@ -0,0 +1,430 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
class OodCore::Job::Adapters::Torque::FFI
|
4
|
+
# An interface to the C-library of Torque
|
5
|
+
extend ::FFI::Library
|
6
|
+
|
7
|
+
# @!attribute [rw] self.pbs_errno
|
8
|
+
# The internal PBS error number
|
9
|
+
# int pbs_errno
|
10
|
+
# @return [Fixnum] pbs error number
|
11
|
+
|
12
|
+
# @!attribute [r] self.pbs_server
|
13
|
+
# The PBS server name
|
14
|
+
# char *pbs_server
|
15
|
+
# @return [String] pbs server name
|
16
|
+
|
17
|
+
# @!method self.pbs_strerror(errno)
|
18
|
+
# Generates PBS error string from given error number
|
19
|
+
# char *pbs_strerror(int errno)
|
20
|
+
# @param errno [Fixnum] pbs error number
|
21
|
+
# @return [String] pbs error string
|
22
|
+
|
23
|
+
# @!method self.pbs_default
|
24
|
+
# Default PBS server name
|
25
|
+
# char *pbs_default(void)
|
26
|
+
# @see http://linux.die.net/man/3/pbs_default
|
27
|
+
# @return [String] default pbs server name
|
28
|
+
|
29
|
+
# @!method self.pbs_connect(server)
|
30
|
+
# Connect to PBS batch server
|
31
|
+
# int pbs_connect(char *server)
|
32
|
+
# @see http://linux.die.net/man/3/pbs_connect
|
33
|
+
# @param server [String] name of pbs server
|
34
|
+
# @return [Fixnum] connection identifier
|
35
|
+
|
36
|
+
# @!method self.pbs_disconnect(connect)
|
37
|
+
# Disconnect from a PBS batch server
|
38
|
+
# int pbs_disconnect(int connect)
|
39
|
+
# @see http://linux.die.net/man/3/pbs_disconnect
|
40
|
+
# @param connect [Fixnum] connection identifier
|
41
|
+
# @return [Fixnum] exit status code
|
42
|
+
|
43
|
+
# @!method self.pbs_deljob(connect, job_id, extend)
|
44
|
+
# Delete a PBS batch job
|
45
|
+
# int pbs_deljob(int connect, char *job_id, char *extend)
|
46
|
+
# @see http://linux.die.net/man/3/pbs_deljob
|
47
|
+
# @param connect [Fixnum] connection identifier
|
48
|
+
# @param job_id [String] the job id
|
49
|
+
# @param extend [String] implementation defined extensions
|
50
|
+
# @return [Fixnum] exit status code
|
51
|
+
|
52
|
+
# @!method self.pbs_holdjob(connect, job_id, hold_type, extend)
|
53
|
+
# Place a hold on a PBS batch job
|
54
|
+
# int pbs_holdjob(int connect, char *job_id, char *hold_type, char *extend)
|
55
|
+
# @see http://linux.die.net/man/3/pbs_holdjob
|
56
|
+
# @param connect [Fixnum] connection identifier
|
57
|
+
# @param job_id [String] the job id
|
58
|
+
# @param hold_type [String] type of hold to be applied
|
59
|
+
# @param extend [String] implementation defined extensions
|
60
|
+
# @return [Fixnum] exit status code
|
61
|
+
|
62
|
+
# @!method self.pbs_rlsjob(connect, job_id, hold_type, extend)
|
63
|
+
# Release a hold on a PBS batch job
|
64
|
+
# int pbs_rlsjob(int connect, char *job_id, char *hold_type, char *extend)
|
65
|
+
# @see http://linux.die.net/man/3/pbs_rlsjob
|
66
|
+
# @param connect [Fixnum] connection identifier
|
67
|
+
# @param job_id [String] the job id
|
68
|
+
# @param hold_type [String] type of hold to be released
|
69
|
+
# @param extend [String] implementation defined extensions
|
70
|
+
# @return [Fixnum] exit status code
|
71
|
+
|
72
|
+
# @!method self.pbs_statfree(stat)
|
73
|
+
# Free the memory allocated for a {BatchStatus} object
|
74
|
+
# void pbs_statfree(struct batch_status *stat)
|
75
|
+
# @param stat [BatchStatus] the batch status object
|
76
|
+
# @return [void]
|
77
|
+
|
78
|
+
# @!method self.pbs_statjob(connect, id, attrib, extend)
|
79
|
+
# Obtain status of PBS batch jobs
|
80
|
+
# batch_status * pbs_statjob(int connect, char *id, struct attrl *attrib, char *extend)
|
81
|
+
# @see http://linux.die.net/man/3/pbs_statjob
|
82
|
+
# @param connect [Fixnum] connection identifier
|
83
|
+
# @param id [String] job or destination identifier
|
84
|
+
# @param attrib [Attrl] the attribute c-linked list object
|
85
|
+
# @param extend [String] implementation defined extensions
|
86
|
+
# @return [BatchStatus] c-linked list of batch status objects
|
87
|
+
# @note It is up to the user to free the space of the batch status objects
|
88
|
+
|
89
|
+
# @!method self.pbs_statnode(connect, id, attrib, extend)
|
90
|
+
# Obtain status of PBS nodes
|
91
|
+
# batch_status * pbs_statnode(int connect, char *id, struct attrl *attrib, char *extend)
|
92
|
+
# @see http://linux.die.net/man/3/pbs_statnode
|
93
|
+
# @param connect [Fixnum] connection identifier
|
94
|
+
# @param id [String] name of a node or null string
|
95
|
+
# @param attrib [Attrl] the attribute c-linked list object
|
96
|
+
# @param extend [String] implementation defined extensions
|
97
|
+
# @return [BatchStatus] c-linked list of batch status objects
|
98
|
+
# @note It is up to the user to free the space of the batch status objects
|
99
|
+
|
100
|
+
# @!method self.pbs_statque(connect, id, attrib, extend)
|
101
|
+
# Obtain status of PBS batch queues
|
102
|
+
# batch_status * pbs_statque(int connect, char *id, struct attrl *attrib, char *extend)
|
103
|
+
# @see http://linux.die.net/man/3/pbs_statque
|
104
|
+
# @param connect [Fixnum] connection identifier
|
105
|
+
# @param id [String] name of a queue or null string
|
106
|
+
# @param attrib [Attrl] the attribute c-linked list object
|
107
|
+
# @param extend [String] implementation defined extensions
|
108
|
+
# @return [BatchStatus] c-linked list of batch status objects
|
109
|
+
# @note It is up to the user to free the space of the batch status objects
|
110
|
+
|
111
|
+
# @!method self.pbs_statserver(connect, attrib, extend)
|
112
|
+
# Obtain status of a PBS batch server
|
113
|
+
# batch_status * pbs_statserver(int connect, struct attrl *attrib, char *extend)
|
114
|
+
# @see http://linux.die.net/man/3/pbs_statserver
|
115
|
+
# @param connect [Fixnum] connection identifier
|
116
|
+
# @param attrib [Attrl] the attribute c-linked list object
|
117
|
+
# @param extend [String] implementation defined extensions
|
118
|
+
# @return [BatchStatus] c-linked list of batch status objects
|
119
|
+
# @note It is up to the user to free the space of the batch status objects
|
120
|
+
|
121
|
+
# @!method self.pbs_selstat(connect, attrib, extend)
|
122
|
+
# Obtain status of selected PBS batch jobs
|
123
|
+
# batch_status * pbs_selstat(int connect, struct attropl *sel_list, char *extend)
|
124
|
+
# @see http://linux.die.net/man/3/pbs_selstat
|
125
|
+
# @param connect [Fixnum] connection identifier
|
126
|
+
# @param attrib [Attropl] the attribute operation c-linked list object
|
127
|
+
# @param extend [String] implementation defined extensions
|
128
|
+
# @return [BatchStatus] c-linked list of batch status objects
|
129
|
+
# @note It is up to the user to free the space of the batch status objects
|
130
|
+
|
131
|
+
# @!method self.pbs_submit(connect, attrib, script, destination, extend)
|
132
|
+
# Submit a PBS batch job
|
133
|
+
# char *pbs_submit(int connect, struct attropl *attrib, char *script, char *destination, char *extend)
|
134
|
+
# @see http://linux.die.net/man/3/pbs_submit
|
135
|
+
# @param connect [Fixnum] connection identifier
|
136
|
+
# @param attrib [Attropl] the attribute operation c-linked list object
|
137
|
+
# @param script [String] the path to the script
|
138
|
+
# @param destination [String] the queue to send job to
|
139
|
+
# @param extend [String] implementation defined extensions
|
140
|
+
# @return [String] the job id
|
141
|
+
|
142
|
+
# The path to the torque library file
|
143
|
+
# @return [String] path to torque library
|
144
|
+
def self.lib
|
145
|
+
@lib
|
146
|
+
end
|
147
|
+
|
148
|
+
# Define torque methods using a supplied library
|
149
|
+
# @param lib [#to_s, nil] path to library file
|
150
|
+
# @return [void]
|
151
|
+
def self.lib=(lib)
|
152
|
+
@lib = lib ? lib.to_s : 'torque'
|
153
|
+
|
154
|
+
# Set up FFI to use this library
|
155
|
+
ffi_lib @lib
|
156
|
+
|
157
|
+
attach_variable :pbs_errno, :int
|
158
|
+
attach_variable :pbs_server, :string
|
159
|
+
attach_function :pbs_strerror, [ :int ], :string
|
160
|
+
attach_function :pbs_default, [], :string
|
161
|
+
attach_function :pbs_connect, [ :string ], :int
|
162
|
+
attach_function :pbs_disconnect, [ :int ], :int
|
163
|
+
attach_function :pbs_deljob, [ :int, :string, :string ], :int
|
164
|
+
attach_function :pbs_holdjob, [ :int, :string, :string, :string ], :int
|
165
|
+
attach_function :pbs_rlsjob, [ :int, :string, :string, :string ], :int
|
166
|
+
attach_function :pbs_statfree, [ BatchStatus.ptr ], :void
|
167
|
+
attach_function :pbs_statjob, [ :int, :string, Attrl.ptr, :string ], BatchStatus.ptr
|
168
|
+
attach_function :pbs_statnode, [ :int, :string, Attrl.ptr, :string ], BatchStatus.ptr
|
169
|
+
attach_function :pbs_statque, [ :int, :string, Attrl.ptr, :string ], BatchStatus.ptr
|
170
|
+
attach_function :pbs_statserver, [ :int, Attrl.ptr, :string ], BatchStatus.ptr
|
171
|
+
attach_function :pbs_selstat, [ :int, Attropl.ptr, :string ], BatchStatus.ptr
|
172
|
+
|
173
|
+
# FIXME: The space for the job_identifier string is allocated by
|
174
|
+
# pbs_submit() and should be released via a call to free() when no longer
|
175
|
+
# needed
|
176
|
+
attach_function :pbs_submit, [ :int, Attropl.ptr, :string, :string, :string ], :string
|
177
|
+
end
|
178
|
+
|
179
|
+
# Check for any errors set in the errno
|
180
|
+
# @return [void]
|
181
|
+
def self.check_for_error
|
182
|
+
errno = pbs_errno
|
183
|
+
self.pbs_errno = 0 # reset error number
|
184
|
+
raise_error(errno) if errno > 0
|
185
|
+
end
|
186
|
+
|
187
|
+
# For a given errno, raise the corresponding error with error message
|
188
|
+
# @param errno [Fixnum] the error number
|
189
|
+
# @raise [Error] if errno is not 0
|
190
|
+
# @return [void]
|
191
|
+
def self.raise_error(errno)
|
192
|
+
# Fall back to the adapter's root error class for unmapped error numbers (the "pbs" gem and its PBS::Error are no longer loaded)
raise (ERROR_CODES[errno] || OodCore::Job::Adapters::Torque::Batch::Error), "#{pbs_strerror(errno)}"
|
193
|
+
end
|
194
|
+
|
195
|
+
#
|
196
|
+
# Data structures defined in pbs_ifl.h
|
197
|
+
#
|
198
|
+
|
199
|
+
# Enum for Batch Operation
|
200
|
+
BatchOp = enum(:set, :unset, :incr, :decr, :eq, :ne, :ge, :gt, :le, :lt, :dflt, :merge, :incr_old)
|
201
|
+
|
202
|
+
# Struct for Attribute C-linked list
|
203
|
+
class Attrl < ::FFI::Struct
|
204
|
+
layout :next, Attrl.ptr, # pointer to next Attrl object
|
205
|
+
:name, :pointer, # string for name of attribute
|
206
|
+
:resource, :pointer, # string for resource if this attribute is a resource
|
207
|
+
:value, :pointer, # string for value of attribute
|
208
|
+
:op, BatchOp # not used in an Attrl object
|
209
|
+
|
210
|
+
# Given an array of attribute names convert it to {Attrl} C-linked list
|
211
|
+
# @param list [Array<Symbol>] list of attribute names
|
212
|
+
# @return [Attrl] generated attribute c-linked list object
|
213
|
+
def self.from_list(list)
|
214
|
+
attrl = nil
|
215
|
+
prev = Attrl.new(::FFI::Pointer::NULL)
|
216
|
+
list.each do |key|
|
217
|
+
attrl = Attrl.new
|
218
|
+
attrl[:name] = ::FFI::MemoryPointer.from_string(key.to_s)
|
219
|
+
attrl[:next] = prev
|
220
|
+
prev = attrl
|
221
|
+
end
|
222
|
+
attrl
|
223
|
+
end
|
224
|
+
|
225
|
+
# Convert to hash describing this linked list
|
226
|
+
# @return [Hash] hash describing linked list
|
227
|
+
def to_h
|
228
|
+
attrl = self
|
229
|
+
hash = {}
|
230
|
+
until attrl.to_ptr.null?
|
231
|
+
n = attrl[:name].read_string
|
232
|
+
v = attrl[:value].read_string
|
233
|
+
r = attrl[:resource].null? ? nil : attrl[:resource].read_string
|
234
|
+
r ? (hash[n.to_sym] ||= {} and hash[n.to_sym][r.to_sym] = v) : hash[n.to_sym] = v
|
235
|
+
attrl = attrl[:next]
|
236
|
+
end
|
237
|
+
hash
|
238
|
+
end
|
239
|
+
end
|
240
|
+
|
241
|
+
# Struct for Attribute Operation C-linked list
|
242
|
+
class Attropl < ::FFI::Struct
|
243
|
+
layout :next, Attropl.ptr, # pointer to next Attropl object
|
244
|
+
:name, :pointer, # string for name of attribute
|
245
|
+
:resource, :pointer, # string for resource if this attribute is a resource
|
246
|
+
:value, :pointer, # string for value of attribute
|
247
|
+
:op, BatchOp # operation to perform for this attribute
|
248
|
+
|
249
|
+
# Convert to C-linked list of structs from list of hashes
|
250
|
+
# @param list [Array<#to_h>] list of hashes describing attribute
|
251
|
+
# @return [Attropl] generated attribute operation c-linked list object
|
252
|
+
def self.from_list(list)
|
253
|
+
list = list.map(&:to_h)
|
254
|
+
attropl = nil
|
255
|
+
prev = Attropl.new(::FFI::Pointer::NULL)
|
256
|
+
list.each do |attrib|
|
257
|
+
attropl = Attropl.new
|
258
|
+
attropl[:name] = ::FFI::MemoryPointer.from_string attrib[:name].to_s
|
259
|
+
attropl[:value] = ::FFI::MemoryPointer.from_string attrib[:value].to_s
|
260
|
+
attropl[:resource] = ::FFI::MemoryPointer.from_string attrib[:resource].to_s
|
261
|
+
attropl[:op] = (attrib[:op] || :eq).to_sym
|
262
|
+
attropl[:next] = prev
|
263
|
+
prev = attropl
|
264
|
+
end
|
265
|
+
attropl
|
266
|
+
end
|
267
|
+
end
|
268
|
+
|
269
|
+
# Struct for PBS batch server status responses
|
270
|
+
class BatchStatus < ::FFI::ManagedStruct
|
271
|
+
layout :next, BatchStatus.ptr, # pointer to next BatchStatus object
|
272
|
+
:name, :string, # string for name of this status
|
273
|
+
:attribs, Attrl.ptr, # pointer to beginning of C-linked list of an Attrl object
|
274
|
+
:text, :string # string containing unknown text
|
275
|
+
|
276
|
+
# Free memory for allocated {BatchStatus} C-linked list
|
277
|
+
def self.release(ptr)
|
278
|
+
# pbs_statfree is attached to the enclosing Torque::FFI class, not to BatchStatus, so it must be called with an explicit receiver
OodCore::Job::Adapters::Torque::FFI.pbs_statfree(ptr)
|
279
|
+
end
|
280
|
+
|
281
|
+
# Convert to hash describing this linked list
|
282
|
+
# @return [Hash] hash describing linked list
|
283
|
+
def to_h
|
284
|
+
batch = self
|
285
|
+
hash = {}
|
286
|
+
until batch.to_ptr.null?
|
287
|
+
hash[batch[:name]] = batch[:attribs].to_h
|
288
|
+
batch = batch[:next]
|
289
|
+
end
|
290
|
+
hash
|
291
|
+
end
|
292
|
+
end
|
293
|
+
|
294
|
+
# Defined error codes, valid as of Torque >=4.2.10
|
295
|
+
ERROR_CODES = {
|
296
|
+
15001 => UnkjobidError,
|
297
|
+
15002 => NoattrError,
|
298
|
+
15003 => AttrroError,
|
299
|
+
15004 => IvalreqError,
|
300
|
+
15005 => UnkreqError,
|
301
|
+
15006 => ToomanyError,
|
302
|
+
15007 => PermError,
|
303
|
+
15008 => IffNotFoundError,
|
304
|
+
15009 => MungeNotFoundError,
|
305
|
+
15010 => BadhostError,
|
306
|
+
15011 => JobexistError,
|
307
|
+
15012 => SystemError,
|
308
|
+
15013 => InternalError,
|
309
|
+
15014 => RegrouteError,
|
310
|
+
15015 => UnksigError,
|
311
|
+
15016 => BadatvalError,
|
312
|
+
15017 => ModatrrunError,
|
313
|
+
15018 => BadstateError,
|
314
|
+
15020 => UnkqueError,
|
315
|
+
15021 => BadcredError,
|
316
|
+
15022 => ExpiredError,
|
317
|
+
15023 => QunoenbError,
|
318
|
+
15024 => QacessError,
|
319
|
+
15025 => BaduserError,
|
320
|
+
15026 => HopcountError,
|
321
|
+
15027 => QueexistError,
|
322
|
+
15028 => AttrtypeError,
|
323
|
+
15029 => QuebusyError,
|
324
|
+
15030 => QuenbigError,
|
325
|
+
15031 => NosupError,
|
326
|
+
15032 => QuenoenError,
|
327
|
+
15033 => ProtocolError,
|
328
|
+
15034 => BadatlstError,
|
329
|
+
15035 => NoconnectsError,
|
330
|
+
15036 => NoserverError,
|
331
|
+
15037 => UnkrescError,
|
332
|
+
15038 => ExcqrescError,
|
333
|
+
15039 => QuenodfltError,
|
334
|
+
15040 => NorerunError,
|
335
|
+
15041 => RouterejError,
|
336
|
+
15042 => RouteexpdError,
|
337
|
+
15043 => MomrejectError,
|
338
|
+
15044 => BadscriptError,
|
339
|
+
15045 => StageinError,
|
340
|
+
15046 => RescunavError,
|
341
|
+
15047 => BadgrpError,
|
342
|
+
15048 => MaxquedError,
|
343
|
+
15049 => CkpbsyError,
|
344
|
+
15050 => ExlimitError,
|
345
|
+
15051 => BadacctError,
|
346
|
+
15052 => AlrdyexitError,
|
347
|
+
15053 => NocopyfileError,
|
348
|
+
15054 => CleanedoutError,
|
349
|
+
15055 => NosyncmstrError,
|
350
|
+
15056 => BaddependError,
|
351
|
+
15057 => DuplistError,
|
352
|
+
15058 => DisprotoError,
|
353
|
+
15059 => ExecthereError,
|
354
|
+
15060 => SisrejectError,
|
355
|
+
15061 => SiscommError,
|
356
|
+
15062 => SvrdownError,
|
357
|
+
15063 => CkpshortError,
|
358
|
+
15064 => UnknodeError,
|
359
|
+
15065 => UnknodeatrError,
|
360
|
+
15066 => NonodesError,
|
361
|
+
15067 => NodenbigError,
|
362
|
+
15068 => NodeexistError,
|
363
|
+
15069 => BadndatvalError,
|
364
|
+
15070 => MutualexError,
|
365
|
+
15071 => GmoderrError,
|
366
|
+
15072 => NorelymomError,
|
367
|
+
15073 => NotsnodeError,
|
368
|
+
15074 => JobtypeError,
|
369
|
+
15075 => BadaclhostError,
|
370
|
+
15076 => MaxuserquedError,
|
371
|
+
15077 => BaddisallowtypeError,
|
372
|
+
15078 => NointeractiveError,
|
373
|
+
15079 => NobatchError,
|
374
|
+
15080 => NorerunableError,
|
375
|
+
15081 => NononrerunableError,
|
376
|
+
15082 => UnkarrayidError,
|
377
|
+
15083 => BadArrayReqError,
|
378
|
+
15084 => BadArrayDataError,
|
379
|
+
15085 => TimeoutError,
|
380
|
+
15086 => JobnotfoundError,
|
381
|
+
15087 => NofaulttolerantError,
|
382
|
+
15088 => NofaultintolerantError,
|
383
|
+
15089 => NojobarraysError,
|
384
|
+
15090 => RelayedToMomError,
|
385
|
+
15091 => MemMallocError,
|
386
|
+
15092 => MutexError,
|
387
|
+
15093 => ThreadattrError,
|
388
|
+
15094 => ThreadError,
|
389
|
+
15095 => SelectError,
|
390
|
+
15096 => SocketFaultError,
|
391
|
+
15097 => SocketWriteError,
|
392
|
+
15098 => SocketReadError,
|
393
|
+
15099 => SocketCloseError,
|
394
|
+
15100 => SocketListenError,
|
395
|
+
15101 => AuthInvalidError,
|
396
|
+
15102 => NotImplementedError,
|
397
|
+
15103 => QuenotavailableError,
|
398
|
+
15104 => TmpdiffownerError,
|
399
|
+
15105 => TmpnotdirError,
|
400
|
+
15106 => TmpnonameError,
|
401
|
+
15107 => CantopensocketError,
|
402
|
+
15108 => CantcontactsistersError,
|
403
|
+
15109 => CantcreatetmpdirError,
|
404
|
+
15110 => BadmomstateError,
|
405
|
+
15111 => SocketInformationError,
|
406
|
+
15112 => SocketDataError,
|
407
|
+
15113 => ClientInvalidError,
|
408
|
+
15114 => PrematureEofError,
|
409
|
+
15115 => CanNotSaveFileError,
|
410
|
+
15116 => CanNotOpenFileError,
|
411
|
+
15117 => CanNotWriteFileError,
|
412
|
+
15118 => JobFileCorruptError,
|
413
|
+
15119 => JobRerunError,
|
414
|
+
15120 => ConnectError,
|
415
|
+
15121 => JobworkdelayError,
|
416
|
+
15122 => BadParameterError,
|
417
|
+
15123 => ContinueError,
|
418
|
+
15124 => JobsubstateError,
|
419
|
+
15125 => CanNotMoveFileError,
|
420
|
+
15126 => JobRecycledError,
|
421
|
+
15127 => JobAlreadyInQueueError,
|
422
|
+
15128 => InvalidMutexError,
|
423
|
+
15129 => MutexAlreadyLockedError,
|
424
|
+
15130 => MutexAlreadyUnlockedError,
|
425
|
+
15131 => InvalidSyntaxError,
|
426
|
+
15132 => NodeDownError,
|
427
|
+
15133 => ServerNotFoundError,
|
428
|
+
15134 => ServerBusyError,
|
429
|
+
}
|
430
|
+
end
|
@@ -1,7 +1,5 @@
|
|
1
1
|
require "ood_core/refinements/hash_extensions"
|
2
|
-
|
3
|
-
gem "pbs", "~> 2.1"
|
4
|
-
require "pbs"
|
2
|
+
require "ood_core/job/adapters/helper"
|
5
3
|
|
6
4
|
module OodCore
|
7
5
|
module Job
|
@@ -13,12 +11,14 @@ module OodCore
|
|
13
11
|
# @option config [#to_s] :host The batch server host
|
14
12
|
# @option config [#to_s] :lib ('') Path to torque client libraries
|
15
13
|
# @option config [#to_s] :bin ('') Path to torque client binaries
|
14
|
+
# @option config [#to_h] :custom_bin ({}) Optional overrides to Torque client executables
|
16
15
|
def self.build_torque(config)
|
17
16
|
c = config.to_h.symbolize_keys
|
18
17
|
host = c.fetch(:host) { raise ArgumentError, "No host specified. Missing argument: host" }.to_s
|
19
18
|
lib = c.fetch(:lib, "").to_s
|
20
19
|
bin = c.fetch(:bin, "").to_s
|
21
|
-
|
20
|
+
custom_bin = c.fetch(:custom_bin, {})
|
21
|
+
pbs = Adapters::Torque::Batch.new(host: host, lib: lib, bin: bin, custom_bin: custom_bin)
|
22
22
|
Adapters::Torque.new(pbs: pbs)
|
23
23
|
end
|
24
24
|
end
|
@@ -30,6 +30,11 @@ module OodCore
|
|
30
30
|
using Refinements::ArrayExtensions
|
31
31
|
using Refinements::HashExtensions
|
32
32
|
|
33
|
+
require "ood_core/job/adapters/torque/error"
|
34
|
+
require "ood_core/job/adapters/torque/attributes"
|
35
|
+
require "ood_core/job/adapters/torque/ffi"
|
36
|
+
require "ood_core/job/adapters/torque/batch"
|
37
|
+
|
33
38
|
# Mapping of state characters for PBS
|
34
39
|
STATE_MAP = {
|
35
40
|
'Q' => :queued,
|
@@ -44,7 +49,7 @@ module OodCore
|
|
44
49
|
|
45
50
|
# @api private
|
46
51
|
# @param opts [#to_h] the options defining this adapter
|
47
|
-
# @option opts [
|
52
|
+
# @option opts [Torque::Batch] :pbs The PBS batch object
|
48
53
|
# @see Factory.build_torque
|
49
54
|
def initialize(opts = {})
|
50
55
|
o = opts.to_h.symbolize_keys
|
@@ -160,7 +165,7 @@ module OodCore
|
|
160
165
|
# Submit job
|
161
166
|
@pbs.submit(script.content, args: args, env: env, chdir: script.workdir)
|
162
167
|
end
|
163
|
-
rescue
|
168
|
+
rescue Torque::Batch::Error => e
|
164
169
|
raise JobAdapterError, e.message
|
165
170
|
end
|
166
171
|
|
@@ -172,7 +177,7 @@ module OodCore
|
|
172
177
|
@pbs.get_jobs.map do |k, v|
|
173
178
|
parse_job_info(k, v)
|
174
179
|
end
|
175
|
-
rescue
|
180
|
+
rescue Torque::Batch::Error => e
|
176
181
|
raise JobAdapterError, e.message
|
177
182
|
end
|
178
183
|
|
@@ -190,7 +195,7 @@ module OodCore
|
|
190
195
|
).map do |k, v|
|
191
196
|
parse_job_info(k, v)
|
192
197
|
end
|
193
|
-
rescue
|
198
|
+
rescue Torque::Batch::Error => e
|
194
199
|
raise JobAdapterError, e.message
|
195
200
|
end
|
196
201
|
|
@@ -202,13 +207,13 @@ module OodCore
|
|
202
207
|
def info(id)
|
203
208
|
id = id.to_s
|
204
209
|
parse_job_info(*@pbs.get_job(id).flatten)
|
205
|
-
rescue
|
210
|
+
rescue Torque::FFI::UnkjobidError
|
206
211
|
# set completed status if can't find job id
|
207
212
|
Info.new(
|
208
213
|
id: id,
|
209
214
|
status: :completed
|
210
215
|
)
|
211
|
-
rescue
|
216
|
+
rescue Torque::Batch::Error => e
|
212
217
|
raise JobAdapterError, e.message
|
213
218
|
end
|
214
219
|
|
@@ -221,10 +226,10 @@ module OodCore
|
|
221
226
|
id = id.to_s
|
222
227
|
char = @pbs.get_job(id, filters: [:job_state])[id][:job_state]
|
223
228
|
Status.new(state: STATE_MAP.fetch(char, :undetermined))
|
224
|
-
rescue
|
229
|
+
rescue Torque::FFI::UnkjobidError
|
225
230
|
# set completed status if can't find job id
|
226
231
|
Status.new(state: :completed)
|
227
|
-
rescue
|
232
|
+
rescue Torque::Batch::Error => e
|
228
233
|
raise JobAdapterError, e.message
|
229
234
|
end
|
230
235
|
|
@@ -235,10 +240,10 @@ module OodCore
|
|
235
240
|
# @see Adapter#hold
|
236
241
|
def hold(id)
|
237
242
|
@pbs.hold_job(id.to_s)
|
238
|
-
rescue
|
243
|
+
rescue Torque::FFI::UnkjobidError
|
239
244
|
# assume successful job hold if can't find job id
|
240
245
|
nil
|
241
|
-
rescue
|
246
|
+
rescue Torque::Batch::Error => e
|
242
247
|
raise JobAdapterError, e.message
|
243
248
|
end
|
244
249
|
|
@@ -249,10 +254,10 @@ module OodCore
|
|
249
254
|
# @see Adapter#release
|
250
255
|
def release(id)
|
251
256
|
@pbs.release_job(id.to_s)
|
252
|
-
rescue
|
257
|
+
rescue Torque::FFI::UnkjobidError
|
253
258
|
# assume successful job release if can't find job id
|
254
259
|
nil
|
255
|
-
rescue
|
260
|
+
rescue Torque::Batch::Error => e
|
256
261
|
raise JobAdapterError, e.message
|
257
262
|
end
|
258
263
|
|
@@ -263,11 +268,11 @@ module OodCore
|
|
263
268
|
# @see Adapter#delete
|
264
269
|
def delete(id)
|
265
270
|
@pbs.delete_job(id.to_s)
|
266
|
-
rescue
|
271
|
+
rescue Torque::FFI::UnkjobidError, Torque::FFI::BadstateError
|
267
272
|
# assume successful job deletion if can't find job id
|
268
273
|
# assume successful job deletion if job is exiting or completed
|
269
274
|
nil
|
270
|
-
rescue
|
275
|
+
rescue Torque::Batch::Error => e
|
271
276
|
raise JobAdapterError, e.message
|
272
277
|
end
|
273
278
|
|
data/lib/ood_core/job/status.rb
CHANGED
@@ -108,21 +108,11 @@ module OodCore
|
|
108
108
|
# @param block an optional block for the call
|
109
109
|
# @raise [NoMethodError] if method name doesn't pass checks
|
110
110
|
# @return [Boolean] whether it is in this state
|
111
|
-
|
112
|
-
|
113
|
-
self ==
|
114
|
-
else
|
115
|
-
super
|
111
|
+
states.each do |state|
|
112
|
+
define_method("#{state}?") do
|
113
|
+
self == state
|
116
114
|
end
|
117
115
|
end
|
118
|
-
|
119
|
-
# Determines whether this method corresponds to a status check for a valid
|
120
|
-
# state
|
121
|
-
# @param method_name the method name called
|
122
|
-
# @return [Boolean]
|
123
|
-
def respond_to_missing?(method_name, include_private = false)
|
124
|
-
/^(?<other_state>.+)\?$/ =~ method_name && self.class.states.include?(other_state.to_sym) || super
|
125
|
-
end
|
126
116
|
end
|
127
117
|
end
|
128
118
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
module DRMAA
|
3
|
+
# The one and only connection with DRMAA
|
4
|
+
# Attempting to instantiate a DRMAA::Session more than once causes it to crash
|
5
|
+
class SessionSingleton < DRMAA::Session
|
6
|
+
include Singleton
|
7
|
+
end
|
8
|
+
DRMMA_TO_OOD_STATE_MAP = {
|
9
|
+
DRMAA::STATE_UNDETERMINED => :undetermined,
|
10
|
+
DRMAA::STATE_QUEUED_ACTIVE => :queued,
|
11
|
+
DRMAA::STATE_SYSTEM_ON_HOLD => :queued_held,
|
12
|
+
DRMAA::STATE_USER_ON_HOLD => :queued_held,
|
13
|
+
DRMAA::STATE_USER_SYSTEM_ON_HOLD => :queued_held,
|
14
|
+
DRMAA::STATE_RUNNING => :running,
|
15
|
+
DRMAA::STATE_SYSTEM_SUSPENDED => :suspended,
|
16
|
+
DRMAA::STATE_USER_SUSPENDED => :suspended,
|
17
|
+
DRMAA::STATE_USER_SYSTEM_SUSPENDED => :suspended,
|
18
|
+
DRMAA::STATE_DONE => :completed,
|
19
|
+
DRMAA::STATE_FAILED => :completed
|
20
|
+
}
|
21
|
+
end
|
data/lib/ood_core/version.rb
CHANGED
data/ood_core.gemspec
CHANGED
@@ -6,8 +6,8 @@ require 'ood_core/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = "ood_core"
|
8
8
|
spec.version = OodCore::VERSION
|
9
|
-
spec.authors = ["Jeremy Nicklas"]
|
10
|
-
spec.email = ["jnicklas@osc.edu"]
|
9
|
+
spec.authors = ["Jeremy Nicklas", "Morgan Rodgers"]
|
10
|
+
spec.email = ["jnicklas@osc.edu", "mrodgers@osc.edu"]
|
11
11
|
|
12
12
|
spec.summary = %q{Open OnDemand core library}
|
13
13
|
spec.description = %q{Open OnDemand core library that provides support for an HPC Center to globally define HPC services that web applications can then take advantage of.}
|
@@ -23,7 +23,7 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.required_ruby_version = ">= 2.2.0"
|
24
24
|
|
25
25
|
spec.add_runtime_dependency "ood_support", "~> 0.0.2"
|
26
|
-
spec.
|
26
|
+
spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
|
27
27
|
spec.add_development_dependency "bundler", "~> 1.7"
|
28
28
|
spec.add_development_dependency "rake", "~> 10.0"
|
29
29
|
spec.add_development_dependency "rspec", "~> 3.0"
|