ood_core 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,470 @@
1
require 'open3'
require 'pathname'
require 'tempfile'
2
+
3
+ class OodCore::Job::Adapters::Torque
4
+ # Object used for simplified communication with a batch server
5
+ class Batch
6
+ # The host of the Torque batch server
7
+ # @example OSC's Oakley batch server
8
+ # my_conn.host #=> "oak-batch.osc.edu"
9
+ # @return [String] the batch server host
10
+ attr_reader :host
11
+
12
+ # The path to the Torque client installation libraries
13
+ # @example For Torque 5.0.0
14
+ # my_conn.lib.to_s #=> "/usr/local/Torque/5.0.0/lib"
15
+ # @return [Pathname] path to Torque libraries
16
+ attr_reader :lib
17
+
18
+ # The path to the Torque client installation binaries
19
+ # @example For Torque 5.0.0
20
+ # my_conn.bin.to_s #=> "/usr/local/Torque/5.0.0/bin"
21
+ # @return [Pathname] path to Torque binaries
22
+ attr_reader :bin
23
+
24
+ # Optional overrides for Torque client executables
25
+ # @example
26
+ # {'qsub' => '/usr/local/bin/qsub'}
27
+ # @return [Hash<String, String>] executable name => path to use instead
28
+ attr_reader :bin_overrides
29
+
30
+ # The root exception class that all Torque-specific exceptions inherit
31
+ # from
32
+ class Error < StandardError; end
33
+
34
+ # @param host [#to_s] the batch server host
35
+ # @param lib [#to_s] path to Torque client installation libraries
36
+ # @param bin [#to_s] path to Torque client installation binaries
37
# Build a description of how to reach a Torque batch server.
# Unknown keyword arguments are accepted and ignored.
# @param host [#to_s] the batch server host
# @param lib [#to_s] path to the Torque client libraries
# @param bin [#to_s] path to the Torque client binaries
# @param bin_overrides [Hash<String, String>] per-executable path overrides
def initialize(host:, lib: "", bin: "", bin_overrides: {}, **_)
  @host = host.to_s
  # Both directories are kept as Pathname objects so callers can #join on them
  @lib, @bin = [lib, bin].map { |dir| Pathname.new(dir.to_s) }
  @bin_overrides = bin_overrides
end
43
+
44
+ # Convert object to hash
45
+ # @return [Hash] the hash describing this object
46
# Convert object to hash.
# Every configurable attribute is included, so the comparison and hashing
# methods built on this hash distinguish batch servers that differ only in
# their executable overrides.
# @return [Hash] the hash describing this object
def to_h
  { host: host, lib: lib, bin: bin, bin_overrides: bin_overrides }
end
49
+
50
+ # The comparison operator
51
+ # @param other [#to_h] batch server to compare against
52
+ # @return [Boolean] how batch servers compare
53
# The comparison operator.
# Objects that cannot be converted with #to_h are simply unequal; comparing
# against them returns false instead of raising NoMethodError.
# @param other [#to_h] batch server to compare against
# @return [Boolean] how batch servers compare
def ==(other)
  other.respond_to?(:to_h) && to_h == other.to_h
end
56
+
57
+ # Checks whether two batch server objects are completely identical to each
58
+ # other
59
+ # @param other [Batch] batch server to compare against
60
+ # @return [Boolean] whether same objects
61
# Strict equality: the other object must be of the same class and compare
# equal with #==.
# @param other [Batch] batch server to compare against
# @return [Boolean] whether same objects
def eql?(other)
  return false unless self.class == other.class

  self == other
end
64
+
65
+ # Generates a hash value for this object
66
+ # @return [Fixnum] hash value of object
67
# Hash value derived from the class and the attribute hash, so objects that
# are #eql? land in the same Hash bucket.
# @return [Integer] hash value of object
def hash
  identity = [self.class, to_h]
  identity.hash
end
70
+
71
+ # Creates a connection to batch server and calls block in context of this
72
+ # connection
73
+ # @yieldparam cid [Fixnum] connection id from established batch server connection
74
+ # @yieldreturn the final value of the block
75
# Open a connection to the batch server, yield its connection id, and
# always disconnect afterwards.
# @yieldparam cid [Integer] connection id from established batch server connection
# @yieldreturn the final value of the block
# @return the value returned by the block
def connect(&block)
  FFI.lib = lib.join('libtorque.so')
  cid = FFI.pbs_connect(host)
  # A negative connection id signals a failed connection attempt
  if cid < 0
    FFI.raise_error(cid.abs)
  end

  result =
    begin
      yield cid
    ensure
      FFI.pbs_disconnect(cid) # runs even when the block raises
    end

  FFI.check_for_error # surface any error recorded while the block ran
  result
end
87
+
88
+ # Get a hash with status info for this batch server
89
+ # @example Status info for OSC Oakley batch server
90
+ # my_conn.get_status
91
+ # #=>
92
+ # #{
93
+ # # "oak-batch.osc.edu:15001" => {
94
+ # # :server_state => "Idle",
95
+ # # ...
96
+ # # }
97
+ # #}
98
+ # @param filters [Array<Symbol>] list of attribs to filter on
99
+ # @return [Hash] status info for batch server
100
# Query the batch server for its own status attributes.
# The attribute list is built with FFI::Attrl, the same constant #get_jobs
# uses, and the keyword argument is no longer reassigned inside the block.
# @param filters [Array<Symbol>] list of attribs to filter on
# @return [Hash] status info for batch server
def get_status(filters: [])
  connect do |cid|
    attrl = FFI::Attrl.from_list(filters)
    batch_status = FFI.pbs_statserver(cid, attrl, nil)
    # Convert first, then release the status structure
    batch_status.to_h.tap { FFI.pbs_statfree(batch_status) }
  end
end
107
+
108
+ # Get a list of hashes of the queues on the batch server
109
+ # @example Status info for OSC Oakley queues
110
+ # my_conn.get_queues
111
+ # #=>
112
+ # #{
113
+ # # "parallel" => {
114
+ # # :queue_type => "Execution",
115
+ # # ...
116
+ # # },
117
+ # # "serial" => {
118
+ # # :queue_type => "Execution",
119
+ # # ...
120
+ # # },
121
+ # # ...
122
+ # #}
123
+ # @param id [#to_s] the id of requested information
124
+ # @param filters [Array<Symbol>] list of attribs to filter on
125
+ # @return [Hash] hash of details for the queues
126
# Query the batch server for its queues.
# The attribute list is built with FFI::Attrl, the same constant #get_jobs
# uses, and the keyword argument is no longer reassigned inside the block.
# @param id [#to_s] the id of requested information
# @param filters [Array<Symbol>] list of attribs to filter on
# @return [Hash] hash of details for the queues
def get_queues(id: '', filters: [])
  connect do |cid|
    attrl = FFI::Attrl.from_list(filters)
    batch_status = FFI.pbs_statque(cid, id.to_s, attrl, nil)
    # Convert first, then release the status structure
    batch_status.to_h.tap { FFI.pbs_statfree(batch_status) }
  end
end
133
+
134
+ # Get info for given batch server's queue
135
+ # @example Status info for OSC Oakley's parallel queue
136
+ # my_conn.get_queue("parallel")
137
+ # #=>
138
+ # #{
139
+ # # "parallel" => {
140
+ # # :queue_type => "Execution",
141
+ # # ...
142
+ # # }
143
+ # #}
144
+ # @param (see #get_queues)
145
+ # @return [Hash] status info for the queue
146
# Convenience wrapper around #get_queues for a single queue id; any other
# keyword arguments are forwarded untouched.
# @return [Hash] status info for the queue
def get_queue(id, **opts)
  get_queues(id: id, **opts)
end
149
+
150
+
151
+ # Get a list of hashes of the nodes on the batch server
152
+ # @example Status info for OSC Oakley nodes
153
+ # my_conn.get_nodes
154
+ # #=>
155
+ # #{
156
+ # # "n0001" => {
157
+ # # :np => "12",
158
+ # # ...
159
+ # # },
160
+ # # "n0002" => {
161
+ # # :np => "12",
162
+ # # ...
163
+ # # },
164
+ # # ...
165
+ # #}
166
+ # @param id [#to_s] the id of requested information
167
+ # @param filters [Array<Symbol>] list of attribs to filter on
168
+ # @return [Hash] hash of details for nodes
169
# Query the batch server for its nodes.
# The attribute list is built with FFI::Attrl, the same constant #get_jobs
# uses, and the keyword argument is no longer reassigned inside the block.
# @param id [#to_s] the id of requested information
# @param filters [Array<Symbol>] list of attribs to filter on
# @return [Hash] hash of details for nodes
def get_nodes(id: '', filters: [])
  connect do |cid|
    attrl = FFI::Attrl.from_list(filters)
    batch_status = FFI.pbs_statnode(cid, id.to_s, attrl, nil)
    # Convert first, then release the status structure
    batch_status.to_h.tap { FFI.pbs_statfree(batch_status) }
  end
end
176
+
177
+ # Get info for given batch server's node
178
+ # @example Status info for OSC Oakley's 'n0001' node
179
+ # my_conn.get_node('n0001')
180
+ # #=>
181
+ # #{
182
+ # # "n0001" => {
183
+ # # :np => "12",
184
+ # # ...
185
+ # # }
186
+ # #}
187
+ # @param (see #get_nodes)
188
+ # @return [Hash] status info for the node
189
# Convenience wrapper around #get_nodes for a single node id; any other
# keyword arguments are forwarded untouched.
# @return [Hash] status info for the node
def get_node(id, **opts)
  get_nodes(id: id, **opts)
end
192
+
193
+ # Get a list of hashes of the selected jobs on the batch server
194
+ # @example Status info for jobs owned by Bob
195
+ # my_conn.select_jobs(attribs: [{name: "User_List", value: "bob", op: :eq}])
196
+ # #=>
197
+ # #{
198
+ # # "10219837.oak-batch.osc.edu" => {
199
+ # # :Job_Owner => "bob@oakley02.osc.edu",
200
+ # # :Job_Name => "CFD_Solver",
201
+ # # ...
202
+ # # },
203
+ # # "10219839.oak-batch.osc.edu" => {
204
+ # # :Job_Owner => "bob@oakley02.osc.edu",
205
+ # # :Job_Name => "CFD_Solver2",
206
+ # # ...
207
+ # # },
208
+ # # ...
209
+ # #}
210
+ # @param attribs [Array<#to_h>] list of hashes describing attributes to
211
+ # select on
212
+ # @return [Hash] hash of details of selected jobs
213
+ #
214
# Query the batch server for the jobs matching the given selection
# attributes.
# The selection list is built with FFI::Attropl, qualified the same way
# #get_jobs qualifies FFI::Attrl, and the keyword argument is no longer
# reassigned inside the block.
# @param attribs [Array<#to_h>] list of hashes describing attributes to
#   select on
# @return [Hash] hash of details of selected jobs
def select_jobs(attribs: [])
  connect do |cid|
    attropl = FFI::Attropl.from_list(attribs.map(&:to_h))
    batch_status = FFI.pbs_selstat(cid, attropl, nil)
    # Convert first, then release the status structure
    batch_status.to_h.tap { FFI.pbs_statfree(batch_status) }
  end
end
221
+
222
+ # Get a list of hashes of the jobs on the batch server
223
+ # @example Status info for OSC Oakley jobs
224
+ # my_conn.get_jobs
225
+ # #=>
226
+ # #{
227
+ # # "10219837.oak-batch.osc.edu" => {
228
+ # # :Job_Owner => "bob@oakley02.osc.edu",
229
+ # # :Job_Name => "CFD_Solver",
230
+ # # ...
231
+ # # },
232
+ # # "10219838.oak-batch.osc.edu" => {
233
+ # # :Job_Owner => "sally@oakley01.osc.edu",
234
+ # # :Job_Name => "FEA_Solver",
235
+ # # ...
236
+ # # },
237
+ # # ...
238
+ # #}
239
+ # @param id [#to_s] the id of requested information
240
+ # @param filters [Array<Symbol>] list of attribs to filter on
241
+ # @return [Hash] hash of details for jobs
242
# Query the batch server for its jobs.
# @param id [#to_s] the id of requested information
# @param filters [Array<Symbol>] list of attribs to filter on
# @return [Hash] hash of details for jobs
def get_jobs(id: '', filters: [])
  connect do |cid|
    filters = FFI::Attrl.from_list(filters)
    status = FFI.pbs_statjob(cid, id.to_s, filters, nil)
    details = status.to_h
    FFI.pbs_statfree(status) # release the status structure once converted
    details
  end
end
249
+
250
+ # Get info for given batch server's job
251
+ # @example Status info for OSC Oakley's '10219837.oak-batch.osc.edu' job
252
+ # my_conn.get_job('10219837.oak-batch.osc.edu')
253
+ # #=>
254
+ # #{
255
+ # # "10219837.oak-batch.osc.edu" => {
256
+ # # :Job_Owner => "bob@oakley02.osc.edu",
257
+ # # :Job_Name => "CFD_Solver",
258
+ # # ...
259
+ # # }
260
+ # #}
261
+ # @param (see #get_jobs)
262
+ # @return [Hash] hash with details of job
263
# Convenience wrapper around #get_jobs for a single job id; any other
# keyword arguments are forwarded untouched.
# @return [Hash] hash with details of job
def get_job(id, **opts)
  get_jobs(id: id, **opts)
end
266
+
267
+ # Put specified job on hold
268
+ # Possible hold types:
269
+ # :u => Available to the owner of the job, the batch operator and the batch administrator
270
+ # :o => Available to the batch operator and the batch administrator
271
+ # :s => Available to the batch administrator
272
+ # @example Put job '10219837.oak-batch.osc.edu' on hold
273
+ # my_conn.hold_job('10219837.oak-batch.osc.edu')
274
+ # @param id [#to_s] the id of the job
275
+ # @param type [#to_s] type of hold to be applied
276
+ # @return [void]
277
# Place a hold of the given type on a job.
# @param id [#to_s] the id of the job
# @param type [#to_s] type of hold to be applied
# @return [void]
def hold_job(id, type: :u)
  connect { |cid| FFI.pbs_holdjob(cid, id.to_s, type.to_s, nil) }
end
282
+
283
+ # Release a specified job that is on hold
284
+ # Possible hold types:
285
+ # :u => Available to the owner of the job, the batch operator and the batch administrator
286
+ # :o => Available to the batch operator and the batch administrator
287
+ # :s => Available to the batch administrator
288
+ # @example Release job '10219837.oak-batch.osc.edu' from hold
289
+ # my_conn.release_job('10219837.oak-batch.osc.edu')
290
+ # @param id [#to_s] the id of the job
291
+ # @param type [#to_s] type of hold to be removed
292
+ # @return [void]
293
# Remove a hold of the given type from a job.
# @param id [#to_s] the id of the job
# @param type [#to_s] type of hold to be removed
# @return [void]
def release_job(id, type: :u)
  connect { |cid| FFI.pbs_rlsjob(cid, id.to_s, type.to_s, nil) }
end
298
+
299
+ # Delete a specified job from batch server
300
+ # @example Delete job '10219837.oak-batch.osc.edu' from batch
301
+ # my_conn.delete_job('10219837.oak-batch.osc.edu')
302
+ # @param id [#to_s] the id of the job
303
+ # @return [void]
304
# Remove a job from the batch server.
# @param id [#to_s] the id of the job
# @return [void]
def delete_job(id)
  connect { |cid| FFI.pbs_deljob(cid, id.to_s, nil) }
end
309
+
310
+ # Submit a script to the batch server
311
+ # @example Submit a script with a few PBS directives
312
+ # my_conn.submit_script("/path/to/script",
313
+ # headers: {
314
+ # Job_Name: "myjob",
315
+ # Join_Path: "oe"
316
+ # },
317
+ # resources: {
318
+ # nodes: "4:ppn=12",
319
+ # walltime: "12:00:00"
320
+ # },
321
+ # envvars: {
322
+ # TOKEN: "asd90f9sd8g90hk34"
323
+ # }
324
+ # )
325
+ # #=> "6621251.oak-batch.osc.edu"
326
+ # @param script [#to_s] path to the script
327
+ # @param queue [#to_s] queue to submit script to
328
+ # @param headers [Hash] pbs headers
329
+ # @param resources [Hash] pbs resources
330
+ # @param envvars [Hash] pbs environment variables
331
+ # @param qsub [Boolean] whether to submit with the `qsub` binary (true) or the FFI library (false)
332
+ # @return [String] the id of the job that was created
333
+ # @deprecated Use {#submit} instead.
334
# Submit a script file, either through the `qsub` binary or through the
# FFI library depending on +qsub+.
# @return [String] the id of the job that was created
# @deprecated Use {#submit} instead.
def submit_script(script, queue: nil, headers: {}, resources: {}, envvars: {}, qsub: true)
  # A nil queue becomes "" here, which the submitters treat as "no queue"
  submission = [script.to_s, queue.to_s, headers, resources, envvars]
  if qsub
    qsub_submit(*submission)
  else
    pbs_submit(*submission)
  end
end
337
+
338
+ # Submit a script expanded into a string to the batch server
339
+ # @param string [#to_s] script as a string
340
+ # @param (see #submit_script)
341
+ # @return [String] the id of the job that was created
342
+ # @deprecated Use {#submit} instead.
343
# Submit a script expanded into a string to the batch server.
# The script is written to a temporary file that is deleted as soon as the
# submission returns (or raises), instead of being left for later cleanup.
# @param string [#to_s] script as a string
# @param (see #submit_script)
# @return [String] the id of the job that was created
# @deprecated Use {#submit} instead.
def submit_string(string, **kwargs)
  Tempfile.create('qsub.') do |f|
    f.write(string.to_s)
    f.close # flush to disk so the submitter reads the complete script
    submit_script(f.path, **kwargs)
  end
end
350
+
351
+ # Submit a script expanded as a string to the batch server
352
+ # @param content [#to_s] script as a string
353
+ # @param args [Array<#to_s>] arguments passed to `qsub` command
354
+ # @param env [Hash{#to_s => #to_s}] environment variables set
355
+ # @param chdir [#to_s, nil] working directory where `qsub` is called from
356
+ # @raise [Error] if `qsub` command exited unsuccessfully
357
+ # @return [String] the id of the job that was created
358
# Pipe the script content to `qsub` on stdin and return the job id it prints.
# @param content [#to_s] script as a string
# @param args [Array<#to_s>] arguments passed to `qsub` command
# @param env [Hash{#to_s => #to_s}] environment variables set
# @param chdir [#to_s, nil] working directory where `qsub` is called from
# @raise [Error] if `qsub` command exited unsuccessfully
# @return [String] the id of the job that was created
def submit(content, args: [], env: {}, chdir: nil)
  output = call(:qsub, *args, env: env, stdin: content, chdir: chdir)
  output.strip # drop the surrounding whitespace/newline from the command output
end
361
+
362
+ private
363
+ # Submit a script using FFI library
364
# Submit a script using the FFI library.
# The attribute list is built with FFI::Attropl, qualified the same way
# #get_jobs qualifies FFI::Attrl.
# @param script [String] path to the script
# @param queue [String] queue to submit script to
# @param headers [Hash] pbs headers, one attribute per entry
# @param resources [Hash] pbs resources, each sent as a Resource_List attribute
# @param envvars [Hash] pbs environment variables, sent as one Variable_List
# @return the value returned by FFI.pbs_submit
def pbs_submit(script, queue, headers, resources, envvars)
  attribs = headers.map { |name, value| { name: name, value: value } }
  attribs.concat(
    resources.map { |rsc, value| { name: :Resource_List, resource: rsc, value: value } }
  )
  unless envvars.empty?
    # All variables travel in a single comma-separated Variable_List value
    attribs << {
      name: :Variable_List,
      value: envvars.map { |k, v| "#{k}=#{v}" }.join(",")
    }
  end

  connect do |cid|
    attropl = FFI::Attropl.from_list(attribs)
    FFI.pbs_submit(cid, attropl, script, queue, nil)
  end
end
384
+
385
+ # Mapping of FFI attribute to `qsub` arguments
386
# Translate one PBS attribute into the matching `qsub` command line
# arguments. Keys without a dedicated flag fall back to `-W key=value`.
# @param key [Symbol] attribute name
# @param value [#to_s] attribute value
# @return [Array<String>] arguments to append to the `qsub` command
def qsub_arg(key, value)
  # Attributes whose argument list does not follow the "flag value" shape
  case key
  when :fault_tolerant then return ['-f']
  when :Hold_Types     then return ['-h']
  when :reservation_id then return ['-W', "x=advres:#{value}"] # use resource manager extensions for Moab
  end

  flag =
    case key
    # common attributes
    when :Execution_Time    then '-a'
    when :Checkpoint        then '-c'
    when :Error_Path        then '-e'
    when :Join_Path         then '-j'
    when :Keep_Files        then '-k'
    when :Mail_Points       then '-m'
    when :Output_Path       then '-o'
    when :Priority          then '-p'
    when :Rerunable         then '-r'
    when :job_array_request then '-t'
    when :User_List         then '-u'
    when :Account_Name      then '-A'
    when :Mail_Users        then '-M'
    when :Job_Name          then '-N'
    when :Shell_Path_List   then '-S'
    # uncommon attributes
    when :job_arguments     then '-F'
    when :init_work_dir     then '-d' # sets PBS_O_INITDIR
    end

  flag ? [flag, value.to_s] : ['-W', "#{key}=#{value}"]
end
435
+
436
+ # Submit a script using the Torque `qsub` binary
437
+ # NB: The binary includes many useful filters and is preferred
438
# Submit a script by running the `qsub` executable.
# NB: The binary includes many useful filters and is preferred
# @param script [String] path to the script
# @param queue [#to_s] queue to submit script to; blank means no `-q` option
# @param headers [Hash] pbs headers, translated through #qsub_arg
# @param resources [Hash] pbs resources, each passed as `-l key=value`
# @param envvars [Hash] pbs environment variables, passed as one `-v` list
# @raise [Error] if `qsub` exited unsuccessfully (message is its stderr)
# @return [String] standard output of `qsub` without the trailing newline
def qsub_submit(script, queue, headers, resources, envvars)
  params = []
  params.push("-q", queue.to_s) unless queue.to_s.empty?
  params.concat(headers.flat_map { |k, v| qsub_arg(k, v) })
  params.concat(resources.flat_map { |k, v| ["-l", "#{k}=#{v}"] })
  params.push("-v", envvars.map { |k, v| "#{k}=#{v}" }.join(",")) unless envvars.empty?
  params << script

  # Blank components are dropped: an empty LD_LIBRARY_PATH entry (leading,
  # trailing or doubled ':') makes the loader search the current directory.
  ld_library_path = [lib.to_s, ENV['LD_LIBRARY_PATH']]
                    .reject { |dir| dir.nil? || dir.empty? }
                    .join(':')
  env = {
    "PBS_DEFAULT" => host.to_s,
    "LD_LIBRARY_PATH" => ld_library_path
  }
  cmd = OodCore::Job::Adapters::Helper.bin_path('qsub', bin, bin_overrides)
  o, e, s = Open3.capture3(env, cmd, *params)
  raise Error, e unless s.success?
  o.chomp
end
455
+
456
+ # Call a forked PBS command for a given host
457
# Run a Torque client command against this batch server.
# @param cmd [#to_s] name of the executable, resolved through Helper.bin_path
# @param args [Array<#to_s>] command line arguments
# @param env [#to_h] extra environment variables; PBS_DEFAULT and
#   LD_LIBRARY_PATH are always set by this method and override these
# @param stdin [#to_s] data written to the command's standard input
# @param chdir [#to_s, nil] working directory, defaults to the current one
# @raise [Error] if the command exited unsuccessfully (message is its stderr)
# @return [String] standard output of the command
def call(cmd, *args, env: {}, stdin: "", chdir: nil)
  cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
  args = args.map(&:to_s)

  # Blank components are dropped: an empty LD_LIBRARY_PATH entry (leading,
  # trailing or doubled ':') makes the loader search the current directory.
  ld_library_path = [lib.to_s, ENV["LD_LIBRARY_PATH"]]
                    .reject { |dir| dir.nil? || dir.empty? }
                    .join(":")
  env = env.to_h.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }.merge(
    "PBS_DEFAULT" => host,
    "LD_LIBRARY_PATH" => ld_library_path
  )

  stdin = stdin.to_s
  chdir ||= "."
  o, e, s = Open3.capture3(env, cmd, *args, stdin_data: stdin, chdir: chdir.to_s)
  s.success? ? o : raise(Error, e)
end
469
+ end
470
+ end