ood_core 0.5.1 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,470 @@
1
+ require 'open3'
2
+
3
+ class OodCore::Job::Adapters::Torque
4
+ # Object used for simplified communication with a batch server
5
+ class Batch
6
+ # The host of the Torque batch server
7
+ # @example OSC's Oakley batch server
8
+ # my_conn.host #=> "oak-batch.osc.edu"
9
+ # @return [String] the batch server host
10
+ attr_reader :host
11
+
12
+ # The path to the Torque client installation libraries
13
+ # @example For Torque 5.0.0
14
+ # my_conn.lib.to_s #=> "/usr/local/Torque/5.0.0/lib"
15
+ # @return [Pathname] path to Torque libraries
16
+ attr_reader :lib
17
+
18
+ # The path to the Torque client installation binaries
19
+ # @example For Torque 5.0.0
20
+ # my_conn.bin.to_s #=> "/usr/local/Torque/5.0.0/bin"
21
+ # @return [Pathname] path to Torque binaries
22
+ attr_reader :bin
23
+
24
+ # Optional overrides for Torque client executables
25
+ # @example
26
+ # {'qsub' => '/usr/local/bin/qsub'}
27
+     # @return [Hash<String, String>]
28
+ attr_reader :bin_overrides
29
+
30
+ # The root exception class that all Torque-specific exceptions inherit
31
+ # from
32
+ class Error < StandardError; end
33
+
34
# Build the description of a Torque batch server and its client install.
# @param host [#to_s] the batch server host
# @param lib [#to_s] path to the Torque client installation libraries
# @param bin [#to_s] path to the Torque client installation binaries
# @param bin_overrides [#to_h, nil] optional overrides for Torque client
#   executables, e.g. `{'qsub' => '/usr/local/bin/qsub'}`
# @param _ [Hash] any other keyword options are accepted and ignored
def initialize(host:, lib: "", bin: "", bin_overrides: {}, **_)
  @host = host.to_s
  @lib = Pathname.new(lib.to_s)
  @bin = Pathname.new(bin.to_s)
  # Coerce so an explicit `nil` behaves like "no overrides" and the reader
  # always hands back a Hash. A Hash argument is kept as the same object.
  @bin_overrides = bin_overrides.to_h
end
43
+
44
# Convert object to hash
# @return [Hash] the attributes describing this object, keyed by `:host`,
#   `:lib`, `:bin` and `:bin_overrides`
def to_h
  # bin_overrides is part of the configuration too; without it #== and
  # #hash (both built on this hash) cannot tell apart two servers that use
  # different client executables.
  { host: host, lib: lib, bin: bin, bin_overrides: bin_overrides }
end
49
+
50
# The comparison operator
# @param other [#to_h] batch server to compare against
# @return [Boolean] whether both objects convert to the same hash; `false`
#   when `other` cannot be converted with `#to_h`
def ==(other)
  # Guard first so comparing against an arbitrary object (e.g. an Integer)
  # answers false instead of raising NoMethodError.
  other.respond_to?(:to_h) && to_h == other.to_h
end
56
+
57
# Strict equality: the other object must be of the very same class and
# also be equal according to #==
# @param other [Batch] batch server to compare against
# @return [Boolean] whether same objects
def eql?(other)
  return false unless self.class == other.class

  self == other
end
64
+
65
# Generates a hash value for this object
# @return [Integer] hash value computed from the object's class and #to_h
def hash
  identity = [self.class, to_h]
  identity.hash
end
70
+
71
# Opens a connection to the batch server, yields its connection id to the
# block, and disconnects again once the block has finished
# @yieldparam cid [Integer] connection id from established batch server connection
# @yieldreturn the final value of the block
# @return the value returned by the block
def connect(&block)
  FFI.lib = lib.join('libtorque.so')
  conn_id = FFI.pbs_connect(host)
  # a negative id signals a failed connection; its magnitude is passed on
  FFI.raise_error(conn_id.abs) if conn_id < 0
  result =
    begin
      yield conn_id
    ensure
      FFI.pbs_disconnect(conn_id) # runs even when the block raises
    end
  FFI.check_for_error # only reached when the block did not raise
  result
end
87
+
88
+ # Get a hash with status info for this batch server
89
+ # @example Status info for OSC Oakley batch server
90
+ # my_conn.get_status
91
+ # #=>
92
+ # #{
93
+ # # "oak-batch.osc.edu:15001" => {
94
+ # # :server_state => "Idle",
95
+ # # ...
96
+ # # }
97
+ # #}
98
+ # @param filters [Array<Symbol>] list of attribs to filter on
99
+ # @return [Hash] status info for batch server
100
def get_status(filters: [])
  connect do |cid|
    # Qualify Attrl through FFI, the same way #get_jobs does; every other
    # Torque binding used in this class is reached through FFI as well.
    attrl = FFI::Attrl.from_list(filters)
    batch_status = FFI.pbs_statserver(cid, attrl, nil)
    # Build the Ruby hash first, then hand the status list to pbs_statfree.
    batch_status.to_h.tap { FFI.pbs_statfree(batch_status) }
  end
end
107
+
108
+ # Get a list of hashes of the queues on the batch server
109
+ # @example Status info for OSC Oakley queues
110
+ # my_conn.get_queues
111
+ # #=>
112
+ # #{
113
+ # # "parallel" => {
114
+ # # :queue_type => "Execution",
115
+ # # ...
116
+ # # },
117
+ # # "serial" => {
118
+ # # :queue_type => "Execution",
119
+ # # ...
120
+ # # },
121
+ # # ...
122
+ # #}
123
+ # @param id [#to_s] the id of requested information
124
+ # @param filters [Array<Symbol>] list of attribs to filter on
125
+ # @return [Hash] hash of details for the queues
126
def get_queues(id: '', filters: [])
  connect do |cid|
    # Qualify Attrl through FFI, the same way #get_jobs does; every other
    # Torque binding used in this class is reached through FFI as well.
    attrl = FFI::Attrl.from_list(filters)
    batch_status = FFI.pbs_statque(cid, id.to_s, attrl, nil)
    # Build the Ruby hash first, then hand the status list to pbs_statfree.
    batch_status.to_h.tap { FFI.pbs_statfree(batch_status) }
  end
end
133
+
134
+ # Get info for given batch server's queue
135
+ # @example Status info for OSC Oakley's parallel queue
136
+ # my_conn.get_queue("parallel")
137
+ # #=>
138
+ # #{
139
+ # # "parallel" => {
140
+ # # :queue_type => "Execution",
141
+ # # ...
142
+ # # }
143
+ # #}
144
+     # @param (see #get_queues)
145
+ # @return [Hash] status info for the queue
146
def get_queue(id, **kwargs)
  # Single-queue lookup is the multi-queue query restricted to one id;
  # remaining keyword options are passed through untouched.
  options = { id: id }.merge(kwargs)
  get_queues(**options)
end
149
+
150
+
151
+ # Get a list of hashes of the nodes on the batch server
152
+ # @example Status info for OSC Oakley nodes
153
+ # my_conn.get_nodes
154
+ # #=>
155
+ # #{
156
+ # # "n0001" => {
157
+ # # :np => "12",
158
+ # # ...
159
+ # # },
160
+ # # "n0002" => {
161
+ # # :np => "12",
162
+ # # ...
163
+ # # },
164
+ # # ...
165
+ # #}
166
+ # @param id [#to_s] the id of requested information
167
+ # @param filters [Array<Symbol>] list of attribs to filter on
168
+ # @return [Hash] hash of details for nodes
169
def get_nodes(id: '', filters: [])
  connect do |cid|
    # Qualify Attrl through FFI, the same way #get_jobs does; every other
    # Torque binding used in this class is reached through FFI as well.
    attrl = FFI::Attrl.from_list(filters)
    batch_status = FFI.pbs_statnode(cid, id.to_s, attrl, nil)
    # Build the Ruby hash first, then hand the status list to pbs_statfree.
    batch_status.to_h.tap { FFI.pbs_statfree(batch_status) }
  end
end
176
+
177
+ # Get info for given batch server's node
178
+ # @example Status info for OSC Oakley's 'n0001' node
179
+ # my_conn.get_node('n0001')
180
+ # #=>
181
+ # #{
182
+ # # "n0001" => {
183
+ # # :np => "12",
184
+ # # ...
185
+ # # }
186
+ # #}
187
+ # @param (see #get_nodes)
188
+ # @return [Hash] status info for the node
189
def get_node(id, **kwargs)
  # Single-node lookup is the multi-node query restricted to one id;
  # remaining keyword options are passed through untouched.
  options = { id: id }.merge(kwargs)
  get_nodes(**options)
end
192
+
193
+ # Get a list of hashes of the selected jobs on the batch server
194
+ # @example Status info for jobs owned by Bob
195
+ # my_conn.select_jobs(attribs: [{name: "User_List", value: "bob", op: :eq}])
196
+ # #=>
197
+ # #{
198
+ # # "10219837.oak-batch.osc.edu" => {
199
+ # # :Job_Owner => "bob@oakley02.osc.edu",
200
+ # # :Job_Name => "CFD_Solver",
201
+ # # ...
202
+ # # },
203
+ # # "10219839.oak-batch.osc.edu" => {
204
+ # # :Job_Owner => "bob@oakley02.osc.edu",
205
+ # # :Job_Name => "CFD_Solver2",
206
+ # # ...
207
+ # # },
208
+ # # ...
209
+ # #}
210
+ # @param attribs [Array<#to_h>] list of hashes describing attributes to
211
+ # select on
212
+ # @return [Hash] hash of details of selected jobs
213
+ #
214
def select_jobs(attribs: [])
  connect do |cid|
    # Qualify Attropl through FFI, matching how #get_jobs reaches Attrl;
    # every other Torque binding used in this class goes through FFI too.
    attropl = FFI::Attropl.from_list(attribs.map(&:to_h))
    batch_status = FFI.pbs_selstat(cid, attropl, nil)
    # Build the Ruby hash first, then hand the status list to pbs_statfree.
    batch_status.to_h.tap { FFI.pbs_statfree(batch_status) }
  end
end
221
+
222
+ # Get a list of hashes of the jobs on the batch server
223
+ # @example Status info for OSC Oakley jobs
224
+ # my_conn.get_jobs
225
+ # #=>
226
+ # #{
227
+ # # "10219837.oak-batch.osc.edu" => {
228
+ # # :Job_Owner => "bob@oakley02.osc.edu",
229
+ # # :Job_Name => "CFD_Solver",
230
+ # # ...
231
+ # # },
232
+ # # "10219838.oak-batch.osc.edu" => {
233
+ # # :Job_Owner => "sally@oakley01.osc.edu",
234
+ # # :Job_Name => "FEA_Solver",
235
+ # # ...
236
+ # # },
237
+ # # ...
238
+ # #}
239
+ # @param id [#to_s] the id of requested information
240
+ # @param filters [Array<Symbol>] list of attribs to filter on
241
+ # @return [Hash] hash of details for jobs
242
def get_jobs(id: '', filters: [])
  connect do |conn|
    attr_list = FFI::Attrl.from_list(filters)
    status = FFI.pbs_statjob(conn, id.to_s, attr_list, nil)
    # Convert to a Ruby hash before the status list is passed to
    # pbs_statfree; the hash is what the caller receives.
    details = status.to_h
    FFI.pbs_statfree(status)
    details
  end
end
249
+
250
+ # Get info for given batch server's job
251
+ # @example Status info for OSC Oakley's '10219837.oak-batch.osc.edu' job
252
+     #   my_conn.get_job('10219837.oak-batch.osc.edu')
253
+ # #=>
254
+ # #{
255
+ # # "10219837.oak-batch.osc.edu" => {
256
+ # # :Job_Owner => "bob@oakley02.osc.edu",
257
+ # # :Job_Name => "CFD_Solver",
258
+ # # ...
259
+ # # }
260
+ # #}
261
+ # @param (see #get_jobs)
262
+ # @return [Hash] hash with details of job
263
def get_job(id, **kwargs)
  # Single-job lookup is the multi-job query restricted to one id;
  # remaining keyword options are passed through untouched.
  options = { id: id }.merge(kwargs)
  get_jobs(**options)
end
266
+
267
+ # Put specified job on hold
268
+ # Possible hold types:
269
+ # :u => Available to the owner of the job, the batch operator and the batch administrator
270
+ # :o => Available to the batch operator and the batch administrator
271
+ # :s => Available to the batch administrator
272
+ # @example Put job '10219837.oak-batch.osc.edu' on hold
273
+ # my_conn.hold_job('10219837.oak-batch.osc.edu')
274
+ # @param id [#to_s] the id of the job
275
+ # @param type [#to_s] type of hold to be applied
276
+ # @return [void]
277
def hold_job(id, type: :u)
  # Job id and hold type are stringified before being passed to the binding.
  job_id = id.to_s
  hold_type = type.to_s
  connect { |conn| FFI.pbs_holdjob(conn, job_id, hold_type, nil) }
end
282
+
283
+ # Release a specified job that is on hold
284
+ # Possible hold types:
285
+ # :u => Available to the owner of the job, the batch operator and the batch administrator
286
+ # :o => Available to the batch operator and the batch administrator
287
+ # :s => Available to the batch administrator
288
+ # @example Release job '10219837.oak-batch.osc.edu' from hold
289
+ # my_conn.release_job('10219837.oak-batch.osc.edu')
290
+ # @param id [#to_s] the id of the job
291
+ # @param type [#to_s] type of hold to be removed
292
+ # @return [void]
293
def release_job(id, type: :u)
  # Job id and hold type are stringified before being passed to the binding.
  job_id = id.to_s
  hold_type = type.to_s
  connect { |conn| FFI.pbs_rlsjob(conn, job_id, hold_type, nil) }
end
298
+
299
+ # Delete a specified job from batch server
300
+ # @example Delete job '10219837.oak-batch.osc.edu' from batch
301
+ # my_conn.delete_job('10219837.oak-batch.osc.edu')
302
+ # @param id [#to_s] the id of the job
303
+ # @return [void]
304
def delete_job(id)
  # The job id is stringified before being passed to the binding.
  job_id = id.to_s
  connect { |conn| FFI.pbs_deljob(conn, job_id, nil) }
end
309
+
310
+ # Submit a script to the batch server
311
+ # @example Submit a script with a few PBS directives
312
+ # my_conn.submit_script("/path/to/script",
313
+ # headers: {
314
+ # Job_Name: "myjob",
315
+ # Join_Path: "oe"
316
+ # },
317
+ # resources: {
318
+ # nodes: "4:ppn=12",
319
+ # walltime: "12:00:00"
320
+ # },
321
+ # envvars: {
322
+ # TOKEN: "asd90f9sd8g90hk34"
323
+ # }
324
+ # )
325
+ # #=> "6621251.oak-batch.osc.edu"
326
+ # @param script [#to_s] path to the script
327
+ # @param queue [#to_s] queue to submit script to
328
+ # @param headers [Hash] pbs headers
329
+ # @param resources [Hash] pbs resources
330
+ # @param envvars [Hash] pbs environment variables
331
+ # @param qsub [Boolean] whether use library or binary for submission
332
+ # @return [String] the id of the job that was created
333
+ # @deprecated Use {#submit} instead.
334
def submit_script(script, queue: nil, headers: {}, resources: {}, envvars: {}, qsub: true)
  # Both back ends take the same positional arguments; a nil queue is
  # passed on as an empty string.
  submit_args = [script.to_s, queue.to_s, headers, resources, envvars]
  if qsub
    qsub_submit(*submit_args)
  else
    pbs_submit(*submit_args)
  end
end
337
+
338
+ # Submit a script expanded into a string to the batch server
339
+ # @param string [#to_s] script as a string
340
+ # @param (see #submit_script)
341
+ # @return [String] the id of the job that was created
342
+ # @deprecated Use {#submit} instead.
343
def submit_string(string, **kwargs)
  script_file = Tempfile.new('qsub.')
  begin
    script_file.write(string.to_s)
    # Close before submitting so the whole script is flushed to disk.
    script_file.close
    submit_script(script_file.path, **kwargs)
  ensure
    # Remove the temporary script right away, on success and on failure,
    # instead of leaving it on disk until the Tempfile is garbage collected.
    script_file.close!
  end
end
350
+
351
+ # Submit a script expanded as a string to the batch server
352
+ # @param content [#to_s] script as a string
353
+ # @param args [Array<#to_s>] arguments passed to `qsub` command
354
+ # @param env [Hash{#to_s => #to_s}] environment variables set
355
+ # @param chdir [#to_s, nil] working directory where `qsub` is called from
356
+ # @raise [Error] if `qsub` command exited unsuccessfully
357
+ # @return [String] the id of the job that was created
358
def submit(content, args: [], env: {}, chdir: nil)
  # The script body is fed to `qsub` on stdin; the job id is the command's
  # output with surrounding whitespace removed.
  output = call(:qsub, *args, env: env, stdin: content, chdir: chdir)
  output.strip
end
361
+
362
+ private
363
# Submit a script using FFI library
# @param script [String] path to the script
# @param queue [String] queue to submit script to
# @param headers [Hash] pbs headers, one attribute per entry
# @param resources [Hash] pbs resources, each sent as a `Resource_List` entry
# @param envvars [Hash] pbs environment variables, sent as one
#   comma-separated `Variable_List` attribute
# @return the value returned by `FFI.pbs_submit`
def pbs_submit(script, queue, headers, resources, envvars)
  attribs = headers.map { |name, value| { name: name, value: value } }
  attribs.concat(
    resources.map { |rsc, value| { name: :Resource_List, resource: rsc, value: value } }
  )
  unless envvars.empty?
    attribs << {
      name: :Variable_List,
      value: envvars.map { |k, v| "#{k}=#{v}" }.join(",")
    }
  end

  connect do |cid|
    # Qualify Attropl through FFI, matching how #get_jobs reaches Attrl;
    # every other Torque binding used in this class goes through FFI too.
    attropl = FFI::Attropl.from_list(attribs)
    FFI.pbs_submit(cid, attropl, script, queue, nil)
  end
end
384
+
385
# `qsub` switches that take the attribute's value as their argument
QSUB_VALUE_SWITCHES = {
  # common attributes
  Execution_Time: '-a',
  Checkpoint: '-c',
  Error_Path: '-e',
  Join_Path: '-j',
  Keep_Files: '-k',
  Mail_Points: '-m',
  Output_Path: '-o',
  Priority: '-p',
  Rerunable: '-r',
  job_array_request: '-t',
  User_List: '-u',
  Account_Name: '-A',
  Mail_Users: '-M',
  Job_Name: '-N',
  Shell_Path_List: '-S',
  # uncommon attributes
  job_arguments: '-F',
  init_work_dir: '-d' # sets PBS_O_INITDIR
}.freeze

# `qsub` switches that are passed bare; the attribute's value is not used
QSUB_BARE_SWITCHES = {
  fault_tolerant: '-f',
  Hold_Types: '-h'
}.freeze

# Translate one FFI attribute into the matching `qsub` command line arguments
# @param key [Symbol] attribute name
# @param value [#to_s] attribute value
# @return [Array<String>] the switch, followed by its argument if it takes one
def qsub_arg(key, value)
  if (switch = QSUB_VALUE_SWITCHES[key])
    [switch, value.to_s]
  elsif (switch = QSUB_BARE_SWITCHES[key])
    [switch]
  elsif :reservation_id == key
    ['-W', "x=advres:#{value}"] # use resource manager extensions for Moab
  else
    # everything else is handed over as a generic -W attribute
    ['-W', "#{key}=#{value}"]
  end
end
435
+
436
# Submit a script using the `qsub` binary
# NB: The binary includes many useful filters and is preferred
# @param script [String] path to the script
# @param queue [String] queue to submit script to; skipped when empty
# @param headers [Hash] pbs headers, translated through #qsub_arg
# @param resources [Hash] pbs resources, each passed as `-l key=value`
# @param envvars [Hash] pbs environment variables, passed as one `-v` list
# @raise [Error] if `qsub` exited unsuccessfully (message is its stderr)
# @return [String] output of `qsub` without the trailing newline
def qsub_submit(script, queue, headers, resources, envvars)
  params = []
  params.push("-q", queue.to_s) unless queue.empty?
  params.concat(headers.flat_map { |k, v| qsub_arg(k, v) })
  params.concat(resources.flat_map { |k, v| ["-l", "#{k}=#{v}"] })
  params.push("-v", envvars.map { |k, v| "#{k}=#{v}" }.join(",")) unless envvars.empty?
  params << script

  # Join only the non-empty parts: an empty LD_LIBRARY_PATH entry (a leading
  # or trailing ":") makes the dynamic loader search the current working
  # directory, which happens when `lib` is "" or the variable is unset.
  search_path = [lib.to_s, ENV['LD_LIBRARY_PATH'].to_s].reject(&:empty?).join(':')
  env = {
    "PBS_DEFAULT" => host.to_s,
    # nil removes the variable from the child environment
    "LD_LIBRARY_PATH" => (search_path.empty? ? nil : search_path)
  }
  cmd = OodCore::Job::Adapters::Helper.bin_path('qsub', bin, bin_overrides)
  out, err, status = Open3.capture3(env, cmd, *params)
  raise Error, err unless status.success?
  out.chomp
end
455
+
456
# Call a forked PBS command for a given host
# @param cmd [#to_s] name of the PBS executable, resolved through
#   `OodCore::Job::Adapters::Helper.bin_path`
# @param args [Array<#to_s>] command line arguments
# @param env [#to_h] extra environment variables; `PBS_DEFAULT` and
#   `LD_LIBRARY_PATH` are always set by this method and win over them
# @param stdin [#to_s] data written to the command's stdin
# @param chdir [#to_s, nil] working directory, `"."` when not given
# @raise [Error] if the command exited unsuccessfully (message is its stderr)
# @return [String] stdout of the command
def call(cmd, *args, env: {}, stdin: "", chdir: nil)
  cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
  args = args.map(&:to_s)
  # Join only the non-empty parts: an empty LD_LIBRARY_PATH entry (a leading
  # or trailing ":") makes the dynamic loader search the current working
  # directory, which happens when `lib` is "" or the variable is unset.
  search_path = [lib.to_s, ENV["LD_LIBRARY_PATH"].to_s].reject(&:empty?).join(":")
  env = env.to_h.each_with_object({}) { |(k, v), h| h[k.to_s] = v.to_s }.merge({
    "PBS_DEFAULT" => host,
    # nil removes the variable from the child environment
    "LD_LIBRARY_PATH" => (search_path.empty? ? nil : search_path)
  })
  out, err, status = Open3.capture3(env, cmd, *args, stdin_data: stdin.to_s, chdir: (chdir || ".").to_s)
  status.success? ? out : raise(Error, err)
end
469
+ end
470
+ end