ood_core 0.20.2 → 0.21.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2c4e013f80e987d4d1cefbc78cc76bcff52e4083e0b84192b42807ae46806946
4
- data.tar.gz: c4a1607904baccc1b063916ecf8e5a9692a9c0102a0d8cda3a9edf0ae760191f
3
+ metadata.gz: 6d1d489149a451b24284191ba966ef7c5d85f859c939b050d50b6501fd49a4cb
4
+ data.tar.gz: 3d438089095a42b66f4edee0d3a6afe683e1d87ebb865d908120b977733c6169
5
5
  SHA512:
6
- metadata.gz: ab3333366fc7802d59a15dead3b21e863d0017385053eea629a109a076c6e768ed1575378a34e68bb6c163b050be87a9cf323f087d02e4e2be4d349550bf5531
7
- data.tar.gz: 234c13fbbc428717532bd93ba4e977cfd825e480c69daf66c737165ed7c5d8a951c329ced0312d525efc0b70cb4d11234c016c6216c2bb7f74573de854340889
6
+ metadata.gz: 4106f8af4babd7ae5cf59e133d42e5d1ecda3c1436727740f91f8b3e8a21112254ec8d35f724e2614cc18b65043944ada99adf6dfdeb0959f618d5c90e8178c0
7
+ data.tar.gz: a3caaaf21cc6ee4bd68fea96817f58b61592c93993813f69333e8acdce95df8b0d8b49b169000a50ddcb7bcd6031d151272882e5dc3cd02e09a0595ef1d31116
data/CHANGELOG.md CHANGED
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.21.0] - 08-01-2022
11
+
12
+ ### Added
13
+
14
+ - Added the `fujitsu_tcs` adapter in [766](https://github.com/OSC/ood_core/pull/766).
15
+
10
16
  ## [0.20.2] - 07-28-2022
11
17
 
12
18
  - Fixed an issue with Slurm's `cluster_info` in [762](https://github.com/OSC/ood_core/pull/762).
@@ -437,7 +443,8 @@ Functionally the same as [0.17.3] but with some CI updates.
437
443
  ### Added
438
444
  - Initial release!
439
445
 
440
- [Unreleased]: https://github.com/OSC/ood_core/compare/v0.20.2...HEAD
446
+ [Unreleased]: https://github.com/OSC/ood_core/compare/v0.21.0...HEAD
447
+ [0.21.0]: https://github.com/OSC/ood_core/compare/v0.20.2...v0.21.0
441
448
  [0.20.2]: https://github.com/OSC/ood_core/compare/v0.20.1...v0.20.2
442
449
  [0.20.1]: https://github.com/OSC/ood_core/compare/v0.20.0...v0.20.1
443
450
  [0.20.0]: https://github.com/OSC/ood_core/compare/v0.19.0...v0.20.0
@@ -0,0 +1,403 @@
1
+ require "time"
2
+ require "ood_core/refinements/hash_extensions"
3
+ require "ood_core/refinements/array_extensions"
4
+ require "ood_core/job/adapters/helper"
5
+
6
+ module OodCore
7
+ module Job
8
+ class Factory
9
+ using Refinements::HashExtensions
10
+
11
+ # Build the Fujitsu TCS (Technical Computing Suite) adapter from a configuration
12
+ # @param config [#to_h] the configuration for job adapter
13
+ # @option config [Object] :bin (nil) Path to Fujitsu TCS resource manager binaries
14
+ # @option config [#to_h] :bin_overrides ({}) Optional overrides to Fujitsu TCS resource manager executables
15
def self.build_fujitsu_tcs(config)
  options = config.to_h.symbolize_keys

  # Both settings are optional: :bin falls back to nil and :bin_overrides to {}.
  batch = Adapters::Fujitsu_TCS::Batch.new(
    bin: options.fetch(:bin, nil),
    bin_overrides: options.fetch(:bin_overrides, {})
  )

  Adapters::Fujitsu_TCS.new(fujitsu_tcs: batch)
end
22
+ end
23
+
24
+ module Adapters
25
+ # An adapter object that describes the communication with a Fujitsu TCS
26
+ # resource manager for job management.
27
+ class Fujitsu_TCS < Adapter
28
+ using Refinements::HashExtensions
29
+ using Refinements::ArrayExtensions
30
+
31
+ # Object used for simplified communication with a Fujitsu TCS batch server
32
+ # @api private
33
+ class Batch
34
+ # The path to the Fujitsu TCS binaries
35
+ # @example
36
+ # my_batch.bin.to_s #=> "/usr/local/fujitsu_tcs/10.0.0/bin"
37
+ # @return [Pathname] path to Fujitsu TCS binaries
38
+ attr_reader :bin
39
+
40
+ # Optional overrides for Fujitsu TCS executables
41
+ # @example
42
+ # {'pjsub' => '/usr/local/bin/pjsub'}
43
+ # @return Hash<String, String>
44
+ attr_reader :bin_overrides
45
+
46
+ # The root exception class that all Fujitsu TCS specific exceptions inherit
47
+ # from
48
+ class Error < StandardError; end
49
+
50
+ # An error indicating the Fujitsu TCS command timed out
51
+ class Fujitsu_TCS_TimeoutError < Error; end
52
+
53
+ # @param bin [#to_s] path to Fujitsu TCS installation binaries
54
+ # @param bin_overrides [#to_h] a hash of bin overrides to be used in job
55
def initialize(bin: nil, bin_overrides: {})
  # A nil bin stringifies to "", which yields an empty Pathname.
  bin_dir = bin.to_s
  @bin_overrides = bin_overrides
  @bin = Pathname.new(bin_dir)
end
59
+
60
+ # Get a list of hashes detailing each of the jobs on the batch server
61
+ # @example Status info for all jobs
62
+ # my_batch.get_jobs
63
+ # #=>
64
+ # #[
65
+ # # {
66
+ # # :JOB_ID => "123",
67
+ # # :JOB_NAME => "my_job",
68
+ # # ...
69
+ # # },
70
+ # # {
71
+ # # :JOB_ID => "125",
72
+ # # :JOB_NAME => "my_other_job",
73
+ # # ...
74
+ # # },
75
+ # # ...
76
+ # #]
77
+ # @param id [#to_s] the id of the job
78
+ # @param owner [String] the owner(s) of the job
79
+ # @raise [Error] if `pjstat` command exited unsuccessfully
80
+ # @return [Array<Hash>] list of details for jobs
81
def get_jobs(id: "", owner: nil)
  args = ["-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
  # `call` spawns the command from an argv list (no shell), so an option and
  # its value must be two separate entries rather than one "--filter x=y" string.
  args.concat ["--filter", "jid=#{id}"] unless id.to_s.empty?
  args.concat ["--filter", "usr=#{owner}"] unless owner.to_s.empty?

  records = call("pjstat", *args).lines
  records.shift # Skip header

  records.each_with_object([]) do |line, jobs|
    # limit -1 keeps trailing empty fields so column positions stay stable
    l = line.chomp.split(",", -1)
    # Columns 1..12 are read below (column 0 is unused); a blank or truncated
    # record cannot be parsed, so it is skipped instead of raising NoMethodError.
    next if l.length < 13

    jobs << {
      :JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split(" ")[0],
      :ST => l[4], :STD => l[5], :STDE => l[6],
      :ACCEPT => l[7], :START_DATE => l[8], :NODES => l[9].split(":")[0],
      :USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split(" ")[0]
    }
  end
rescue Fujitsu_TCS_TimeoutError
  # NOTE(review): nothing visible in this class raises Fujitsu_TCS_TimeoutError;
  # the fallback is kept so callers still get a placeholder record if it ever is.
  [{ JOB_ID: id, ST: 'undetermined' }]
end
101
+
102
+ # Put a specified job on hold
103
+ # @example Put job "1234" on hold
104
+ # my_batch.hold_job("1234")
105
+ # @param id [#to_s] the id of the job
106
+ # @raise [Error] if `pjhold` command exited unsuccessfully
107
+ # @return [void]
108
def hold_job(id)
  job_id = id.to_s
  call("pjhold", job_id)
end
111
+
112
+ # Release a specified job that is on hold
113
+ # @example Release job "1234" from on hold
114
+ # my_batch.release_job("1234")
115
+ # @param id [#to_s] the id of the job
116
+ # @raise [Error] if `pjrls` command exited unsuccessfully
117
+ # @return [void]
118
def release_job(id)
  job_id = id.to_s
  call("pjrls", job_id)
end
121
+
122
+ # Delete a specified job from batch server
123
+ # @example Delete job "1234"
124
+ # my_batch.delete_job("1234")
125
+ # @param id [#to_s] the id of the job
126
+ # @raise [Error] if `pjdel` command exited unsuccessfully
127
+ # @return [void]
128
def delete_job(id)
  job_id = id.to_s
  call("pjdel", job_id)
end
131
+
132
+ # Submit a script expanded as a string to the batch server
133
+ # @param str [#to_s] script as a string
134
+ # @param args [Array<#to_s>] arguments passed to `pjsub` command
135
+ # @raise [Error] if `pjsub` command exited unsuccessfully
136
+ # @return [String] the id of the job that was created
137
def submit_string(str, args: [])
  argv = args.map(&:to_s)
  output = call("pjsub", *argv, stdin: str.to_s)
  # The job id is read as the sixth whitespace-separated token of the output.
  tokens = output.split(" ")
  tokens[5]
end
141
+
142
+ private
143
+ # Call a forked Fujitsu TCS command
144
def call(cmd, *args, stdin: "")
  # Resolve the executable through the configured bin directory / overrides.
  executable = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
  argv = args.map(&:to_s)

  stdout, stderr, status = Open3.capture3(executable, *argv, stdin_data: stdin.to_s)
  # A non-zero exit is reported with the command's stderr as the message.
  raise(Error, stderr) unless status.success?

  stdout
end
150
+ end
151
+
152
+ # Mapping of state codes for Fujitsu TCS resource manager
153
# Lookup table from Fujitsu TCS job state codes to adapter state symbols.
# Frozen so the shared constant cannot be modified at runtime.
STATE_MAP = {
  'ACC' => :queued,    # Accepted job submission
  'RJT' => :completed, # Rejected job submission
  'QUE' => :queued,    # Waiting for job execution
  'RNA' => :queued,    # Acquiring resources required for job execution
  'RNP' => :running,   # Executing prologue
  'RUN' => :running,   # Executing job
  'RNE' => :running,   # Executing epilogue
  'RNO' => :running,   # Waiting for completion of job termination processing
  'SPP' => :suspended, # Suspend in progress
  'SPD' => :suspended, # Suspended
  'RSM' => :running,   # Resume in progress
  'EXT' => :completed, # Exited job end execution
  'CCL' => :completed, # Exited job execution by interruption
  'HLD' => :suspended, # In fixed state due to users
  'ERR' => :completed, # In fixed state due to an error
}.freeze
170
+
171
+ # @api private
172
+ # @param opts [#to_h] the options defining this adapter
173
+ # @option opts [Batch] :fujitsu_tcs The Fujitsu TCS batch object
174
+ # @see Factory.build_fujitsu_tcs
175
def initialize(opts = {})
  options = opts.to_h.symbolize_keys

  # The batch object is mandatory; its absence is reported as a caller error.
  unless options.key?(:fujitsu_tcs)
    raise ArgumentError, "No Fujitsu TCS object specified. Missing argument: fujitsu_tcs"
  end

  @fujitsu_tcs = options[:fujitsu_tcs]
end
180
+
181
+ # Submit a job with the attributes defined in the job template instance
182
+ # @param script [Script] script object that describes the script and
183
+ # attributes for the submitted job
184
+ # @param after [#to_s, Array<#to_s>] this job may be scheduled for
185
+ # execution at any point after dependent jobs have started execution
186
+ # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
187
+ # execution only after dependent jobs have terminated with no errors
188
+ # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
189
+ # execution only after dependent jobs have terminated with errors
190
+ # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
191
+ # execution after dependent jobs have terminated
192
+ # @raise [JobAdapterError] if something goes wrong submitting a job
193
+ # @return [String] the job id returned after successfully submitting a
194
+ # job
195
+ # @see Adapter#submit
196
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  # Job dependencies are not supported by this adapter. Each argument may be
  # an array, a single id or nil, so `empty?` is only asked of values that
  # respond to it; any other non-nil value counts as a requested dependency.
  dependency_requested = [after, afterok, afternotok, afterany].any? do |dep|
    dep.respond_to?(:empty?) ? !dep.empty? : !dep.nil?
  end
  raise JobAdapterError, "Dependency between jobs has not implemented yet." if dependency_requested

  # Set pjsub options
  args = []
  args << (script.rerunnable ? "--restart" : "--norestart") unless script.rerunnable.nil?
  args.concat ["--mail-list", script.email.join(",")] unless script.email.nil?

  # email_on_started maps to "b", email_on_terminated to "e"; both give "b,e"
  mail_events = []
  mail_events << "b" if script.email_on_started
  mail_events << "e" if script.email_on_terminated
  args.concat ["-m", mail_events.join(",")] unless mail_events.empty?

  args.concat ["-N", script.job_name] unless script.job_name.nil?
  args.concat ["-o", script.output_path] unless script.output_path.nil?
  if script.error_path.nil?
    # No separate error path was requested, so pass -j instead of -e
    args.concat ["-j"]
  else
    args.concat ["-e", script.error_path]
  end
  args.concat ["--rscgrp", script.queue_name] unless script.queue_name.nil?
  args.concat ["-p", script.priority] unless script.priority.nil?
  args.concat ["--at", script.start_time.localtime.strftime("%C%y-%m-%dT%H:%M:%S")] unless script.start_time.nil?
  # Option and value are separate argv entries: the batch object spawns pjsub
  # without a shell, so quotes embedded in a single string would be passed literally.
  args.concat ["-L", "elapse=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
  args.concat ["--bulk", "--sparam", script.job_array_request] unless script.job_array_request.nil?

  # Set environment variables
  envvars = script.job_environment.to_h
  args.concat ["-x", envvars.map { |k, v| "#{k}=#{v}" }.join(",")] unless envvars.empty?
  args.concat ["-X"] if script.copy_environment?

  # Set native options
  args.concat script.native if script.native

  # Set content (prepend a shebang line when a shell path is given)
  content = if script.shell_path.nil?
              script.content
            else
              "#!#{script.shell_path}\n#{script.content}"
            end

  # Submit job
  @fujitsu_tcs.submit_string(content, args: args)
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
250
+
251
+ # Retrieve info for all jobs from the resource manager
252
+ # @raise [JobAdapterError] if something goes wrong getting job info
253
+ # @return [Array<Info>] information describing submitted jobs
254
+ # @see Adapter#info_all
255
def info_all(attrs: nil)
  # attrs is accepted for interface compatibility but is not consulted here.
  records = @fujitsu_tcs.get_jobs
  records.map { |record| parse_job_info(record) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
262
+
263
+ # Retrieve job info from the resource manager
264
+ # @param id [#to_s] the id of the job
265
+ # @raise [JobAdapterError] if something goes wrong getting job info
266
+ # @return [Info] information describing submitted job
267
+ # @see Adapter#info
268
def info(id)
  id = id.to_s
  parsed = @fujitsu_tcs.get_jobs(id: id).map { |record| parse_job_info(record) }

  # If no job was found we assume that it has completed
  return Info.new(id: id, status: :completed) if parsed.empty?

  # A lookup by id is expected to produce a single record; the first one is used.
  parsed.first
rescue Batch::Error => e
  # Any failure other than "job does not exist" is reported to the caller
  raise JobAdapterError, e.message unless /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  # set completed status if can't find job id
  Info.new(id: id, status: :completed)
end
287
+
288
+ # Retrieve info for all jobs for a given owner or owners from the
289
+ # resource manager
290
+ # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
291
+ # @raise [JobAdapterError] if something goes wrong getting job info
292
+ # @return [Array<Info>] information describing submitted jobs
293
def info_where_owner(owner, attrs: nil)
  # One or many owners are handed to the batch object as a single "+"-joined string.
  owners = Array.wrap(owner).map(&:to_s)
  records = @fujitsu_tcs.get_jobs(owner: owners.join('+'))
  records.map { |record| parse_job_info(record) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
301
+
302
+ # Retrieve job status from resource manager
303
+ # @param id [#to_s] the id of the job
304
+ # @raise [JobAdapterError] if something goes wrong getting job status
305
+ # @return [Status] status of job
306
+ # @see Adapter#status
307
def status(id)
  id = id.to_s
  match = @fujitsu_tcs.get_jobs(id: id).find { |record| record[:JOB_ID] == id }

  # set completed status if can't find job id
  state = match ? get_state(match[:ST]) : :completed
  Status.new(state: state)
rescue Batch::Error => e
  # Any failure other than "job does not exist" is reported to the caller
  raise JobAdapterError, e.message unless /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  # set completed status if can't find job id
  Status.new(state: :completed)
end
325
+
326
+ # Put the submitted job on hold
327
+ # @param id [#to_s] the id of the job
328
+ # @raise [JobAdapterError] if something goes wrong holding a job
329
+ # @return [void]
330
+ # @see Adapter#hold
331
def hold(id)
  job_id = id.to_s
  @fujitsu_tcs.hold_job(job_id)
rescue Batch::Error => e
  # assume successful job hold if can't find job id
  return if /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  raise JobAdapterError, e.message
end
337
+
338
+ # Release the job that is on hold
339
+ # @param id [#to_s] the id of the job
340
+ # @raise [JobAdapterError] if something goes wrong releasing a job
341
+ # @return [void]
342
+ # @see Adapter#release
343
def release(id)
  job_id = id.to_s
  @fujitsu_tcs.release_job(job_id)
rescue Batch::Error => e
  # assume successful job release if can't find job id
  return if /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  raise JobAdapterError, e.message
end
349
+
350
+ # Delete the submitted job
351
+ # @param id [#to_s] the id of the job
352
+ # @raise [JobAdapterError] if something goes wrong deleting a job
353
+ # @return [void]
354
+ # @see Adapter#delete
355
def delete(id)
  job_id = id.to_s
  @fujitsu_tcs.delete_job(job_id)
rescue Batch::Error => e
  # assume successful job deletion if can't find job id
  return if /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  raise JobAdapterError, e.message
end
361
+
362
# Directive prefix reported by this adapter: always the literal "#PJM".
def directive_prefix
  "#PJM"
end
365
+
366
+ private
367
+ # Convert duration to seconds
368
# Convert a duration string into a number of seconds.
#
# The text may carry an optional "DAYS-" prefix; the rest is split on ":" and
# folded left to right in base 60, so "HH:MM:SS", "MM:SS" and "SS" all work.
# nil, blank text and a bare "-" yield 0 instead of raising.
#
# @param time [#to_s, nil] duration text such as "0001:30:00" or "2-01:00:00"
# @return [Integer] the duration in seconds
def duration_in_seconds(time)
  text = time.to_s.strip
  return 0 if text.empty?

  # Reversing puts the clock part first, so `days` is nil when there is no prefix
  clock, days = text.split("-").reverse
  days.to_i * 24 * 3600 +
    clock.to_s.split(":").map(&:to_i).inject(0) { |total, v| total * 60 + v }
end
374
+
375
+ # Convert seconds to duration
376
def seconds_to_duration(time)
  # Hours are not wrapped at 24; minutes and seconds are reduced modulo 60.
  hours   = time / 3600
  minutes = time / 60 % 60
  seconds = time % 60
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
379
+
380
+ # Determine state from Fujitsu TCS state code
381
def get_state(st)
  # Codes missing from STATE_MAP are reported as :undetermined
  STATE_MAP.fetch(st) { :undetermined }
end
384
+
385
+ # Parse hash describing Fujitsu TCS job status
386
def parse_job_info(v)
  # NOTE(review): START_DATE and ACCEPT are forwarded exactly as read from the
  # job record; confirm that Info accepts that representation for its time fields.
  attributes = {
    id: v[:JOB_ID],
    job_name: v[:JOB_NAME],
    status: get_state(v[:ST]),
    job_owner: v[:USER],
    dispatch_time: v[:START_DATE],
    wallclock_time: duration_in_seconds(v[:ELAPSE_TIM]),
    wallclock_limit: duration_in_seconds(v[:ELAPSE_LIM]),
    submission_time: v[:ACCEPT],
    queue_name: v[:RSC_GRP],
    # The untouched record is kept available under :native
    native: v
  }
  Info.new(**attributes)
end
400
+ end
401
+ end
402
+ end
403
+ end
@@ -1,4 +1,4 @@
1
1
  module OodCore
2
2
  # The current version of {OodCore}
3
- VERSION = "0.20.2"
3
+ VERSION = "0.21.0"
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ood_core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.20.2
4
+ version: 0.21.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2022-07-28 00:00:00.000000000 Z
13
+ date: 2022-08-01 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ood_support
@@ -179,6 +179,7 @@ files:
179
179
  - lib/ood_core/job/adapter.rb
180
180
  - lib/ood_core/job/adapters/ccq.rb
181
181
  - lib/ood_core/job/adapters/drmaa.rb
182
+ - lib/ood_core/job/adapters/fujitsu_tcs.rb
182
183
  - lib/ood_core/job/adapters/helper.rb
183
184
  - lib/ood_core/job/adapters/kubernetes.rb
184
185
  - lib/ood_core/job/adapters/kubernetes/batch.rb