ood_core 0.20.2 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2c4e013f80e987d4d1cefbc78cc76bcff52e4083e0b84192b42807ae46806946
4
- data.tar.gz: c4a1607904baccc1b063916ecf8e5a9692a9c0102a0d8cda3a9edf0ae760191f
3
+ metadata.gz: 6d1d489149a451b24284191ba966ef7c5d85f859c939b050d50b6501fd49a4cb
4
+ data.tar.gz: 3d438089095a42b66f4edee0d3a6afe683e1d87ebb865d908120b977733c6169
5
5
  SHA512:
6
- metadata.gz: ab3333366fc7802d59a15dead3b21e863d0017385053eea629a109a076c6e768ed1575378a34e68bb6c163b050be87a9cf323f087d02e4e2be4d349550bf5531
7
- data.tar.gz: 234c13fbbc428717532bd93ba4e977cfd825e480c69daf66c737165ed7c5d8a951c329ced0312d525efc0b70cb4d11234c016c6216c2bb7f74573de854340889
6
+ metadata.gz: 4106f8af4babd7ae5cf59e133d42e5d1ecda3c1436727740f91f8b3e8a21112254ec8d35f724e2614cc18b65043944ada99adf6dfdeb0959f618d5c90e8178c0
7
+ data.tar.gz: a3caaaf21cc6ee4bd68fea96817f58b61592c93993813f69333e8acdce95df8b0d8b49b169000a50ddcb7bcd6031d151272882e5dc3cd02e09a0595ef1d31116
data/CHANGELOG.md CHANGED
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.21.0] - 08-01-2022
11
+
12
+ ### Added
13
+
14
+ - Added the `fujitsu_tcs` adapter in [766](https://github.com/OSC/ood_core/pull/766).
15
+
10
16
  ## [0.20.2] - 07-28-2022
11
17
 
12
18
  - Fixed an issue with Slurm's `cluster_info` in [762](https://github.com/OSC/ood_core/pull/762).
@@ -437,7 +443,8 @@ Functionally the same as [0.17.3] but with some CI updates.
437
443
  ### Added
438
444
  - Initial release!
439
445
 
440
- [Unreleased]: https://github.com/OSC/ood_core/compare/v0.20.2...HEAD
446
+ [Unreleased]: https://github.com/OSC/ood_core/compare/v0.21.0...HEAD
447
+ [0.21.0]: https://github.com/OSC/ood_core/compare/v0.20.2...v0.21.0
441
448
  [0.20.2]: https://github.com/OSC/ood_core/compare/v0.20.1...v0.20.2
442
449
  [0.20.1]: https://github.com/OSC/ood_core/compare/v0.20.0...v0.20.1
443
450
  [0.20.0]: https://github.com/OSC/ood_core/compare/v0.19.0...v0.20.0
@@ -0,0 +1,403 @@
1
require "open3"
require "pathname"
require "stringio"
require "time"
require "ood_core/refinements/hash_extensions"
require "ood_core/refinements/array_extensions"
require "ood_core/job/adapters/helper"
5
+
6
+ module OodCore
7
+ module Job
8
+ class Factory
9
+ using Refinements::HashExtensions
10
+
11
# Build the Fujitsu TCS (Technical Computing Suite) adapter from a configuration
# @param config [#to_h] the configuration for job adapter
# @option config [Object] :bin (nil) Path to Fujitsu TCS resource manager binaries
# @option config [#to_h] :bin_overrides ({}) Optional overrides to Fujitsu TCS resource manager executables
# @return [Adapters::Fujitsu_TCS] adapter wrapping a freshly built batch object
def self.build_fujitsu_tcs(config)
  settings = config.to_h.symbolize_keys
  batch = Adapters::Fujitsu_TCS::Batch.new(
    bin: settings.fetch(:bin, nil),
    bin_overrides: settings.fetch(:bin_overrides, {})
  )
  Adapters::Fujitsu_TCS.new(fujitsu_tcs: batch)
end
22
+ end
23
+
24
+ module Adapters
25
+ # An adapter object that describes the communication with a Fujitsu TCS
26
+ # resource manager for job management.
27
+ class Fujitsu_TCS < Adapter
28
+ using Refinements::HashExtensions
29
+ using Refinements::ArrayExtensions
30
+
31
+ # Object used for simplified communication with a Fujitsu TCS batch server
32
+ # @api private
33
+ class Batch
34
+ # The path to the Fujitsu TCS binaries
35
+ # @example
36
+ # my_batch.bin.to_s #=> "/usr/local/fujitsu_tcs/10.0.0/bin"
37
+ # @return [Pathname] path to Fujitsu TCS binaries
38
+ attr_reader :bin
39
+
40
+ # Optional overrides for Fujitsu TCS executables
41
+ # @example
42
+ # {'pjsub' => '/usr/local/bin/pjsub'}
43
+ # @return [Hash<String, String>]
44
+ attr_reader :bin_overrides
45
+
46
# The root exception class that all Fujitsu TCS specific exceptions inherit
# from
class Error < StandardError
end

# An error indicating the Fujitsu TCS command timed out
class Fujitsu_TCS_TimeoutError < Error
end
52
+
53
# Create the batch object
# @param bin [#to_s] path to Fujitsu TCS installation binaries
# @param bin_overrides [#to_h] a hash of bin overrides to be used in job
def initialize(bin: nil, bin_overrides: {})
  @bin_overrides = bin_overrides
  # A nil path becomes the empty Pathname
  @bin = Pathname.new(bin.to_s)
end
59
+
60
# Get a list of hashes detailing each of the jobs on the batch server
# @example Status info for all jobs
#   my_batch.get_jobs
#   #=>
#   #[
#   #  {
#   #    :JOB_ID => "123",
#   #    :JOB_NAME => "my_job",
#   #    ...
#   #  },
#   #  {
#   #    :JOB_ID => "125",
#   #    :JOB_NAME => "my_other_job",
#   #    ...
#   #  },
#   #  ...
#   #]
# @param id [#to_s] the id of the job; no id filter is applied when blank
# @param owner [#to_s] the owner(s) of the job; no owner filter is applied
#   when blank
# @raise [Error] if `pjstat` command exited unsuccessfully
# @return [Array<Hash>] list of details for jobs
def get_jobs(id: "", owner: nil)
  args = ["-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
  # `call` hands every entry to the command as its own argument, so the
  # option name and its value have to be two separate entries.
  args.concat ["--filter", "jid=#{id}"] unless id.to_s.empty?
  args.concat ["--filter", "usr=#{owner}"] unless owner.to_s.empty?

  # The first line of the output is the header and carries no job
  rows = call("pjstat", *args).each_line.drop(1)
  rows.each_with_object([]) do |line, jobs|
    l = line.split(",")
    # Field 0 is not used; a record needs fields 1..12. Blank or truncated
    # lines are skipped instead of failing on a missing field.
    next if l.size < 13

    jobs << {
      :JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split(" ")[0],
      :ST => l[4], :STD => l[5], :STDE => l[6],
      :ACCEPT => l[7], :START_DATE => l[8], :NODES => l[9].split(":")[0],
      :USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split(" ")[0]
    }
  end
rescue Fujitsu_TCS_TimeoutError
  # NOTE(review): nothing visible in this file raises this error — confirm
  # whether a timeout is still expected to surface here.
  [{ JOB_ID: id, ST: 'undetermined' }]
end
101
+
102
# Put a specified job on hold
# @example Put job "1234" on hold
#   my_batch.hold_job("1234")
# @param id [#to_s] the id of the job
# @raise [Error] if `pjhold` command exited unsuccessfully
# @return [void]
def hold_job(id)
  job_id = id.to_s
  call("pjhold", job_id)
end
111
+
112
# Release a specified job that is on hold
# @example Release job "1234" from on hold
#   my_batch.release_job("1234")
# @param id [#to_s] the id of the job
# @raise [Error] if `pjrls` command exited unsuccessfully
# @return [void]
def release_job(id)
  job_id = id.to_s
  call("pjrls", job_id)
end
121
+
122
# Delete a specified job from batch server
# @example Delete job "1234"
#   my_batch.delete_job("1234")
# @param id [#to_s] the id of the job
# @raise [Error] if `pjdel` command exited unsuccessfully
# @return [void]
def delete_job(id)
  job_id = id.to_s
  call("pjdel", job_id)
end
131
+
132
# Submit a script expanded as a string to the batch server
# @param str [#to_s] script as a string
# @param args [Array<#to_s>] arguments passed to `pjsub` command
# @raise [Error] if `pjsub` command exited unsuccessfully
# @return [String] the id of the job that was created
def submit_string(str, args: [])
  output = call("pjsub", *args.map(&:to_s), stdin: str.to_s)
  # The job id is taken as the sixth whitespace-separated word of the output
  output.split(" ")[5]
end
141
+
142
+ private
143
# Call a forked Fujitsu TCS command
# @param cmd [#to_s] name of the command; the executable is resolved by
#   Helper.bin_path from `bin` and `bin_overrides`
# @param args [Array<#to_s>] arguments, each passed as its own entry
# @param stdin [#to_s] data written to the command's standard input
# @raise [Error] carrying the command's standard error if it exits unsuccessfully
# @return [String] the command's standard output
def call(cmd, *args, stdin: "")
  executable = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
  stdout, stderr, status = Open3.capture3(executable, *args.map(&:to_s), stdin_data: stdin.to_s)
  raise(Error, stderr) unless status.success?

  stdout
end
150
+ end
151
+
152
# Mapping of state codes for Fujitsu TCS resource manager.
# Frozen so the shared lookup table cannot be modified at runtime.
STATE_MAP = {
  'ACC' => :queued,    # Accepted job submission
  'RJT' => :completed, # Rejected job submission
  'QUE' => :queued,    # Waiting for job execution
  'RNA' => :queued,    # Acquiring resources required for job execution
  'RNP' => :running,   # Executing prologue
  'RUN' => :running,   # Executing job
  'RNE' => :running,   # Executing epilogue
  'RNO' => :running,   # Waiting for completion of job termination processing
  'SPP' => :suspended, # Suspend in progress
  'SPD' => :suspended, # Suspended
  'RSM' => :running,   # Resume in progress
  'EXT' => :completed, # Exited job end execution
  'CCL' => :completed, # Exited job execution by interruption
  'HLD' => :suspended, # In fixed state due to users
  'ERR' => :completed, # In fixed state due to an error
}.freeze
170
+
171
# @api private
# @param opts [#to_h] the options defining this adapter
# @option opts [Batch] :fujitsu_tcs The Fujitsu TCS batch object
# @raise [ArgumentError] if no `:fujitsu_tcs` option is given
# @see Factory.build_fujitsu_tcs
def initialize(opts = {})
  options = opts.to_h.symbolize_keys
  @fujitsu_tcs = options.fetch(:fujitsu_tcs) do
    raise ArgumentError, "No Fujitsu TCS object specified. Missing argument: fujitsu_tcs"
  end
end
180
+
181
# Submit a job with the attributes defined in the job template instance
#
# Job dependencies are not supported by this adapter: passing any of
# `after`, `afterok`, `afternotok` or `afterany` raises JobAdapterError.
# @param script [Script] script object that describes the script and
#   attributes for the submitted job
# @param after [#to_s, Array<#to_s>] this job may be scheduled for
#   execution at any point after dependent jobs have started execution
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with no errors
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with errors
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
#   execution after dependent jobs have terminated
# @raise [JobAdapterError] if something goes wrong submitting a job
# @return [String] the job id returned after successfully submitting a
#   job
# @see Adapter#submit
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  # A dependency may be a single id or a list. Anything answering #empty?
  # is judged by it; any other non-nil value (e.g. an Integer id) counts
  # as a dependency instead of failing on a missing #empty?.
  given = ->(ids) { ids.respond_to?(:empty?) ? !ids.empty? : !ids.nil? }
  if [after, afterok, afternotok, afterany].any?(&given)
    raise JobAdapterError, "Dependency between jobs has not implemented yet."
  end

  # Set pjsub options
  args = []
  args << (script.rerunnable ? "--restart" : "--norestart") unless script.rerunnable.nil?
  args.concat ["--mail-list", script.email.join(",")] unless script.email.nil?

  # "b" is added for email_on_started, "e" for email_on_terminated
  mail_events = []
  mail_events << "b" if script.email_on_started
  mail_events << "e" if script.email_on_terminated
  args.concat ["-m", mail_events.join(",")] unless mail_events.empty?

  args.concat ["-N", script.job_name] unless script.job_name.nil?
  args.concat ["-o", script.output_path] unless script.output_path.nil?
  if script.error_path.nil?
    # No error path: -j is passed instead of -e
    # (presumably merges stderr into stdout — confirm against the pjsub manual)
    args.concat ["-j"]
  else
    args.concat ["-e", script.error_path]
  end
  args.concat ["--rscgrp", script.queue_name] unless script.queue_name.nil?
  args.concat ["-p", script.priority] unless script.priority.nil?
  args.concat ["--at", script.start_time.localtime.strftime("%C%y-%m-%dT%H:%M:%S")] unless script.start_time.nil?
  # Option and value are separate entries and carry no quote characters:
  # the arguments are forwarded as a list, so quotes would be passed literally.
  args.concat ["-L", "elapse=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
  args.concat ["--bulk", "--sparam", script.job_array_request] unless script.job_array_request.nil?

  # Set environment variables
  envvars = script.job_environment.to_h
  args.concat ["-x", envvars.map { |k, v| "#{k}=#{v}" }.join(",")] unless envvars.empty?
  args.concat ["-X"] if script.copy_environment?

  # Set native options
  args.concat script.native if script.native

  # Set content, prefixed with a shebang line when a shell path is given
  content = if script.shell_path.nil?
              script.content
            else
              "#!#{script.shell_path}\n#{script.content}"
            end

  # Submit job
  @fujitsu_tcs.submit_string(content, args: args)
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
250
+
251
# Retrieve info for all jobs from the resource manager
# @param attrs [Object] accepted but not used by this adapter
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
# @see Adapter#info_all
def info_all(attrs: nil)
  @fujitsu_tcs.get_jobs.map { |job| parse_job_info(job) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
262
+
263
# Retrieve job info from the resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Info] information describing submitted job
# @see Adapter#info
def info(id)
  id = id.to_s
  found = @fujitsu_tcs.get_jobs(id: id).map { |job| parse_job_info(job) }

  # If no job was found we assume that it has completed
  return Info.new(id: id, status: :completed) if found.empty?

  # Only the first entry is used; the query was filtered by this id
  found.first
rescue Batch::Error => e
  # Anything other than "job does not exist" is a genuine failure
  raise JobAdapterError, e.message unless /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  # set completed status if can't find job id
  Info.new(id: id, status: :completed)
end
287
+
288
# Retrieve info for all jobs for a given owner or owners from the
# resource manager
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
# @param attrs [Object] accepted but not used by this adapter
# @raise [JobAdapterError] if something goes wrong getting job info
# @return [Array<Info>] information describing submitted jobs
def info_where_owner(owner, attrs: nil)
  # Several owners are handed to the batch object as one "+"-joined string
  owners = Array.wrap(owner).map(&:to_s).join('+')
  @fujitsu_tcs.get_jobs(owner: owners).map { |job| parse_job_info(job) }
rescue Batch::Error => e
  raise JobAdapterError, e.message
end
301
+
302
# Retrieve job status from resource manager
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong getting job status
# @return [Status] status of job
# @see Adapter#status
def status(id)
  id = id.to_s
  job = @fujitsu_tcs.get_jobs(id: id).detect { |j| j[:JOB_ID] == id }

  # set completed status if can't find job id
  state = job ? get_state(job[:ST]) : :completed
  Status.new(state: state)
rescue Batch::Error => e
  # Anything other than "job does not exist" is a genuine failure
  raise JobAdapterError, e.message unless /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  # set completed status if can't find job id
  Status.new(state: :completed)
end
325
+
326
# Put the submitted job on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong holding a job
# @return [void]
# @see Adapter#hold
def hold(id)
  @fujitsu_tcs.hold_job(id.to_s)
rescue Batch::Error => e
  # assume successful job hold if can't find job id
  return if /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  raise JobAdapterError, e.message
end
337
+
338
# Release the job that is on hold
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong releasing a job
# @return [void]
# @see Adapter#release
def release(id)
  @fujitsu_tcs.release_job(id.to_s)
rescue Batch::Error => e
  # assume successful job release if can't find job id
  return if /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  raise JobAdapterError, e.message
end
349
+
350
# Delete the submitted job
# @param id [#to_s] the id of the job
# @raise [JobAdapterError] if something goes wrong deleting a job
# @return [void]
# @see Adapter#delete
def delete(id)
  @fujitsu_tcs.delete_job(id.to_s)
rescue Batch::Error => e
  # assume successful job deletion if can't find job id
  return if /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message

  raise JobAdapterError, e.message
end
361
+
362
# The directive prefix reported by this adapter
# @return [String] always '#PJM'
def directive_prefix
  '#PJM'
end
365
+
366
+ private
367
# Convert duration to seconds
#
# Accepts "[days-]h:m:s" style strings: every ":"-separated part is folded
# in base 60 and an optional "days-" prefix adds whole days.
# @param time [#to_s, nil] the duration text
# @return [Integer] the duration in seconds; 0 when `time` is nil or has
#   no clock part (e.g. "" or "-")
def duration_in_seconds(time)
  return 0 if time.nil?

  clock, days = time.to_s.split("-").reverse
  # "" and "-" split into an empty list, leaving no clock part to parse
  return 0 if clock.nil?

  days.to_i * 24 * 3600 +
    clock.split(":").inject(0) { |total, part| total * 60 + part.to_i }
end
374
+
375
# Convert seconds to duration
# @param time [Integer] number of seconds
# @return [String] the duration formatted as "HH:MM:SS" (hours are not capped)
def seconds_to_duration(time)
  hours, remainder = time.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
379
+
380
# Determine state from Fujitsu TCS state code
# @param st [String] state code looked up in STATE_MAP
# @return [Symbol] the mapped state, or :undetermined for an unknown code
def get_state(st)
  STATE_MAP.fetch(st) { :undetermined }
end
384
+
385
# Parse hash describing Fujitsu TCS job status
# @param v [Hash] one job record as produced by Batch#get_jobs
# @return [Info] the job information built from the record
def parse_job_info(v)
  elapsed = duration_in_seconds(v[:ELAPSE_TIM])
  limit = duration_in_seconds(v[:ELAPSE_LIM])

  # NOTE(review): START_DATE and ACCEPT are forwarded exactly as stored in
  # the record — confirm Info accepts that representation.
  Info.new(
    id: v[:JOB_ID],
    job_name: v[:JOB_NAME],
    status: get_state(v[:ST]),
    job_owner: v[:USER],
    dispatch_time: v[:START_DATE],
    wallclock_time: elapsed,
    wallclock_limit: limit,
    submission_time: v[:ACCEPT],
    queue_name: v[:RSC_GRP],
    native: v
  )
end
400
+ end
401
+ end
402
+ end
403
+ end
@@ -1,4 +1,4 @@
1
1
  module OodCore
2
2
  # The current version of {OodCore}
3
- VERSION = "0.20.2"
3
+ VERSION = "0.21.0"
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ood_core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.20.2
4
+ version: 0.21.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2022-07-28 00:00:00.000000000 Z
13
+ date: 2022-08-01 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ood_support
@@ -179,6 +179,7 @@ files:
179
179
  - lib/ood_core/job/adapter.rb
180
180
  - lib/ood_core/job/adapters/ccq.rb
181
181
  - lib/ood_core/job/adapters/drmaa.rb
182
+ - lib/ood_core/job/adapters/fujitsu_tcs.rb
182
183
  - lib/ood_core/job/adapters/helper.rb
183
184
  - lib/ood_core/job/adapters/kubernetes.rb
184
185
  - lib/ood_core/job/adapters/kubernetes/batch.rb