ood_core 0.20.2 → 0.22.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 2c4e013f80e987d4d1cefbc78cc76bcff52e4083e0b84192b42807ae46806946
- data.tar.gz: c4a1607904baccc1b063916ecf8e5a9692a9c0102a0d8cda3a9edf0ae760191f
+ metadata.gz: b8700bb802df78b66c3bb58ec5464a7f5e54a4dc3ebdc978f9816b8b5e5f7373
+ data.tar.gz: 1e2b4ca05369c8072afe8d1069679a8f2744cdcb3ed766924a778e78af65afa2
  SHA512:
- metadata.gz: ab3333366fc7802d59a15dead3b21e863d0017385053eea629a109a076c6e768ed1575378a34e68bb6c163b050be87a9cf323f087d02e4e2be4d349550bf5531
- data.tar.gz: 234c13fbbc428717532bd93ba4e977cfd825e480c69daf66c737165ed7c5d8a951c329ced0312d525efc0b70cb4d11234c016c6216c2bb7f74573de854340889
+ metadata.gz: a40fc234d2be728b697b9b68884b2286ac68e63ed36eeaa4a48487af99c262fdc1f9b3010554f9e369555734c6d2c0693ca54bc2308c5817f424ec4032759563
+ data.tar.gz: 5215f5c924002bfdc40576560898cae49b9a9750ff8d11c87d1e43cd43d0e86327a511ee97eea765a20f8ce6d6903acfaece7b26ad8e7733fd1d3f985b8a97e2
data/.gitignore CHANGED
@@ -50,4 +50,8 @@ Gemfile.lock
  .rvmrc
 
  # SSHFS temp files
- ._*
+ ._*
+
+ # docs are only held in the gh-pages branch
+ /docs/*
+ !/docs/.keep
data/CHANGELOG.md CHANGED
@@ -7,6 +7,24 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
  ## [Unreleased]
 
+ ## [0.22.0] - 10-31-2022
+
+ ### Added
+
+ - Added the `vnc_container` batch connect template in [774](https://github.com/OSC/ood_core/pull/774).
+ - https://osc.github.io/ood_core is now updated on every commit to master in [765](https://github.com/OSC/ood_core/pull/765).
+
+ ### Fixed
+
+ - Kubernetes can now read multiple secrets in [778](https://github.com/OSC/ood_core/pull/778).
+ - PBSPro correctly reads usernames with periods in them in [780](https://github.com/OSC/ood_core/pull/780).
+
+ ## [0.21.0] - 08-01-2022
+
+ ### Added
+
+ - Added the `fujitsu_tcs` adapter in [766](https://github.com/OSC/ood_core/pull/766).
+
  ## [0.20.2] - 07-28-2022
 
  - Fixed an issue with Slurm's `cluster_info` in [762](https://github.com/OSC/ood_core/pull/762).
@@ -437,7 +455,9 @@ Functionally the same as [0.17.3] but with some CI updates.
  ### Added
  - Initial release!
 
- [Unreleased]: https://github.com/OSC/ood_core/compare/v0.20.2...HEAD
+ [Unreleased]: https://github.com/OSC/ood_core/compare/v0.22.0...HEAD
+ [0.22.0]: https://github.com/OSC/ood_core/compare/v0.21.0...v0.22.0
+ [0.21.0]: https://github.com/OSC/ood_core/compare/v0.20.2...v0.21.0
  [0.20.2]: https://github.com/OSC/ood_core/compare/v0.20.1...v0.20.2
  [0.20.1]: https://github.com/OSC/ood_core/compare/v0.20.0...v0.20.1
  [0.20.0]: https://github.com/OSC/ood_core/compare/v0.19.0...v0.20.0
data/docs/.keep ADDED
File without changes
data/lib/ood_core/batch_connect/templates/vnc_container.rb ADDED
@@ -0,0 +1,252 @@
+ require "ood_core/refinements/hash_extensions"
+ require "securerandom"
+
+ module OodCore
+ module BatchConnect
+ class Factory
+ using Refinements::HashExtensions
+
+ # Build the VNC template from a configuration
+ # @param config [#to_h] the configuration for the batch connect template
+ def self.build_vnc_container(config)
+ context = config.to_h.symbolize_keys.reject { |k, _| k == :template }
+
+ unless context.key?(:container_path)
+ raise JobAdapterError, "You are missing the configuration 'container_path' for a vnc_container template."
+ end
+
+ Templates::VNC_Container.new(context)
+ end
+ end
+
+ module Templates
+ # A batch connect template that starts up a VNC server within a batch job
+ class VNC_Container < Template
+ # @param context [#to_h] the context used to render the template
+ # @option context [#to_sym, Array<#to_sym>] :conn_params ([]) A list of
+ # connection parameters added to the connection file (`:host`,
+ # `:port`, `:password`, `:spassword`, `:display` and `:websocket`
+ # will always exist)
+ # @option context [#to_s] :websockify_cmd
+ # ("${WEBSOCKIFY_CMD:-/opt/websockify/run}") the path to the
+ # websockify script (assumes you don't modify `:after_script`)
+ # @option context [#to_s] :vnc_log ("vnc.log") path to vnc server log
+ # file (assumes you don't modify `:before_script` or `:after_script`)
+ # @option context [#to_s] :vnc_passwd ("vnc.passwd") path to the file
+ # generated that contains the encrypted vnc password (assumes you
+ # don't modify `:before_script`)
+ # @option context [#to_s] :vnc_args arguments used when starting up the
+ # vnc server (overrides any specific vnc argument) (assumes you don't
+ # modify `:before_script`)
+ # @option context [#to_s] :name ("") name of the vnc server session
+ # (not set if blank or `:vnc_args` is set) (assumes you don't modify
+ # `:before_script`)
+ # @option context [#to_s] :geometry ("") resolution of vnc display (not
+ # set if blank or `:vnc_args` is set) (assumes you don't modify
+ # `:before_script`)
+ # @option context [#to_s] :dpi ("") dpi of vnc display (not set if
+ # blank or `:vnc_args` is set) (assumes you don't modify
+ # `:before_script`)
+ # @option context [#to_s] :fonts ("") comma-delimited list of fonts
+ # available in vnc display (not set if blank or `:vnc_args` is set)
+ # (assumes you don't modify `:before_script`)
+ # @option context [#to_s] :idle ("") timeout vnc server if no
+ # connection in this amount of time in seconds (not set if blank or
+ # `:vnc_args` is set) (assumes you don't modify `:before_script`)
+ # @option context [#to_s] :extra_args ("") any extra arguments used
+ # when initializing the vnc server process (not set if blank or
+ # `:vnc_args` is set) (assumes you don't modify `:before_script`)
+ # @option context [#to_s] :vnc_clean ("...") script used to clean up
+ # any active vnc sessions (assumes you don't modify `:before_script`
+ # or `:clean_script`)
+ # @option context [#to_s] :container_path ("vnc_container.sif") the path
+ # to the container with VNC
+ # @option context [#to_s] :container_bindpath ("") paths to bind into
+ # the container with VNC
+ # @option context [#to_s] :container_module ("singularity") the module
+ # that loads Singularity or Apptainer with Lmod. Supports versions (e.g.,
+ # apptainer/1.10). If Singularity or Apptainer are installed at a
+ # system level (i.e., no module loaded to activate), set this to an
+ # empty string.
+ # @option context [#to_s] :container_command ("singularity") the
+ # singularity or apptainer execution command
+ # @param instance_name (uuid) a name for the instance
+ # @see Template
+
+ def initialize(context = {})
+ @instance_name = SecureRandom.uuid
+ super
+ end
+
+ private
+ # We need to know the VNC and websockify connection information
+ def conn_params
+ (super + [:display, :websocket, :spassword, :instance_name]).uniq
+ end
+
+ # Before running the main script, start up a VNC server and record
+ # the connection information
+ def before_script
+ container_path = context.fetch(:container_path, "vnc_container.sif").to_s
+ container_bindpath = context.fetch(:container_bindpath, "").to_s
+
+ <<-EOT.gsub(/^ {14}/, "")
+
+ # Load #{container_module}
+ echo "Loading #{container_module}..."
+ module load #{container_module}
+ export #{container_command.upcase}_BINDPATH="#{container_bindpath}"
+ export INSTANCE_NAME="#{@instance_name}"
+ export instance_name="#{@instance_name}"
+ echo "Starting instance..."
+ #{container_command} instance start #{container_path} #{@instance_name}
+
+ # Setup one-time use passwords and initialize the VNC password
+ function change_passwd () {
+ echo "Setting VNC password..."
+ password=$(create_passwd "#{password_size}")
+ spassword=${spassword:-$(create_passwd "#{password_size}")}
+ (
+ umask 077
+ echo -ne "${password}\\n${spassword}" | #{container_command} exec instance://#{@instance_name} vncpasswd -f > "#{vnc_passwd}"
+ )
+ }
+ change_passwd
+
+
+ # Start up vnc server (if at first you don't succeed, try, try again)
+ echo "Starting VNC server..."
+ for i in $(seq 1 10); do
+ # Clean up any old VNC sessions that weren't cleaned before
+ #{vnc_clean}
+
+ # for turbovnc 3.0 compatibility.
+ if timeout 2 #{container_command} exec instance://#{@instance_name} vncserver --help 2>&1 | grep 'nohttpd' >/dev/null 2>&1; then
+ HTTPD_OPT='-nohttpd'
+ fi
+
+ # Attempt to start VNC server
+ VNC_OUT=$(#{container_command} exec instance://#{@instance_name} vncserver -log "#{vnc_log}" -rfbauth "#{vnc_passwd}" $HTTPD_OPT -noxstartup #{vnc_args} 2>&1)
+ VNC_PID=$(pgrep -s 0 Xvnc) # the script above will daemonize the Xvnc process
+ echo "${VNC_PID}"
+ echo "${VNC_OUT}"
+
+ # Sometimes Xvnc hangs if it fails to find a working display, so we
+ # should kill it and try again
+ kill -0 ${VNC_PID} 2>/dev/null && [[ "${VNC_OUT}" =~ "Fatal server error" ]] && kill -TERM ${VNC_PID}
+
+ # Check that Xvnc process is running, if not assume it died and
+ # wait some random period of time before restarting
+ kill -0 ${VNC_PID} 2>/dev/null || sleep 0.$(random_number 1 9)s
+
+ # If running, then all is well and break out of loop
+ kill -0 ${VNC_PID} 2>/dev/null && break
+ done
+
+ # If we fail to start it after so many tries, then just give up
+ kill -0 ${VNC_PID} 2>/dev/null || clean_up 1
+
+ # Parse output for ports used
+ display=$(echo "${VNC_OUT}" | awk -F':' '/^Desktop/{print $NF}')
+ port=$((5900+display))
+
+ echo "Successfully started VNC server on ${host}:${port}..."
+
+ #{super}
+ EOT
+ end
+
+ # Run the script under the VNC server's display
+ def run_script
+ %(DISPLAY=:${display} #{super})
+ end
+
+ # After starting up the main script, scan the VNC server log file for
+ # successful connections so that the password can be reset
+ def after_script
+ websockify_cmd = context.fetch(:websockify_cmd, "${WEBSOCKIFY_CMD:-/opt/websockify/run}").to_s
+
+ <<-EOT.gsub(/^ {14}/, "")
+ #{super}
+
+ # Launch websockify websocket server
+ module load #{container_module}
+ echo "Starting websocket server..."
+ websocket=$(find_port)
+ #{container_command} exec instance://#{@instance_name} #{websockify_cmd} -D ${websocket} localhost:${port}
+
+ # Set up background process that scans the log file for successful
+ # connections by users, and change the password after every
+ # connection
+ echo "Scanning VNC log file for user authentications..."
+ while read -r line; do
+ if [[ ${line} =~ "Full-control authentication enabled for" ]]; then
+ change_passwd
+ create_yml
+ fi
+ done < <(tail -f --pid=${SCRIPT_PID} "#{vnc_log}") &
+ EOT
+ end
+
+ # Clean up the running VNC server and any other stale VNC servers
+ def clean_script
+ <<-EOT.gsub(/^ {14}/, "")
+ #{super}
+ module load #{container_module}
+
+ #{vnc_clean}
+ [[ -n ${display} ]] && vncserver -kill :${display}
+ #{container_command} instance stop #{@instance_name}
+ EOT
+ end
+
+ # Log file for VNC server
+ def vnc_log
+ context.fetch(:vnc_log, "vnc.log").to_s
+ end
+
+ # Password file for VNC server
+ def vnc_passwd
+ context.fetch(:vnc_passwd, "vnc.passwd").to_s
+ end
+
+ def container_module
+ context.fetch(:container_module, "singularity").to_s
+ end
+
+ def container_command
+ context.fetch(:container_command, "singularity").to_s
+ end
+
+ # Arguments sent to `vncserver` command
+ def vnc_args
+ context.fetch(:vnc_args) do
+ name = context.fetch(:name, "").to_s
+ geometry = context.fetch(:geometry, "").to_s
+ dpi = context.fetch(:dpi, "").to_s
+ fonts = context.fetch(:fonts, "").to_s
+ idle = context.fetch(:idle, "").to_s
+ extra_args = context.fetch(:extra_args, "").to_s
+
+ args = []
+ args << "-name #{name}" unless name.empty?
+ args << "-geometry #{geometry}" unless geometry.empty?
+ args << "-dpi #{dpi}" unless dpi.empty?
+ args << "-fp #{fonts}" unless fonts.empty?
+ args << "-idletimeout #{idle}" unless idle.empty?
+ args << extra_args
+
+ args.join(" ")
+ end.to_s
+ end
+
+ # Clean up any stale VNC sessions
+ def vnc_clean
+ context.fetch(:vnc_clean) do
+ %(#{container_command} exec instance://#{@instance_name} vncserver -list | awk '/^:/{system("kill -0 "$2" 2>/dev/null || #{container_command} exec instance://#{@instance_name} vncserver -kill "$1)}')
+ end.to_s
+ end
+ end
+ end
+ end
+ end
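The factory method above is the whole public surface of the new template: it only insists on `container_path` and falls back to the documented defaults for everything else. A minimal sketch of building the template directly (the container path, bind paths and module name below are hypothetical examples, not values from the gem):

  require "ood_core"

  config = {
    template:           "vnc_container",
    container_path:     "/apps/containers/turbovnc.sif",  # required; hypothetical path
    container_module:   "apptainer/1.1",                  # module loaded through Lmod
    container_command:  "apptainer",
    container_bindpath: "/home,/scratch"
  }

  template = OodCore::BatchConnect::Factory.build_vnc_container(config)
  # => an OodCore::BatchConnect::Templates::VNC_Container instance

  # Omitting :container_path raises OodCore::JobAdapterError, as enforced in
  # build_vnc_container above.

In a cluster definition this context typically arrives through the batch connect configuration rather than being built by hand; the defaults (the `singularity` module and command, the `vnc_container.sif` path) are visible in the `context.fetch` calls above.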
@@ -23,6 +23,7 @@ module OodCore
 
  class PromptError < StandardError; end
 
+ # The adapter class for the Cloudy Cluster product CCQ.
  class CCQ < Adapter
  using Refinements::ArrayExtensions
 
data/lib/ood_core/job/adapters/fujitsu_tcs.rb ADDED
@@ -0,0 +1,403 @@
+ require "time"
+ require "ood_core/refinements/hash_extensions"
+ require "ood_core/refinements/array_extensions"
+ require "ood_core/job/adapters/helper"
+
+ module OodCore
+ module Job
+ class Factory
+ using Refinements::HashExtensions
+
+ # Build the Fujitsu TCS (Technical Computing Suite) adapter from a configuration
+ # @param config [#to_h] the configuration for the job adapter
+ # @option config [Object] :bin (nil) Path to Fujitsu TCS resource manager binaries
+ # @option config [#to_h] :bin_overrides ({}) Optional overrides to Fujitsu TCS resource manager executables
+ def self.build_fujitsu_tcs(config)
+ c = config.to_h.symbolize_keys
+ bin = c.fetch(:bin, nil)
+ bin_overrides = c.fetch(:bin_overrides, {})
+ fujitsu_tcs = Adapters::Fujitsu_TCS::Batch.new(bin: bin, bin_overrides: bin_overrides)
+ Adapters::Fujitsu_TCS.new(fujitsu_tcs: fujitsu_tcs)
+ end
+ end
+
+ module Adapters
+ # An adapter object that describes the communication with a Fujitsu TCS
+ # resource manager for job management.
+ class Fujitsu_TCS < Adapter
+ using Refinements::HashExtensions
+ using Refinements::ArrayExtensions
+
+ # Object used for simplified communication with a Fujitsu TCS batch server
+ # @api private
+ class Batch
+ # The path to the Fujitsu TCS binaries
+ # @example
+ # my_batch.bin.to_s #=> "/usr/local/fujitsu_tcs/10.0.0/bin"
+ # @return [Pathname] path to Fujitsu TCS binaries
+ attr_reader :bin
+
+ # Optional overrides for Fujitsu TCS executables
+ # @example
+ # {'pjsub' => '/usr/local/bin/pjsub'}
+ # @return Hash<String, String>
+ attr_reader :bin_overrides
+
+ # The root exception class that all Fujitsu TCS specific exceptions inherit
+ # from
+ class Error < StandardError; end
+
+ # An error indicating the Fujitsu TCS command timed out
+ class Fujitsu_TCS_TimeoutError < Error; end
+
+ # @param bin [#to_s] path to Fujitsu TCS installation binaries
+ # @param bin_overrides [#to_h] a hash of bin overrides to be used in job
+ def initialize(bin: nil, bin_overrides: {})
+ @bin = Pathname.new(bin.to_s)
+ @bin_overrides = bin_overrides
+ end
+
+ # Get a list of hashes detailing each of the jobs on the batch server
+ # @example Status info for all jobs
+ # my_batch.get_jobs
+ # #=>
+ # #[
+ # # {
+ # # :JOB_ID => "123",
+ # # :JOB_NAME => "my_job",
+ # # ...
+ # # },
+ # # {
+ # # :JOB_ID => "125",
+ # # :JOB_NAME => "my_other_job",
+ # # ...
+ # # },
+ # # ...
+ # #]
+ # @param id [#to_s] the id of the job
+ # @param owner [String] the owner(s) of the job
+ # @raise [Error] if `pjstat` command exited unsuccessfully
+ # @return [Array<Hash>] list of details for jobs
+ def get_jobs(id: "", owner: nil)
+ args = ["-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
+ args.concat ["--filter jid=" + id.to_s] unless id.to_s.empty?
+ args.concat ["--filter usr=" + owner.to_s] unless owner.to_s.empty?
+
+ StringIO.open(call("pjstat", *args)) do |output|
+ output.gets() # Skip header
+ jobs = []
+ output.each_line do |line|
+ l = line.split(",")
+ jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split(" ")[0],
+ :ST => l[4], :STD => l[5], :STDE => l[6],
+ :ACCEPT => l[7], :START_DATE => l[8], :NODES => l[9].split(":")[0],
+ :USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split(" ")[0] }
+ end
+ jobs
+ end
+ rescue Fujitsu_TCS_TimeoutError
+ return [{ JOB_ID: id, ST: 'undetermined' }]
+ end
+
+ # Put a specified job on hold
+ # @example Put job "1234" on hold
+ # my_batch.hold_job("1234")
+ # @param id [#to_s] the id of the job
+ # @raise [Error] if `pjhold` command exited unsuccessfully
+ # @return [void]
+ def hold_job(id)
+ call("pjhold", id.to_s)
+ end
+
+ # Release a specified job that is on hold
+ # @example Release job "1234" from hold
+ # my_batch.release_job("1234")
+ # @param id [#to_s] the id of the job
+ # @raise [Error] if `pjrls` command exited unsuccessfully
+ # @return [void]
+ def release_job(id)
+ call("pjrls", id.to_s)
+ end
+
+ # Delete a specified job from the batch server
+ # @example Delete job "1234"
+ # my_batch.delete_job("1234")
+ # @param id [#to_s] the id of the job
+ # @raise [Error] if `pjdel` command exited unsuccessfully
+ # @return [void]
+ def delete_job(id)
+ call("pjdel", id.to_s)
+ end
+
+ # Submit a script expanded as a string to the batch server
+ # @param str [#to_s] script as a string
+ # @param args [Array<#to_s>] arguments passed to `pjsub` command
+ # @raise [Error] if `pjsub` command exited unsuccessfully
+ # @return [String] the id of the job that was created
+ def submit_string(str, args: [])
+ args = args.map(&:to_s)
+ call("pjsub", *args, stdin: str.to_s).split(" ")[5]
+ end
+
+ private
+ # Call a forked Fujitsu TCS command
+ def call(cmd, *args, stdin: "")
+ cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
+ args = args.map(&:to_s)
+ o, e, s = Open3.capture3(cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
+ s.success? ? o : raise(Error, e)
+ end
+ end
+
+ # Mapping of state codes for Fujitsu TCS resource manager
+ STATE_MAP = {
+ 'ACC' => :queued, # Accepted job submission
+ 'RJT' => :completed, # Rejected job submission
+ 'QUE' => :queued, # Waiting for job execution
+ 'RNA' => :queued, # Acquiring resources required for job execution
+ 'RNP' => :running, # Executing prologue
+ 'RUN' => :running, # Executing job
+ 'RNE' => :running, # Executing epilogue
+ 'RNO' => :running, # Waiting for completion of job termination processing
+ 'SPP' => :suspended, # Suspend in progress
+ 'SPD' => :suspended, # Suspended
+ 'RSM' => :running, # Resume in progress
+ 'EXT' => :completed, # Exited job end execution
+ 'CCL' => :completed, # Exited job execution by interruption
+ 'HLD' => :suspended, # In fixed state due to users
+ 'ERR' => :completed, # In fixed state due to an error
+ }
+
+ # @api private
+ # @param opts [#to_h] the options defining this adapter
+ # @option opts [Batch] :fujitsu_tcs the Fujitsu TCS batch object
+ # @see Factory.build_fujitsu_tcs
+ def initialize(opts = {})
+ o = opts.to_h.symbolize_keys
+
+ @fujitsu_tcs = o.fetch(:fujitsu_tcs) { raise ArgumentError, "No Fujitsu TCS object specified. Missing argument: fujitsu_tcs" }
+ end
+
+ # Submit a job with the attributes defined in the job template instance
+ # @param script [Script] script object that describes the script and
+ # attributes for the submitted job
+ # @param after [#to_s, Array<#to_s>] this job may be scheduled for
+ # execution at any point after dependent jobs have started execution
+ # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+ # execution only after dependent jobs have terminated with no errors
+ # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+ # execution only after dependent jobs have terminated with errors
+ # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+ # execution after dependent jobs have terminated
+ # @raise [JobAdapterError] if something goes wrong submitting a job
+ # @return [String] the job id returned after successfully submitting a
+ # job
+ # @see Adapter#submit
+ def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+ #after = Array(after).map(&:to_s)
+ #afterok = Array(afterok).map(&:to_s)
+ #afternotok = Array(afternotok).map(&:to_s)
+ #afterany = Array(afterany).map(&:to_s)
+ if !after.empty? || !afterok.empty? || !afternotok.empty? || !afterany.empty?
+ raise JobAdapterError, "Dependency between jobs has not been implemented yet."
+ end
+
+ # Set pjsub options
+ args = []
+ args.concat (script.rerunnable ? ["--restart"] : ["--norestart"]) unless script.rerunnable.nil?
+ args.concat ["--mail-list", script.email.join(",")] unless script.email.nil?
+ if script.email_on_started && script.email_on_terminated
+ args.concat ["-m", "b,e"]
+ elsif script.email_on_started
+ args.concat ["-m", "b"]
+ elsif script.email_on_terminated
+ args.concat ["-m", "e"]
+ end
+
+ args.concat ["-N", script.job_name] unless script.job_name.nil?
+ args.concat ["-o", script.output_path] unless script.output_path.nil?
+ if script.error_path.nil?
+ args.concat ["-j"]
+ else
+ args.concat ["-e", script.error_path]
+ end
+ args.concat ["--rscgrp", script.queue_name] unless script.queue_name.nil?
+ args.concat ["-p", script.priority] unless script.priority.nil?
+ args.concat ["--at", script.start_time.localtime.strftime("%C%y-%m-%dT%H:%M:%S")] unless script.start_time.nil?
+ args.concat ["-L \"elapse=" + seconds_to_duration(script.wall_time) + "\""] unless script.wall_time.nil?
+ args.concat ["--bulk", "--sparam", script.job_array_request] unless script.job_array_request.nil?
+
+ # Set environment variables
+ envvars = script.job_environment.to_h
+ args.concat ["-x", envvars.map{|k,v| "#{k}=#{v}"}.join(",")] unless envvars.empty?
+ args.concat ["-X"] if script.copy_environment?
+
+ # Set native options
+ args.concat script.native if script.native
+
+ # Set content
+ content = if script.shell_path.nil?
+ script.content
+ else
+ "#!#{script.shell_path}\n#{script.content}"
+ end
+
+ # Submit job
+ @fujitsu_tcs.submit_string(content, args: args)
+ rescue Batch::Error => e
+ raise JobAdapterError, e.message
+ end
+
+ # Retrieve info for all jobs from the resource manager
+ # @raise [JobAdapterError] if something goes wrong getting job info
+ # @return [Array<Info>] information describing submitted jobs
+ # @see Adapter#info_all
+ def info_all(attrs: nil)
+ @fujitsu_tcs.get_jobs().map do |v|
+ parse_job_info(v)
+ end
+ rescue Batch::Error => e
+ raise JobAdapterError, e.message
+ end
+
+ # Retrieve job info from the resource manager
+ # @param id [#to_s] the id of the job
+ # @raise [JobAdapterError] if something goes wrong getting job info
+ # @return [Info] information describing submitted job
+ # @see Adapter#info
+ def info(id)
+ id = id.to_s
+ info_ary = @fujitsu_tcs.get_jobs(id: id).map do |v|
+ parse_job_info(v)
+ end
+
+ # If no job was found we assume that it has completed
+ info_ary.empty? ? Info.new(id: id, status: :completed) : info_ary.first # @fujitsu_tcs.get_jobs() must return only one element.
+ rescue Batch::Error => e
+ # set completed status if can't find job id
+ if /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message
+ Info.new(
+ id: id,
+ status: :completed
+ )
+ else
+ raise JobAdapterError, e.message
+ end
+ end
+
+ # Retrieve info for all jobs for a given owner or owners from the
+ # resource manager
+ # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+ # @raise [JobAdapterError] if something goes wrong getting job info
+ # @return [Array<Info>] information describing submitted jobs
+ def info_where_owner(owner, attrs: nil)
+ owner = Array.wrap(owner).map(&:to_s).join('+')
+ @fujitsu_tcs.get_jobs(owner: owner).map do |v|
+ parse_job_info(v)
+ end
+ rescue Batch::Error => e
+ raise JobAdapterError, e.message
+ end
+
+ # Retrieve job status from resource manager
+ # @param id [#to_s] the id of the job
+ # @raise [JobAdapterError] if something goes wrong getting job status
+ # @return [Status] status of job
+ # @see Adapter#status
+ def status(id)
+ id = id.to_s
+ jobs = @fujitsu_tcs.get_jobs(id: id)
+
+ if job = jobs.detect { |j| j[:JOB_ID] == id }
+ Status.new(state: get_state(job[:ST]))
+ else
+ # set completed status if can't find job id
+ Status.new(state: :completed)
+ end
+ rescue Batch::Error => e
+ # set completed status if can't find job id
+ if /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message
+ Status.new(state: :completed)
+ else
+ raise JobAdapterError, e.message
+ end
+ end
+
+ # Put the submitted job on hold
+ # @param id [#to_s] the id of the job
+ # @raise [JobAdapterError] if something goes wrong holding a job
+ # @return [void]
+ # @see Adapter#hold
+ def hold(id)
+ @fujitsu_tcs.hold_job(id.to_s)
+ rescue Batch::Error => e
+ # assume successful job hold if can't find job id
+ raise JobAdapterError, e.message unless /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message
+ end
+
+ # Release the job that is on hold
+ # @param id [#to_s] the id of the job
+ # @raise [JobAdapterError] if something goes wrong releasing a job
+ # @return [void]
+ # @see Adapter#release
+ def release(id)
+ @fujitsu_tcs.release_job(id.to_s)
+ rescue Batch::Error => e
+ # assume successful job release if can't find job id
+ raise JobAdapterError, e.message unless /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message
+ end
+
+ # Delete the submitted job
+ # @param id [#to_s] the id of the job
+ # @raise [JobAdapterError] if something goes wrong deleting a job
+ # @return [void]
+ # @see Adapter#delete
+ def delete(id)
+ @fujitsu_tcs.delete_job(id.to_s)
+ rescue Batch::Error => e
+ # assume successful job deletion if can't find job id
+ raise JobAdapterError, e.message unless /\[ERR\.\] PJM .+ Job .+ does not exist/ =~ e.message
+ end
+
+ def directive_prefix
+ '#PJM'
+ end
+
+ private
+ # Convert duration to seconds
+ def duration_in_seconds(time)
+ return 0 if time.nil?
+ time, days = time.split("-").reverse
+ days.to_i * 24 * 3600 +
+ time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
+ end
+
+ # Convert seconds to duration
+ def seconds_to_duration(time)
+ "%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
+ end
+
+ # Determine state from Fujitsu TCS state code
+ def get_state(st)
+ STATE_MAP.fetch(st, :undetermined)
+ end
+
+ # Parse hash describing Fujitsu TCS job status
+ def parse_job_info(v)
+ Info.new(
+ id: v[:JOB_ID],
+ job_name: v[:JOB_NAME],
+ status: get_state(v[:ST]),
+ job_owner: v[:USER],
+ dispatch_time: v[:START_DATE],
+ wallclock_time: duration_in_seconds(v[:ELAPSE_TIM]),
+ wallclock_limit: duration_in_seconds(v[:ELAPSE_LIM]),
+ submission_time: v[:ACCEPT],
+ queue_name: v[:RSC_GRP],
+ native: v
+ )
+ end
+ end
+ end
+ end
+ end
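Taken together, the factory method and the adapter above expose the usual `OodCore::Job::Adapter` interface on top of the `pj*` command line tools. A minimal sketch of exercising it directly (the binary path, the override and the resource group are hypothetical, and the calls will only succeed on a host where the Fujitsu TCS commands exist):

  require "ood_core"

  adapter = OodCore::Job::Factory.build_fujitsu_tcs({
    bin:           "/opt/FJSVtcs/bin",                     # hypothetical location of pjsub/pjstat
    bin_overrides: { "pjstat" => "/usr/local/bin/pjstat" } # optional per-command override
  })

  script = OodCore::Job::Script.new(
    content:    "#!/bin/bash\nsleep 60",
    job_name:   "hello_tcs",   # becomes `pjsub -N hello_tcs`
    queue_name: "small"        # becomes `pjsub --rscgrp small`
  )

  id = adapter.submit(script)  # shells out to pjsub and returns the job id
  adapter.status(id).state     # => :queued, :running, ... per STATE_MAP above
  adapter.delete(id)           # shells out to pjdel

Note that job dependencies (`after`, `afterok`, and so on) are rejected with a JobAdapterError, as the `submit` implementation above shows.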
@@ -1,6 +1,8 @@
  require "ood_core/refinements/hash_extensions"
  require "json"
 
+ # Utility class for the Kubernetes adapter to interact
+ # with the Kubernetes APIs.
  class OodCore::Job::Adapters::Kubernetes::Batch
 
  require_relative "helper"
@@ -1,3 +1,5 @@
+ # Utility class for the Kubernetes adapter to parse
+ # json data into Ruby objects.
  class OodCore::Job::Adapters::Kubernetes::Helper
 
  require_relative 'resources'
@@ -193,10 +195,14 @@ class OodCore::Job::Adapters::Kubernetes::Helper
  end
 
  def secret_info_from_json(json_data)
- raw = json_data.dig(:data, :password)
- { ood_connection_info: { password: Base64.decode64(raw) } }
- rescue
- {}
+ data = json_data.to_h[:data] || {}
+
+ info = data.symbolize_keys.each_with_object({}) do |data_kv, hash|
+ hash[data_kv[0]] = Base64.decode64(data_kv[1])
+ rescue
+ next
+ end
+ { ood_connection_info: info }
  end
 
  def dispatch_time(json_data)
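The rewritten `secret_info_from_json` no longer assumes the secret carries only a `password` field: every key under the secret's `data` stanza is Base64-decoded into `ood_connection_info`, and any entry that fails to decode is skipped. A standalone sketch of that behaviour with a made-up secret (the real helper also symbolizes string keys via the `HashExtensions` refinement):

  require "base64"

  secret = {
    data: {
      password:  Base64.encode64("vnc-pass"),
      spassword: Base64.encode64("one-time-pass")
    }
  }

  data = secret.to_h[:data] || {}
  info = data.each_with_object({}) do |(key, value), hash|
    hash[key] = Base64.decode64(value)
  rescue
    next    # skip any entry that cannot be decoded
  end

  result = { ood_connection_info: info }
  # result == { ood_connection_info: { password: "vnc-pass", spassword: "one-time-pass" } }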
@@ -1,4 +1,4 @@
- # An object that describes a submitted kubernetes job with extended information
+ # An object that describes a submitted kubernetes job with extended information.
  class OodCore::Job::Adapters::Kubernetes::K8sJobInfo < OodCore::Job::Info
  attr_reader :ood_connection_info
 
@@ -1,5 +1,5 @@
  module OodCore::Job::Adapters::Kubernetes::Resources
-
+ # Utility class for kubernetes configmap objects.
  class ConfigMap
  attr_accessor :name, :files
 
@@ -20,6 +20,7 @@ module OodCore::Job::Adapters::Kubernetes::Resources
  end
  end
 
+ # Utility class for mounting files in kubernetes configmap objects.
  class ConfigMapFile
  attr_accessor :filename, :data, :mount_path, :sub_path, :init_mount_path, :init_sub_path
 
@@ -33,6 +34,7 @@ module OodCore::Job::Adapters::Kubernetes::Resources
  end
  end
 
+ # Utility class for kubernetes probe settings.
  class TCPProbe
  attr_accessor :port, :initial_delay_seconds, :failure_threshold, :period_seconds
 
@@ -54,6 +56,7 @@ module OodCore::Job::Adapters::Kubernetes::Resources
  end
  end
 
+ # Utility class for the kubernetes container object.
  class Container
  attr_accessor :name, :image, :command, :port, :env, :working_dir,
  :memory_limit, :memory_request, :cpu_limit, :cpu_request,
@@ -106,6 +109,7 @@ module OodCore::Job::Adapters::Kubernetes::Resources
  end
  end
 
+ # Utility class for the kubernetes podspec object.
  class PodSpec
  attr_accessor :container, :init_containers
  def initialize(container, init_containers: nil)
@@ -13,6 +13,8 @@ module OodCore
  end
 
  module Adapters
+
+ # The adapter class for Kubernetes.
  class Kubernetes < Adapter
 
  using Refinements::ArrayExtensions
@@ -22,6 +22,8 @@ module OodCore
  end
 
  module Adapters
+
+ # The adapter class for the LSF scheduler.
  class Lsf < Adapter
  using Refinements::ArrayExtensions
 
@@ -453,7 +453,7 @@ module OodCore
 
  # Parse hash describing PBS Pro job status
  def parse_job_info(v)
- /^(?<job_owner>[\w-]+)@(?<submit_host>.+)$/ =~ v[:Job_Owner]
+ /^(?<job_owner>[\w\-.]+)@(?<submit_host>.+)$/ =~ v[:Job_Owner]
  allocated_nodes = parse_nodes(v[:exec_host] || "")
  procs = allocated_nodes.inject(0) { |sum, x| sum + x[:procs] }
  if allocated_nodes.empty? # fill in with requested resources
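The only change here is the character class for the user portion of `Job_Owner`: a literal `.` is now allowed, so usernames containing periods parse again. A quick sketch of the difference (the owner string below is a hypothetical example):

  old_re = /^(?<job_owner>[\w-]+)@(?<submit_host>.+)$/
  new_re = /^(?<job_owner>[\w\-.]+)@(?<submit_host>.+)$/

  owner = "jane.doe@login01.example.edu"

  old_re.match(owner)                # => nil, since "." was not in the class
  new_re.match(owner)[:job_owner]    # => "jane.doe"
  new_re.match(owner)[:submit_host]  # => "login01.example.edu"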
@@ -1,3 +1,5 @@
+ # Utility helper class for the SGE adapter to parse
+ # input and generate submission arguments.
  class OodCore::Job::Adapters::Sge::Helper
  require 'ood_core/job/adapters/sge'
 
@@ -16,7 +16,6 @@ require 'ood_core/job/array_ids'
  # :status
  # :wallclock_limit
  # :wallclock_time
-
  class QstatXmlJRListener
  # [Hash]
  attr_reader :parsed_job
@@ -13,7 +13,6 @@ require 'date'
  # :queue_name
  # :status
  # :wallclock_limit
-
  class QstatXmlRListener
  # [Array<Hash>]
  attr_reader :parsed_jobs
@@ -22,6 +22,8 @@ module OodCore
  end
 
  module Adapters
+
+ # The adapter class for Grid Engine (GE) flavors like Sun Grid Engine.
  class Sge < Adapter
  using Refinements::HashExtensions
  using Refinements::ArrayExtensions
@@ -37,8 +37,7 @@ module OodCore
  end
 
  module Adapters
- # An adapter object that describes the communication with a remote host
- # for job management.
+ # The adapter for using systemd timers as the scheduler.
  class LinuxSystemd < Adapter
  using Refinements::ArrayExtensions
 
@@ -1,3 +1,4 @@
+ # Utility class to maintain all the Torque attributes available.
  class OodCore::Job::Adapters::Torque
  # Maintains a constant Hash of defined PBS attribute types
  # Includes:
@@ -1,5 +1,7 @@
  require 'open3'
 
+ # Utility class for the Torque adapter to communicate with the
+ # Torque scheduler.
  class OodCore::Job::Adapters::Torque
  # Object used for simplified communication with a batch server
  class Batch
@@ -1,3 +1,4 @@
+ # FFI errors for the Torque adapter.
  class OodCore::Job::Adapters::Torque::FFI
  # The root exception class that all PBS-specific exceptions inherit from
  class Error < StandardError; end
@@ -1,6 +1,6 @@
  require 'ffi'
 
- # An interface to the C-library of Torque
+ # An interface to the C-library of Torque.
  class OodCore::Job::Adapters::Torque::FFI
 
  extend ::FFI::Library
@@ -1,14 +1,14 @@
- # Builds a sorted array of job ids given a job array spec string
- #
- # Job array spec strings:
- # 1 Single id
- # 1-10 Range
- # 1-10:2 Range with step
- # 1-10,13 Compound (range with single id)
- #
- # Note that Ranges are expected to be inclusive
  module OodCore
  module Job
+ # Builds a sorted array of job ids given a job array spec string
+ #
+ # Job array spec strings:
+ # 1 Single id
+ # 1-10 Range
+ # 1-10:2 Range with step
+ # 1-10,13 Compound (range with single id)
+ #
+ # Note that Ranges are expected to be inclusive
  class ArrayIds
  attr_reader :spec_string
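The comment block that moved inside the module documents the accepted job array spec strings. A self-contained sketch of how such specs expand, purely for illustration (this helper is hypothetical and is not the gem's own implementation):

  def expand_array_spec(spec)
    spec.split(",").flat_map do |piece|
      range, step = piece.split(":")
      first, last = range.split("-").map(&:to_i)
      last ||= first
      (first..last).step((step || 1).to_i).to_a   # ranges are inclusive
    end.sort
  end

  expand_array_spec("1")        # => [1]
  expand_array_spec("1-10:2")   # => [1, 3, 5, 7, 9]
  expand_array_spec("1-10,13")  # => [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13]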
@@ -1,6 +1,6 @@
  module OodCore
  module Job
- # An object that contains details about the cluster's active and total nodes, processors, and gpus
+ # An object that contains details about the cluster's active and total nodes, processors and gpus.
  class ClusterInfo
  using Refinements::HashExtensions
 
@@ -2,7 +2,7 @@ require 'time'
 
  module OodCore
  module Job
- # An object that describes a submitted job
+ # An object that describes a submitted job.
  class Info
  # The identifier of the job
  # @return [String] job id
@@ -1,6 +1,6 @@
  module OodCore
  module Job
- # An object that describes the resources used on a specific node
+ # An object that describes the resources used on a specific node.
  class NodeInfo
  # The name of the host machine
  # @return [String] node name
@@ -1,6 +1,6 @@
  module OodCore
  module Job
- # An object that describes the current state of a submitted job
+ # An object that describes the current state of a submitted job.
  class Status
  class << self
  # Possible states a submitted job can be in:
@@ -1,4 +1,4 @@
  module OodCore
  # The current version of {OodCore}
- VERSION = "0.20.2"
+ VERSION = "0.22.0"
  end
data/ood_core.gemspec CHANGED
@@ -30,5 +30,5 @@ Gem::Specification.new do |spec|
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "pry", "~> 0.10"
  spec.add_development_dependency "timecop", "~> 0.8"
- spec.add_development_dependency "climate_control", "~> 1.1.1"
+ spec.add_development_dependency "climate_control", "~> 1.2.0"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: ood_core
  version: !ruby/object:Gem::Version
- version: 0.20.2
+ version: 0.22.0
  platform: ruby
  authors:
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2022-07-28 00:00:00.000000000 Z
+ date: 2022-10-31 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: ood_support
@@ -136,14 +136,14 @@ dependencies:
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 1.1.1
+ version: 1.2.0
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
  - - "~>"
  - !ruby/object:Gem::Version
- version: 1.1.1
+ version: 1.2.0
  description: Open OnDemand core library that provides support for an HPC Center to
  globally define HPC services that web applications can then take advantage of.
  email:
@@ -164,6 +164,7 @@ files:
  - Rakefile
  - bin/console
  - bin/setup
+ - docs/.keep
  - lib/ood_core.rb
  - lib/ood_core/acl/adapter.rb
  - lib/ood_core/acl/adapters/group.rb
@@ -172,6 +173,7 @@ files:
  - lib/ood_core/batch_connect/template.rb
  - lib/ood_core/batch_connect/templates/basic.rb
  - lib/ood_core/batch_connect/templates/vnc.rb
+ - lib/ood_core/batch_connect/templates/vnc_container.rb
  - lib/ood_core/cluster.rb
  - lib/ood_core/clusters.rb
  - lib/ood_core/errors.rb
@@ -179,6 +181,7 @@ files:
  - lib/ood_core/job/adapter.rb
  - lib/ood_core/job/adapters/ccq.rb
  - lib/ood_core/job/adapters/drmaa.rb
+ - lib/ood_core/job/adapters/fujitsu_tcs.rb
  - lib/ood_core/job/adapters/helper.rb
  - lib/ood_core/job/adapters/kubernetes.rb
  - lib/ood_core/job/adapters/kubernetes/batch.rb