ood_core 0.23.5 → 0.24.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/lib/ood_core/batch_connect/template.rb +19 -5
- data/lib/ood_core/batch_connect/templates/vnc.rb +36 -1
- data/lib/ood_core/batch_connect/templates/vnc_container.rb +1 -0
- data/lib/ood_core/job/adapters/fujitsu_tcs.rb +31 -14
- data/lib/ood_core/job/adapters/slurm.rb +2 -1
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2d7ae635ec6299414feac4de3589bea126a97cd427507bd99242f0bcb79553b4
|
4
|
+
data.tar.gz: 29855a0d0573e1d51fe6ebf7a0679c37993c608f77e45f804f6851d84160620c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6b93ec8179ffca892538e5bdd8bcc1eae6adceac3a2d898079c7f8f3a60d5cd04ac64569e71ba0af2fb9ab9833a9b7f6d0c1f845cde1487eaf3d9c8f90b382d8
|
7
|
+
data.tar.gz: 2919276a9ade663afce93339602304f0fc727fa392d6f711f0166b82003c51e8765520603c93047919e4f1d1b81104723fd5ac05c760e891e90ab3a13a5d1c8e
|
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
7
7
|
|
8
8
|
## [Unreleased]
|
9
9
|
|
10
|
+
## [0.24.1] - 11-29-2023
|
11
|
+
|
12
|
+
[820](https://github.com/OSC/ood_core/pull/820) Reverts [818](https://github.com/OSC/ood_core/pull/818)
|
13
|
+
|
14
|
+
## [0.24.0] - 11-28-2023
|
15
|
+
|
16
|
+
- Code cleanup and separate arguments with whitespace in Fujitsu TCS adapter by @mnakao in https://github.com/OSC/ood_core/pull/808
|
17
|
+
- Add OUT_OF_MEMORY state for Slurm by @robinkar in https://github.com/OSC/ood_core/pull/809
|
18
|
+
- find_port: avoid infinite loop by @utkarshayachit in https://github.com/OSC/ood_core/pull/811
|
19
|
+
- handle find_port error codes by @utkarshayachit in https://github.com/OSC/ood_core/pull/812
|
20
|
+
- vnc: run websockify as background process by @utkarshayachit in https://github.com/OSC/ood_core/pull/813
|
21
|
+
- Add working_dir option for Fujitsu TCS job scheduler by @mnakao in https://github.com/OSC/ood_core/pull/816
|
22
|
+
- Minor fix for Fujitsu TCS by @mnakao in https://github.com/OSC/ood_core/pull/817
|
23
|
+
- Update rake requirement from ~> 13.0.1 to ~> 13.1.0 by @dependabot in https://github.com/OSC/ood_core/pull/814
|
24
|
+
- Changes default return value for cluster.batch_connect_ssh_allow? by @HazelGrant in https://github.com/OSC/ood_core/pull/818
|
25
|
+
|
10
26
|
## [0.23.5] - 04-10-2023
|
11
27
|
|
12
28
|
### Fixed
|
@@ -162,14 +162,28 @@ module OodCore
|
|
162
162
|
export -f port_used
|
163
163
|
|
164
164
|
# Find available port in range [$2..$3] for host $1
|
165
|
-
# Default:
|
165
|
+
# Default host: localhost
|
166
|
+
# Default port range: [#{min_port}..#{max_port}]
|
167
|
+
# returns error code (0: success, 1: failed)
|
168
|
+
# On success, the chosen port is echoed on stdout.
|
166
169
|
find_port () {
|
167
170
|
local host="${1:-localhost}"
|
168
|
-
local
|
169
|
-
|
170
|
-
|
171
|
+
local min_port=${2:-#{min_port}}
|
172
|
+
local max_port=${3:-#{max_port}}
|
173
|
+
local port_range=($(shuf -i ${min_port}-${max_port}))
|
174
|
+
local retries=1 # number of retries over the port range if first attempt fails
|
175
|
+
for ((attempt=0; attempt<=$retries; attempt++)); do
|
176
|
+
for port in "${port_range[@]}"; do
|
177
|
+
if port_used "${host}:${port}"; then
|
178
|
+
continue
|
179
|
+
fi
|
180
|
+
echo "${port}"
|
181
|
+
return 0 # success
|
182
|
+
done
|
171
183
|
done
|
172
|
-
|
184
|
+
|
185
|
+
echo "error: failed to find available port in range ${min_port}..${max_port}" >&2
|
186
|
+
return 1 # failure
|
173
187
|
}
|
174
188
|
export -f find_port
|
175
189
|
|
@@ -134,10 +134,45 @@ module OodCore
|
|
134
134
|
<<-EOT.gsub(/^ {14}/, "")
|
135
135
|
#{super}
|
136
136
|
|
137
|
+
# launches websockify in the background; waiting until the process
|
138
|
+
# has started proxying successfully.
|
139
|
+
start_websockify() {
|
140
|
+
local log_file="./websockify.log"
|
141
|
+
# launch websockify in background and redirect all output to a file.
|
142
|
+
#{websockify_cmd} $1 $2 &> $log_file &
|
143
|
+
local ws_pid=$!
|
144
|
+
local counter=0
|
145
|
+
|
146
|
+
# wait till websockify has successfully started
|
147
|
+
echo "[websockify]: pid: $ws_pid (proxying $1 ==> $2)" >&2
|
148
|
+
echo "[websockify]: log file: $log_file" >&2
|
149
|
+
echo "[websockify]: waiting ..." >&2
|
150
|
+
until grep -q -i "proxying from :$1" $log_file
|
151
|
+
do
|
152
|
+
if ! ps $ws_pid > /dev/null; then
|
153
|
+
echo "[websockify]: failed to launch!" >&2
|
154
|
+
return 1
|
155
|
+
elif [ $counter -ge 5 ]; then
|
156
|
+
# timeout after ~5 seconds
|
157
|
+
echo "[websockify]: timed-out :(!" >&2
|
158
|
+
return 1
|
159
|
+
else
|
160
|
+
sleep 1
|
161
|
+
((counter=counter+1))
|
162
|
+
fi
|
163
|
+
done
|
164
|
+
echo "[websockify]: started successfully (proxying $1 ==> $2)" >&2
|
165
|
+
echo $ws_pid
|
166
|
+
return 0
|
167
|
+
}
|
168
|
+
|
137
169
|
# Launch websockify websocket server
|
138
170
|
echo "Starting websocket server..."
|
139
171
|
websocket=$(find_port)
|
140
|
-
|
172
|
+
[ $? -eq 0 ] || clean_up 1 # give up if port not found
|
173
|
+
|
174
|
+
ws_pid=$(start_websockify ${websocket} localhost:${port})
|
175
|
+
[ $? -eq 0 ] || clean_up 1 # give up if websockify launch failed
|
141
176
|
|
142
177
|
# Set up background process that scans the log file for successful
|
143
178
|
# connections by users, and change the password after every
|
@@ -173,6 +173,7 @@ module OodCore
|
|
173
173
|
module load #{container_module}
|
174
174
|
echo "Starting websocket server..."
|
175
175
|
websocket=$(find_port)
|
176
|
+
[ $? -eq 0 ] || clean_up 1 # give up if port not found
|
176
177
|
#{container_command} exec instance://#{@instance_name} #{websockify_cmd} -D ${websocket} localhost:${port}
|
177
178
|
|
178
179
|
# Set up background process that scans the log file for successful
|
@@ -12,11 +12,13 @@ module OodCore
|
|
12
12
|
# @param config [#to_h] the configuration for job adapter
|
13
13
|
# @option config [Object] :bin (nil) Path to Fujitsu TCS resource manager binaries
|
14
14
|
# @option config [#to_h] :bin_overrides ({}) Optional overrides to Fujitsu TCS resource manager executables
|
15
|
+
# @option config [Object] :working_dir (nil) Working directory for submitting a batch script
|
15
16
|
def self.build_fujitsu_tcs(config)
|
16
17
|
c = config.to_h.symbolize_keys
|
17
18
|
bin = c.fetch(:bin, nil)
|
18
19
|
bin_overrides = c.fetch(:bin_overrides, {})
|
19
|
-
|
20
|
+
working_dir = c.fetch(:working_dir, nil)
|
21
|
+
fujitsu_tcs = Adapters::Fujitsu_TCS::Batch.new(bin: bin, bin_overrides: bin_overrides, working_dir: working_dir)
|
20
22
|
Adapters::Fujitsu_TCS.new(fujitsu_tcs: fujitsu_tcs)
|
21
23
|
end
|
22
24
|
end
|
@@ -43,6 +45,11 @@ module OodCore
|
|
43
45
|
# @return Hash<String, String>
|
44
46
|
attr_reader :bin_overrides
|
45
47
|
|
48
|
+
# Working directory for submitting a batch script
|
49
|
+
# @example
|
50
|
+
# my_batch.working_dir #=> Dir.home or Dir.pwd
|
51
|
+
attr_reader :working_dir
|
52
|
+
|
46
53
|
# The root exception class that all Fujitsu TCS specific exceptions inherit
|
47
54
|
# from
|
48
55
|
class Error < StandardError; end
|
@@ -52,9 +59,17 @@ module OodCore
|
|
52
59
|
|
53
60
|
# @param bin [#to_s] path to Fujitsu TCS installation binaries
|
54
61
|
# @param bin_overrides [#to_h] a hash of bin overrides to be used in job
|
55
|
-
|
62
|
+
# @param working_dir [String, nil] Working directory for submitting a batch script ("HOME" for the home directory, nil for the current directory)
|
63
|
+
def initialize(bin: nil, bin_overrides: {}, working_dir: nil)
|
56
64
|
@bin = Pathname.new(bin.to_s)
|
57
65
|
@bin_overrides = bin_overrides
|
66
|
+
if working_dir == nil
|
67
|
+
@working_dir = Dir.pwd
|
68
|
+
elsif working_dir == "HOME"
|
69
|
+
@working_dir = Dir.home
|
70
|
+
else
|
71
|
+
raise(StandardError, "Unknown working_dir")
|
72
|
+
end
|
58
73
|
end
|
59
74
|
|
60
75
|
# Get a list of hashes detailing each of the jobs on the batch server
|
@@ -79,19 +94,19 @@ module OodCore
|
|
79
94
|
# @raise [Error] if `pjstat` command exited unsuccessfully
|
80
95
|
# @return [Array<Hash>] list of details for jobs
|
81
96
|
def get_jobs(id: "", owner: nil)
|
82
|
-
args = ["-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
|
83
|
-
args.concat ["--filter jid=" + id.to_s] unless id.to_s.empty?
|
84
|
-
args.concat ["--filter usr=" + owner.to_s] unless owner.to_s.empty?
|
97
|
+
args = ["-A", "-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
|
98
|
+
args.concat ["--filter", "jid=" + id.to_s] unless id.to_s.empty?
|
99
|
+
args.concat ["--filter", "usr=" + owner.to_s] unless owner.to_s.empty?
|
85
100
|
|
86
101
|
StringIO.open(call("pjstat", *args)) do |output|
|
87
102
|
output.gets() # Skip header
|
88
103
|
jobs = []
|
89
104
|
output.each_line do |line|
|
90
105
|
l = line.split(",")
|
91
|
-
jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split
|
106
|
+
jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split[0],
|
92
107
|
:ST => l[4], :STD => l[5], :STDE => l[6],
|
93
108
|
:ACCEPT => l[7], :START_DATE => l[8], :NODES => l[9].split(":")[0],
|
94
|
-
:USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split
|
109
|
+
:USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split[0] }
|
95
110
|
end
|
96
111
|
jobs
|
97
112
|
end
|
@@ -136,16 +151,18 @@ module OodCore
|
|
136
151
|
# @return [String] the id of the job that was created
|
137
152
|
def submit_string(str, args: [])
|
138
153
|
args = args.map(&:to_s)
|
139
|
-
call("pjsub", *args, stdin: str.to_s).split
|
154
|
+
call("pjsub", *args, stdin: str.to_s).split[5]
|
140
155
|
end
|
141
156
|
|
142
157
|
private
|
143
158
|
# Call a forked Fujitsu TCS command
|
144
159
|
def call(cmd, *args, stdin: "")
|
145
160
|
cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
|
146
|
-
args
|
147
|
-
|
148
|
-
|
161
|
+
args = args.map(&:to_s)
|
162
|
+
Dir.chdir(working_dir) do
|
163
|
+
o, e, s = Open3.capture3(cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
|
164
|
+
s.success? ? o : raise(Error, e)
|
165
|
+
end
|
149
166
|
end
|
150
167
|
end
|
151
168
|
|
@@ -221,12 +238,12 @@ module OodCore
|
|
221
238
|
else
|
222
239
|
args.concat ["-e", script.error_path]
|
223
240
|
end
|
224
|
-
args.concat ["-L rscgrp=" + script.queue_name] unless script.queue_name.nil?
|
241
|
+
args.concat ["-L", "rscgrp=" + script.queue_name] unless script.queue_name.nil?
|
225
242
|
args.concat ["-p", script.priority] unless script.priority.nil?
|
226
243
|
|
227
244
|
# start_time: <%= Time.local(2023,11,22,13,4).to_i %> in form.yml.erb
|
228
245
|
args.concat ["--at", script.start_time.localtime.strftime("%C%y%m%d%H%M")] unless script.start_time.nil?
|
229
|
-
args.concat ["-L elapse=" + seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
|
246
|
+
args.concat ["-L", "elapse=" + seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
|
230
247
|
args.concat ["--bulk", "--sparam", script.job_array_request] unless script.job_array_request.nil?
|
231
248
|
|
232
249
|
# Set environment variables
|
@@ -368,7 +385,7 @@ module OodCore
|
|
368
385
|
private
|
369
386
|
# Convert duration to seconds
|
370
387
|
def duration_in_seconds(time)
|
371
|
-
return 0 if time.nil?
|
388
|
+
return 0 if time.nil? or time == "-"
|
372
389
|
time, days = time.split("-").reverse
|
373
390
|
days.to_i * 24 * 3600 +
|
374
391
|
time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
|
@@ -436,7 +436,8 @@ module OodCore
|
|
436
436
|
'SE' => :completed, # SPECIAL_EXIT
|
437
437
|
'ST' => :running, # STOPPED
|
438
438
|
'S' => :suspended, # SUSPENDED
|
439
|
-
'TO' => :completed # TIMEOUT
|
439
|
+
'TO' => :completed, # TIMEOUT
|
440
|
+
'OOM' => :completed # OUT_OF_MEMORY
|
440
441
|
}
|
441
442
|
|
442
443
|
# @api private
|
data/lib/ood_core/version.rb
CHANGED
data/ood_core.gemspec
CHANGED
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
|
27
27
|
spec.add_runtime_dependency "rexml", "~> 3.2"
|
28
28
|
spec.add_development_dependency "bundler", "~> 2.1"
|
29
|
-
spec.add_development_dependency "rake", "~> 13.0.1"
|
29
|
+
spec.add_development_dependency "rake", "~> 13.1.0"
|
30
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
31
31
|
spec.add_development_dependency "pry", "~> 0.10"
|
32
32
|
spec.add_development_dependency "timecop", "~> 0.8"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ood_core
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.23.5
|
4
|
+
version: 0.24.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Franz
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date: 2023-
|
13
|
+
date: 2023-11-29 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: ood_support
|
@@ -80,14 +80,14 @@ dependencies:
|
|
80
80
|
requirements:
|
81
81
|
- - "~>"
|
82
82
|
- !ruby/object:Gem::Version
|
83
|
-
version: 13.0.1
|
83
|
+
version: 13.1.0
|
84
84
|
type: :development
|
85
85
|
prerelease: false
|
86
86
|
version_requirements: !ruby/object:Gem::Requirement
|
87
87
|
requirements:
|
88
88
|
- - "~>"
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: 13.0.1
|
90
|
+
version: 13.1.0
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: rspec
|
93
93
|
requirement: !ruby/object:Gem::Requirement
|