ood_core 0.23.5 → 0.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/lib/ood_core/batch_connect/template.rb +19 -5
- data/lib/ood_core/batch_connect/templates/vnc.rb +36 -1
- data/lib/ood_core/batch_connect/templates/vnc_container.rb +1 -0
- data/lib/ood_core/job/adapters/fujitsu_tcs.rb +31 -14
- data/lib/ood_core/job/adapters/slurm.rb +2 -1
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2d7ae635ec6299414feac4de3589bea126a97cd427507bd99242f0bcb79553b4
|
4
|
+
data.tar.gz: 29855a0d0573e1d51fe6ebf7a0679c37993c608f77e45f804f6851d84160620c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6b93ec8179ffca892538e5bdd8bcc1eae6adceac3a2d898079c7f8f3a60d5cd04ac64569e71ba0af2fb9ab9833a9b7f6d0c1f845cde1487eaf3d9c8f90b382d8
|
7
|
+
data.tar.gz: 2919276a9ade663afce93339602304f0fc727fa392d6f711f0166b82003c51e8765520603c93047919e4f1d1b81104723fd5ac05c760e891e90ab3a13a5d1c8e
|
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
7
7
|
|
8
8
|
## [Unreleased]
|
9
9
|
|
10
|
+
## [0.24.1] - 11-29-2023
|
11
|
+
|
12
|
+
[820](https://github.com/OSC/ood_core/pull/820) Reverts [818](https://github.com/OSC/ood_core/pull/818)
|
13
|
+
|
14
|
+
## [0.24.0] - 11-28-2023
|
15
|
+
|
16
|
+
- Code cleanup and separate arguments with whitespace in Fujitsu TCS adapter by @mnakao in https://github.com/OSC/ood_core/pull/808
|
17
|
+
- Add OUT_OF_MEMORY state for Slurm by @robinkar in https://github.com/OSC/ood_core/pull/809
|
18
|
+
- find_port: avoid infinite loop by @utkarshayachit in https://github.com/OSC/ood_core/pull/811
|
19
|
+
- handle find_port error codes by @utkarshayachit in https://github.com/OSC/ood_core/pull/812
|
20
|
+
- vnc: run websockify as background process by @utkarshayachit in https://github.com/OSC/ood_core/pull/813
|
21
|
+
- Add working_dir option for Fujitsu TCS job scheduler by @mnakao in https://github.com/OSC/ood_core/pull/816
|
22
|
+
- Minor fix for Fujitsu TCS by @mnakao in https://github.com/OSC/ood_core/pull/817
|
23
|
+
- Update rake requirement from ~> 13.0.1 to ~> 13.1.0 by @dependabot in https://github.com/OSC/ood_core/pull/814
|
24
|
+
- Changes default return value for cluster.batch_connect_ssh_allow? by @HazelGrant in https://github.com/OSC/ood_core/pull/818
|
25
|
+
|
10
26
|
## [0.23.5] - 04-10-2023
|
11
27
|
|
12
28
|
### Fixed
|
@@ -162,14 +162,28 @@ module OodCore
|
|
162
162
|
export -f port_used
|
163
163
|
|
164
164
|
# Find available port in range [$2..$3] for host $1
|
165
|
-
# Default:
|
165
|
+
# Default host: localhost
|
166
|
+
# Default port range: [#{min_port}..#{max_port}]
|
167
|
+
# returns error code (0: success, 1: failed)
|
168
|
+
# On success, the chosen port is echoed on stdout.
|
166
169
|
find_port () {
|
167
170
|
local host="${1:-localhost}"
|
168
|
-
local
|
169
|
-
|
170
|
-
|
171
|
+
local min_port=${2:-#{min_port}}
|
172
|
+
local max_port=${3:-#{max_port}}
|
173
|
+
local port_range=($(shuf -i ${min_port}-${max_port}))
|
174
|
+
local retries=1 # number of retries over the port range if first attempt fails
|
175
|
+
for ((attempt=0; attempt<=$retries; attempt++)); do
|
176
|
+
for port in "${port_range[@]}"; do
|
177
|
+
if port_used "${host}:${port}"; then
|
178
|
+
continue
|
179
|
+
fi
|
180
|
+
echo "${port}"
|
181
|
+
return 0 # success
|
182
|
+
done
|
171
183
|
done
|
172
|
-
|
184
|
+
|
185
|
+
echo "error: failed to find available port in range ${min_port}..${max_port}" >&2
|
186
|
+
return 1 # failure
|
173
187
|
}
|
174
188
|
export -f find_port
|
175
189
|
|
@@ -134,10 +134,45 @@ module OodCore
|
|
134
134
|
<<-EOT.gsub(/^ {14}/, "")
|
135
135
|
#{super}
|
136
136
|
|
137
|
+
# launches websockify in the background; waiting until the process
|
138
|
+
# has started proxying successfully.
|
139
|
+
start_websockify() {
|
140
|
+
local log_file="./websockify.log"
|
141
|
+
# launch websockify in background and redirect all output to a file.
|
142
|
+
#{websockify_cmd} $1 $2 &> $log_file &
|
143
|
+
local ws_pid=$!
|
144
|
+
local counter=0
|
145
|
+
|
146
|
+
# wait till websockify has successfully started
|
147
|
+
echo "[websockify]: pid: $ws_pid (proxying $1 ==> $2)" >&2
|
148
|
+
echo "[websockify]: log file: $log_file" >&2
|
149
|
+
echo "[websockify]: waiting ..." >&2
|
150
|
+
until grep -q -i "proxying from :$1" $log_file
|
151
|
+
do
|
152
|
+
if ! ps $ws_pid > /dev/null; then
|
153
|
+
echo "[websockify]: failed to launch!" >&2
|
154
|
+
return 1
|
155
|
+
elif [ $counter -ge 5 ]; then
|
156
|
+
# timeout after ~5 seconds
|
157
|
+
echo "[websockify]: timed-out :(!" >&2
|
158
|
+
return 1
|
159
|
+
else
|
160
|
+
sleep 1
|
161
|
+
((counter=counter+1))
|
162
|
+
fi
|
163
|
+
done
|
164
|
+
echo "[websockify]: started successfully (proxying $1 ==> $2)" >&2
|
165
|
+
echo $ws_pid
|
166
|
+
return 0
|
167
|
+
}
|
168
|
+
|
137
169
|
# Launch websockify websocket server
|
138
170
|
echo "Starting websocket server..."
|
139
171
|
websocket=$(find_port)
|
140
|
-
|
172
|
+
[ $? -eq 0 ] || clean_up 1 # give up if port not found
|
173
|
+
|
174
|
+
ws_pid=$(start_websockify ${websocket} localhost:${port})
|
175
|
+
[ $? -eq 0 ] || clean_up 1 # give up if websockify launch failed
|
141
176
|
|
142
177
|
# Set up background process that scans the log file for successful
|
143
178
|
# connections by users, and change the password after every
|
@@ -173,6 +173,7 @@ module OodCore
|
|
173
173
|
module load #{container_module}
|
174
174
|
echo "Starting websocket server..."
|
175
175
|
websocket=$(find_port)
|
176
|
+
[ $? -eq 0 ] || clean_up 1 # give up if port not found
|
176
177
|
#{container_command} exec instance://#{@instance_name} #{websockify_cmd} -D ${websocket} localhost:${port}
|
177
178
|
|
178
179
|
# Set up background process that scans the log file for successful
|
@@ -12,11 +12,13 @@ module OodCore
|
|
12
12
|
# @param config [#to_h] the configuration for job adapter
|
13
13
|
# @option config [Object] :bin (nil) Path to Fujitsu TCS resource manager binaries
|
14
14
|
# @option config [#to_h] :bin_overrides ({}) Optional overrides to Fujitsu TCS resource manager executables
|
15
|
+
# @option config [Object] :working_dir (nil) Working directory for submitting a batch script
|
15
16
|
def self.build_fujitsu_tcs(config)
|
16
17
|
c = config.to_h.symbolize_keys
|
17
18
|
bin = c.fetch(:bin, nil)
|
18
19
|
bin_overrides = c.fetch(:bin_overrides, {})
|
19
|
-
|
20
|
+
working_dir = c.fetch(:working_dir, nil)
|
21
|
+
fujitsu_tcs = Adapters::Fujitsu_TCS::Batch.new(bin: bin, bin_overrides: bin_overrides, working_dir: working_dir)
|
20
22
|
Adapters::Fujitsu_TCS.new(fujitsu_tcs: fujitsu_tcs)
|
21
23
|
end
|
22
24
|
end
|
@@ -43,6 +45,11 @@ module OodCore
|
|
43
45
|
# @return Hash<String, String>
|
44
46
|
attr_reader :bin_overrides
|
45
47
|
|
48
|
+
# Working directory for submitting a batch script
|
49
|
+
# @example
|
50
|
+
# my_batch.working_dir #=> "HOME" or Dir.pwd
|
51
|
+
attr_reader :working_dir
|
52
|
+
|
46
53
|
# The root exception class that all Fujitsu TCS specific exceptions inherit
|
47
54
|
# from
|
48
55
|
class Error < StandardError; end
|
@@ -52,9 +59,17 @@ module OodCore
|
|
52
59
|
|
53
60
|
# @param bin [#to_s] path to Fujitsu TCS installation binaries
|
54
61
|
# @param bin_overrides [#to_h] a hash of bin overrides to be used in job
|
55
|
-
|
62
|
+
# @param working_dir [] Working directory for submitting a batch script
|
63
|
+
def initialize(bin: nil, bin_overrides: {}, working_dir: nil)
|
56
64
|
@bin = Pathname.new(bin.to_s)
|
57
65
|
@bin_overrides = bin_overrides
|
66
|
+
if working_dir == nil
|
67
|
+
@working_dir = Dir.pwd
|
68
|
+
elsif working_dir == "HOME"
|
69
|
+
@working_dir = Dir.home
|
70
|
+
else
|
71
|
+
raise(StandardError, "Unknown working_dir")
|
72
|
+
end
|
58
73
|
end
|
59
74
|
|
60
75
|
# Get a list of hashes detailing each of the jobs on the batch server
|
@@ -79,19 +94,19 @@ module OodCore
|
|
79
94
|
# @raise [Error] if `pjstat` command exited unsuccessfully
|
80
95
|
# @return [Array<Hash>] list of details for jobs
|
81
96
|
def get_jobs(id: "", owner: nil)
|
82
|
-
args = ["-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
|
83
|
-
args.concat ["--filter jid=" + id.to_s] unless id.to_s.empty?
|
84
|
-
args.concat ["--filter usr=" + owner.to_s] unless owner.to_s.empty?
|
97
|
+
args = ["-A", "-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
|
98
|
+
args.concat ["--filter", "jid=" + id.to_s] unless id.to_s.empty?
|
99
|
+
args.concat ["--filter", "usr=" + owner.to_s] unless owner.to_s.empty?
|
85
100
|
|
86
101
|
StringIO.open(call("pjstat", *args)) do |output|
|
87
102
|
output.gets() # Skip header
|
88
103
|
jobs = []
|
89
104
|
output.each_line do |line|
|
90
105
|
l = line.split(",")
|
91
|
-
jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split
|
106
|
+
jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split[0],
|
92
107
|
:ST => l[4], :STD => l[5], :STDE => l[6],
|
93
108
|
:ACCEPT => l[7], :START_DATE => l[8], :NODES => l[9].split(":")[0],
|
94
|
-
:USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split
|
109
|
+
:USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split[0] }
|
95
110
|
end
|
96
111
|
jobs
|
97
112
|
end
|
@@ -136,16 +151,18 @@ module OodCore
|
|
136
151
|
# @return [String] the id of the job that was created
|
137
152
|
def submit_string(str, args: [])
|
138
153
|
args = args.map(&:to_s)
|
139
|
-
call("pjsub", *args, stdin: str.to_s).split
|
154
|
+
call("pjsub", *args, stdin: str.to_s).split[5]
|
140
155
|
end
|
141
156
|
|
142
157
|
private
|
143
158
|
# Call a forked Fujitsu TCS command
|
144
159
|
def call(cmd, *args, stdin: "")
|
145
160
|
cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
|
146
|
-
args
|
147
|
-
|
148
|
-
|
161
|
+
args = args.map(&:to_s)
|
162
|
+
Dir.chdir(working_dir) do
|
163
|
+
o, e, s = Open3.capture3(cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
|
164
|
+
s.success? ? o : raise(Error, e)
|
165
|
+
end
|
149
166
|
end
|
150
167
|
end
|
151
168
|
|
@@ -221,12 +238,12 @@ module OodCore
|
|
221
238
|
else
|
222
239
|
args.concat ["-e", script.error_path]
|
223
240
|
end
|
224
|
-
args.concat ["-L rscgrp=" + script.queue_name] unless script.queue_name.nil?
|
241
|
+
args.concat ["-L", "rscgrp=" + script.queue_name] unless script.queue_name.nil?
|
225
242
|
args.concat ["-p", script.priority] unless script.priority.nil?
|
226
243
|
|
227
244
|
# start_time: <%= Time.local(2023,11,22,13,4).to_i %> in form.yml.erb
|
228
245
|
args.concat ["--at", script.start_time.localtime.strftime("%C%y%m%d%H%M")] unless script.start_time.nil?
|
229
|
-
args.concat ["-L elapse=" + seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
|
246
|
+
args.concat ["-L", "elapse=" + seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
|
230
247
|
args.concat ["--bulk", "--sparam", script.job_array_request] unless script.job_array_request.nil?
|
231
248
|
|
232
249
|
# Set environment variables
|
@@ -368,7 +385,7 @@ module OodCore
|
|
368
385
|
private
|
369
386
|
# Convert duration to seconds
|
370
387
|
def duration_in_seconds(time)
|
371
|
-
return 0 if time.nil?
|
388
|
+
return 0 if time.nil? or time == "-"
|
372
389
|
time, days = time.split("-").reverse
|
373
390
|
days.to_i * 24 * 3600 +
|
374
391
|
time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
|
@@ -436,7 +436,8 @@ module OodCore
|
|
436
436
|
'SE' => :completed, # SPECIAL_EXIT
|
437
437
|
'ST' => :running, # STOPPED
|
438
438
|
'S' => :suspended, # SUSPENDED
|
439
|
-
'TO' => :completed # TIMEOUT
|
439
|
+
'TO' => :completed, # TIMEOUT
|
440
|
+
'OOM' => :completed # OUT_OF_MEMORY
|
440
441
|
}
|
441
442
|
|
442
443
|
# @api private
|
data/lib/ood_core/version.rb
CHANGED
data/ood_core.gemspec
CHANGED
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
|
27
27
|
spec.add_runtime_dependency "rexml", "~> 3.2"
|
28
28
|
spec.add_development_dependency "bundler", "~> 2.1"
|
29
|
-
spec.add_development_dependency "rake", "~> 13.0.1"
|
29
|
+
spec.add_development_dependency "rake", "~> 13.1.0"
|
30
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
31
31
|
spec.add_development_dependency "pry", "~> 0.10"
|
32
32
|
spec.add_development_dependency "timecop", "~> 0.8"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ood_core
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.23.5
|
4
|
+
version: 0.24.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Franz
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date: 2023-
|
13
|
+
date: 2023-11-29 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: ood_support
|
@@ -80,14 +80,14 @@ dependencies:
|
|
80
80
|
requirements:
|
81
81
|
- - "~>"
|
82
82
|
- !ruby/object:Gem::Version
|
83
|
-
version: 13.0.1
|
83
|
+
version: 13.1.0
|
84
84
|
type: :development
|
85
85
|
prerelease: false
|
86
86
|
version_requirements: !ruby/object:Gem::Requirement
|
87
87
|
requirements:
|
88
88
|
- - "~>"
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: 13.0.1
|
90
|
+
version: 13.1.0
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: rspec
|
93
93
|
requirement: !ruby/object:Gem::Requirement
|