ood_core 0.23.4 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aaa540da7d8afffa45674650d11ef7bf260e55ff44e32244950d233e1d4d145a
4
- data.tar.gz: ff4e1fbe8c59309f36fc45c634cb27752d71e3e66e847496ab3678f556a90e8b
3
+ metadata.gz: d5a2f83340f4dad3b9de58e0b5db178e2ec8566da3f28e717405c2692477d49d
4
+ data.tar.gz: f63183676134dd3b410fa4bf24ea84a3c1e16ccae8efd02bdec0174ec0dee30b
5
5
  SHA512:
6
- metadata.gz: bc0fecb3a8eae6db9c5d6c40d62d4568f930fc197a153ec28a0942ec2d2bc8da0e488e6fe86d906ce03ca2ae53d911c43804623e7b185335281d23b1cef7af41
7
- data.tar.gz: 56eae7c8a31c5f4b625b06b33b84def3d80a0652bd883f39f656a276c63e7224b88cb9ed801ef4482e62d1eeee6aeff366787414d21c52daf03cc4eb2265d83a
6
+ metadata.gz: 3f356e6fa7d39314ae4092533fcadc7dcd062911875b3d3cf934612015fbdac1be55be66436ffa30ad8e14077c30717fe54ff5d8d4fc7699bc57c9f04ad7570f
7
+ data.tar.gz: 22c2606e3449c338a3c9dfae4e9f8b5c60a8c172072859dc0aa5ac79be3779dcf337d30a8d351e9273356c74cdc19a235f9267c06aab0405511049971f3fe206
data/CHANGELOG.md CHANGED
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.24.0] - 11-28-2023
11
+
12
+ - Code cleanup and separate arguments with whitespace in Fujitsu TCS adapter by @mnakao in https://github.com/OSC/ood_core/pull/808
13
+ - Add OUT_OF_MEMORY state for Slurm by @robinkar in https://github.com/OSC/ood_core/pull/809
14
+ - find_port: avoid infinite loop by @utkarshayachit in https://github.com/OSC/ood_core/pull/811
15
+ - handle find_port error codes by @utkarshayachit in https://github.com/OSC/ood_core/pull/812
16
+ - vnc: run websockify as background process by @utkarshayachit in https://github.com/OSC/ood_core/pull/813
17
+ - Add working_dir option for Fujitsu TCS job scheduler by @mnakao in https://github.com/OSC/ood_core/pull/816
18
+ - Minor fix for Fujitsu TCS by @mnakao in https://github.com/OSC/ood_core/pull/817
19
+ - Update rake requirement from ~> 13.0.1 to ~> 13.1.0 by @dependabot in https://github.com/OSC/ood_core/pull/814
20
+ - Changes default return value for cluster.batch_connect_ssh_allow? by @HazelGrant in https://github.com/OSC/ood_core/pull/818
21
+
22
+ ## [0.23.5] - 04-10-2023
23
+
24
+ ### Fixed
25
+
26
+ - [804](https://github.com/OSC/ood_core/pull/804) fixed a kubernetes bug in the
27
+ `info_all` code path.
28
+ - Slurm `-M` flag now correctly accounts for full path `sacctmgr` commands in
29
+ [807](https://github.com/OSC/ood_core/pull/807).
30
+
10
31
  ## [0.23.4] - 03-06-2023
11
32
 
12
33
  ### Fixed
@@ -500,8 +521,9 @@ Functionally the same as [0.17.3] but with some CI updates.
500
521
  - Initial release!
501
522
 
502
523
  [Unreleased]: https://github.com/OSC/ood_core/compare/v0.23.4...HEAD
503
- [0.23.4]: https://github.com/OSC/ood_core/compare/v0.23.4...v0.23.3
504
- [0.23.3]: https://github.com/OSC/ood_core/compare/v0.23.3...v0.23.2
524
+ [0.23.5]: https://github.com/OSC/ood_core/compare/v0.23.4...v0.23.5
525
+ [0.23.4]: https://github.com/OSC/ood_core/compare/v0.23.3...v0.23.4
526
+ [0.23.3]: https://github.com/OSC/ood_core/compare/v0.23.2...v0.23.3
505
527
  [0.23.2]: https://github.com/OSC/ood_core/compare/v0.23.1...v0.23.2
506
528
  [0.23.1]: https://github.com/OSC/ood_core/compare/v0.23.0...v0.23.1
507
529
  [0.23.0]: https://github.com/OSC/ood_core/compare/v0.22.0...v0.23.0
@@ -162,14 +162,28 @@ module OodCore
162
162
  export -f port_used
163
163
 
164
164
  # Find available port in range [$2..$3] for host $1
165
- # Default: [#{min_port}..#{max_port}]
165
+ # Default host: localhost
166
+ # Default port range: [#{min_port}..#{max_port}]
167
+ # returns error code (0: success, 1: failed)
168
+ # On success, the chosen port is echoed on stdout.
166
169
  find_port () {
167
170
  local host="${1:-localhost}"
168
- local port=$(random_number "${2:-#{min_port}}" "${3:-#{max_port}}")
169
- while port_used "${host}:${port}"; do
170
- port=$(random_number "${2:-#{min_port}}" "${3:-#{max_port}}")
171
+ local min_port=${2:-#{min_port}}
172
+ local max_port=${3:-#{max_port}}
173
+ local port_range=($(shuf -i ${min_port}-${max_port}))
174
+ local retries=1 # number of retries over the port range if first attempt fails
175
+ for ((attempt=0; attempt<=$retries; attempt++)); do
176
+ for port in "${port_range[@]}"; do
177
+ if port_used "${host}:${port}"; then
178
+ continue
179
+ fi
180
+ echo "${port}"
181
+ return 0 # success
182
+ done
171
183
  done
172
- echo "${port}"
184
+
185
+ echo "error: failed to find available port in range ${min_port}..${max_port}" >&2
186
+ return 1 # failure
173
187
  }
174
188
  export -f find_port
175
189
 
@@ -134,10 +134,45 @@ module OodCore
134
134
  <<-EOT.gsub(/^ {14}/, "")
135
135
  #{super}
136
136
 
137
+ # launches websockify in the background; waiting until the process
138
+ # has started proxying successfully.
139
+ start_websockify() {
140
+ local log_file="./websockify.log"
141
+ # launch websockify in background and redirect all output to a file.
142
+ #{websockify_cmd} $1 $2 &> $log_file &
143
+ local ws_pid=$!
144
+ local counter=0
145
+
146
+ # wait till websockify has successfully started
147
+ echo "[websockify]: pid: $ws_pid (proxying $1 ==> $2)" >&2
148
+ echo "[websockify]: log file: $log_file" >&2
149
+ echo "[websockify]: waiting ..." >&2
150
+ until grep -q -i "proxying from :$1" $log_file
151
+ do
152
+ if ! ps $ws_pid > /dev/null; then
153
+ echo "[websockify]: failed to launch!" >&2
154
+ return 1
155
+ elif [ $counter -ge 5 ]; then
156
+ # timeout after ~5 seconds
157
+ echo "[websockify]: timed-out :(!" >&2
158
+ return 1
159
+ else
160
+ sleep 1
161
+ ((counter=counter+1))
162
+ fi
163
+ done
164
+ echo "[websockify]: started successfully (proxying $1 ==> $2)" >&2
165
+ echo $ws_pid
166
+ return 0
167
+ }
168
+
137
169
  # Launch websockify websocket server
138
170
  echo "Starting websocket server..."
139
171
  websocket=$(find_port)
140
- #{websockify_cmd} -D ${websocket} localhost:${port}
172
+ [ $? -eq 0 ] || clean_up 1 # give up if port not found
173
+
174
+ ws_pid=$(start_websockify ${websocket} localhost:${port})
175
+ [ $? -eq 0 ] || clean_up 1 # give up if websockify launch failed
141
176
 
142
177
  # Set up background process that scans the log file for successful
143
178
  # connections by users, and change the password after every
@@ -173,6 +173,7 @@ module OodCore
173
173
  module load #{container_module}
174
174
  echo "Starting websocket server..."
175
175
  websocket=$(find_port)
176
+ [ $? -eq 0 ] || clean_up 1 # give up if port not found
176
177
  #{container_command} exec instance://#{@instance_name} #{websockify_cmd} -D ${websocket} localhost:${port}
177
178
 
178
179
  # Set up background process that scans the log file for successful
@@ -165,9 +165,9 @@ module OodCore
165
165
  # @return [Boolean, nil] whether cluster supports SSH to batch connect node
166
166
  def batch_connect_ssh_allow?
167
167
  return @batch_connect_ssh_allow if defined?(@batch_connect_ssh_allow)
168
- return @batch_connect_ssh_allow = nil if batch_connect_config.nil?
168
+ return @batch_connect_ssh_allow = true if batch_connect_config.nil?
169
169
 
170
- @batch_connect_ssh_allow = batch_connect_config.fetch(:ssh_allow, nil)
170
+ @batch_connect_ssh_allow = batch_connect_config.fetch(:ssh_allow, true)
171
171
  end
172
172
 
173
173
  # The comparison operator
@@ -12,11 +12,13 @@ module OodCore
12
12
  # @param config [#to_h] the configuration for job adapter
13
13
  # @option config [Object] :bin (nil) Path to Fujitsu TCS resource manager binaries
14
14
  # @option config [#to_h] :bin_overrides ({}) Optional overrides to Fujitsu TCS resource manager executables
15
+ # @option config [Object] :working_dir (nil) Working directory for submitting a batch script
15
16
  def self.build_fujitsu_tcs(config)
16
17
  c = config.to_h.symbolize_keys
17
18
  bin = c.fetch(:bin, nil)
18
19
  bin_overrides = c.fetch(:bin_overrides, {})
19
- fujitsu_tcs = Adapters::Fujitsu_TCS::Batch.new(bin: bin, bin_overrides: bin_overrides)
20
+ working_dir = c.fetch(:working_dir, nil)
21
+ fujitsu_tcs = Adapters::Fujitsu_TCS::Batch.new(bin: bin, bin_overrides: bin_overrides, working_dir: working_dir)
20
22
  Adapters::Fujitsu_TCS.new(fujitsu_tcs: fujitsu_tcs)
21
23
  end
22
24
  end
@@ -43,6 +45,11 @@ module OodCore
43
45
  # @return Hash<String, String>
44
46
  attr_reader :bin_overrides
45
47
 
48
+ # Working directory for submitting a batch script
49
+ # @example
50
+ # my_batch.working_dir #=> Dir.home or Dir.pwd
51
+ attr_reader :working_dir
52
+
46
53
  # The root exception class that all Fujitsu TCS specific exceptions inherit
47
54
  # from
48
55
  class Error < StandardError; end
@@ -52,9 +59,17 @@ module OodCore
52
59
 
53
60
  # @param bin [#to_s] path to Fujitsu TCS installation binaries
54
61
  # @param bin_overrides [#to_h] a hash of bin overrides to be used in job
55
- def initialize(bin: nil, bin_overrides: {})
62
+ # @param working_dir [String, nil] Working directory for submitting a batch script ("HOME" for the home directory, nil for the current directory)
63
+ def initialize(bin: nil, bin_overrides: {}, working_dir: nil)
56
64
  @bin = Pathname.new(bin.to_s)
57
65
  @bin_overrides = bin_overrides
66
+ if working_dir == nil
67
+ @working_dir = Dir.pwd
68
+ elsif working_dir == "HOME"
69
+ @working_dir = Dir.home
70
+ else
71
+ raise(StandardError, "Unknown working_dir")
72
+ end
58
73
  end
59
74
 
60
75
  # Get a list of hashes detailing each of the jobs on the batch server
@@ -79,19 +94,19 @@ module OodCore
79
94
  # @raise [Error] if `pjstat` command exited unsuccessfully
80
95
  # @return [Array<Hash>] list of details for jobs
81
96
  def get_jobs(id: "", owner: nil)
82
- args = ["-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
83
- args.concat ["--filter jid=" + id.to_s] unless id.to_s.empty?
84
- args.concat ["--filter usr=" + owner.to_s] unless owner.to_s.empty?
97
+ args = ["-A", "-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
98
+ args.concat ["--filter", "jid=" + id.to_s] unless id.to_s.empty?
99
+ args.concat ["--filter", "usr=" + owner.to_s] unless owner.to_s.empty?
85
100
 
86
101
  StringIO.open(call("pjstat", *args)) do |output|
87
102
  output.gets() # Skip header
88
103
  jobs = []
89
104
  output.each_line do |line|
90
105
  l = line.split(",")
91
- jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split(" ")[0],
106
+ jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split[0],
92
107
  :ST => l[4], :STD => l[5], :STDE => l[6],
93
108
  :ACCEPT => l[7], :START_DATE => l[8], :NODES => l[9].split(":")[0],
94
- :USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split(" ")[0] }
109
+ :USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split[0] }
95
110
  end
96
111
  jobs
97
112
  end
@@ -136,16 +151,18 @@ module OodCore
136
151
  # @return [String] the id of the job that was created
137
152
  def submit_string(str, args: [])
138
153
  args = args.map(&:to_s)
139
- call("pjsub", *args, stdin: str.to_s).split(" ")[5]
154
+ call("pjsub", *args, stdin: str.to_s).split[5]
140
155
  end
141
156
 
142
157
  private
143
158
  # Call a forked Fujitsu TCS command
144
159
  def call(cmd, *args, stdin: "")
145
160
  cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
146
- args = args.map(&:to_s)
147
- o, e, s = Open3.capture3(cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
148
- s.success? ? o : raise(Error, e)
161
+ args = args.map(&:to_s)
162
+ Dir.chdir(working_dir) do
163
+ o, e, s = Open3.capture3(cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
164
+ s.success? ? o : raise(Error, e)
165
+ end
149
166
  end
150
167
  end
151
168
 
@@ -221,12 +238,12 @@ module OodCore
221
238
  else
222
239
  args.concat ["-e", script.error_path]
223
240
  end
224
- args.concat ["-L rscgrp=" + script.queue_name] unless script.queue_name.nil?
241
+ args.concat ["-L", "rscgrp=" + script.queue_name] unless script.queue_name.nil?
225
242
  args.concat ["-p", script.priority] unless script.priority.nil?
226
243
 
227
244
  # start_time: <%= Time.local(2023,11,22,13,4).to_i %> in form.yml.erb
228
245
  args.concat ["--at", script.start_time.localtime.strftime("%C%y%m%d%H%M")] unless script.start_time.nil?
229
- args.concat ["-L elapse=" + seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
246
+ args.concat ["-L", "elapse=" + seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
230
247
  args.concat ["--bulk", "--sparam", script.job_array_request] unless script.job_array_request.nil?
231
248
 
232
249
  # Set environment variables
@@ -368,7 +385,7 @@ module OodCore
368
385
  private
369
386
  # Convert duration to seconds
370
387
  def duration_in_seconds(time)
371
- return 0 if time.nil?
388
+ return 0 if time.nil? or time == "-"
372
389
  time, days = time.split("-").reverse
373
390
  days.to_i * 24 * 3600 +
374
391
  time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
@@ -306,10 +306,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
306
306
 
307
307
  def pod_info_from_json(pod)
308
308
  hash = helper.pod_info_from_json(pod)
309
- K8sJobInfo.new(hash)
310
- rescue Helper::K8sDataError
311
- # FIXME: silently eating error, could probably use a logger
312
- nil
309
+ OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(hash)
313
310
  end
314
311
 
315
312
  def configure_auth(auth)
@@ -377,7 +377,7 @@ module OodCore
377
377
  cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
378
378
 
379
379
  args = args.map(&:to_s)
380
- args.concat ["-M", cluster] if cluster && cmd != 'sacctmgr'
380
+ args.concat ["-M", cluster] if cluster && !cmd.to_s.end_with?('sacctmgr')
381
381
 
382
382
  env = env.to_h
383
383
  env["SLURM_CONF"] = conf.to_s if conf
@@ -436,7 +436,8 @@ module OodCore
436
436
  'SE' => :completed, # SPECIAL_EXIT
437
437
  'ST' => :running, # STOPPED
438
438
  'S' => :suspended, # SUSPENDED
439
- 'TO' => :completed # TIMEOUT
439
+ 'TO' => :completed, # TIMEOUT
440
+ 'OOM' => :completed # OUT_OF_MEMORY
440
441
  }
441
442
 
442
443
  # @api private
@@ -1,4 +1,4 @@
1
1
  module OodCore
2
2
  # The current version of {OodCore}
3
- VERSION = "0.23.4"
3
+ VERSION = "0.24.0"
4
4
  end
data/ood_core.gemspec CHANGED
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
27
27
  spec.add_runtime_dependency "rexml", "~> 3.2"
28
28
  spec.add_development_dependency "bundler", "~> 2.1"
29
- spec.add_development_dependency "rake", "~> 13.0.1"
29
+ spec.add_development_dependency "rake", "~> 13.1.0"
30
30
  spec.add_development_dependency "rspec", "~> 3.0"
31
31
  spec.add_development_dependency "pry", "~> 0.10"
32
32
  spec.add_development_dependency "timecop", "~> 0.8"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ood_core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.23.4
4
+ version: 0.24.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2023-03-06 00:00:00.000000000 Z
13
+ date: 2023-11-28 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ood_support
@@ -80,14 +80,14 @@ dependencies:
80
80
  requirements:
81
81
  - - "~>"
82
82
  - !ruby/object:Gem::Version
83
- version: 13.0.1
83
+ version: 13.1.0
84
84
  type: :development
85
85
  prerelease: false
86
86
  version_requirements: !ruby/object:Gem::Requirement
87
87
  requirements:
88
88
  - - "~>"
89
89
  - !ruby/object:Gem::Version
90
- version: 13.0.1
90
+ version: 13.1.0
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: rspec
93
93
  requirement: !ruby/object:Gem::Requirement