ood_core 0.23.5 → 0.24.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd70ee666e6339110f07224c339fb666d7e130c44854784d818ef6a87172e610
4
- data.tar.gz: a39550f3b74ea8b50aa28c35398d96dbf36d38e6768a37afa65c815cd5bd9cc2
3
+ metadata.gz: d5a2f83340f4dad3b9de58e0b5db178e2ec8566da3f28e717405c2692477d49d
4
+ data.tar.gz: f63183676134dd3b410fa4bf24ea84a3c1e16ccae8efd02bdec0174ec0dee30b
5
5
  SHA512:
6
- metadata.gz: d5e9f4ab2800182ebc7f8e7d1c975ba49a4d9e079731856f05cb85ae30e610a8cd9c51db8354aae4a4a6b99bfdd512a544dabc2d518e2fb089f96bd86be82976
7
- data.tar.gz: ce37e66c311f9b1ddf7d2bd24b9213be4b1a676913bcc6e4242d4b9e84dcf175a5d4ea6e4fa4400ad3bf42d0bbf10e3829f1c145330bad916b581731e871aa30
6
+ metadata.gz: 3f356e6fa7d39314ae4092533fcadc7dcd062911875b3d3cf934612015fbdac1be55be66436ffa30ad8e14077c30717fe54ff5d8d4fc7699bc57c9f04ad7570f
7
+ data.tar.gz: 22c2606e3449c338a3c9dfae4e9f8b5c60a8c172072859dc0aa5ac79be3779dcf337d30a8d351e9273356c74cdc19a235f9267c06aab0405511049971f3fe206
data/CHANGELOG.md CHANGED
@@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.24.0] - 11-28-2023
11
+
12
+ - Code cleanup and separate arguments with whitespace in Fujitsu TCS adapter by @mnakao in https://github.com/OSC/ood_core/pull/808
13
+ - Add OUT_OF_MEMORY state for Slurm by @robinkar in https://github.com/OSC/ood_core/pull/809
14
+ - find_port: avoid infinite loop by @utkarshayachit in https://github.com/OSC/ood_core/pull/811
15
+ - handle find_port error codes by @utkarshayachit in https://github.com/OSC/ood_core/pull/812
16
+ - vnc: run websockify as background process by @utkarshayachit in https://github.com/OSC/ood_core/pull/813
17
+ - Add working_dir option for Fujitsu TCS job scheduler by @mnakao in https://github.com/OSC/ood_core/pull/816
18
+ - Minor fix for Fujitsu TCS by @mnakao in https://github.com/OSC/ood_core/pull/817
19
+ - Update rake requirement from ~> 13.0.1 to ~> 13.1.0 by @dependabot in https://github.com/OSC/ood_core/pull/814
20
+ - Changes default return value for cluster.batch_connect_ssh_allow? by @HazelGrant in https://github.com/OSC/ood_core/pull/818
21
+
10
22
  ## [0.23.5] - 04-10-2023
11
23
 
12
24
  ### Fixed
@@ -162,14 +162,28 @@ module OodCore
162
162
  export -f port_used
163
163
 
164
164
  # Find available port in range [$2..$3] for host $1
165
- # Default: [#{min_port}..#{max_port}]
165
+ # Default host: localhost
166
+ # Default port range: [#{min_port}..#{max_port}]
167
+ # returns error code (0: success, 1: failed)
168
+ # On success, the chosen port is echoed on stdout.
166
169
  find_port () {
167
170
  local host="${1:-localhost}"
168
- local port=$(random_number "${2:-#{min_port}}" "${3:-#{max_port}}")
169
- while port_used "${host}:${port}"; do
170
- port=$(random_number "${2:-#{min_port}}" "${3:-#{max_port}}")
171
+ local min_port=${2:-#{min_port}}
172
+ local max_port=${3:-#{max_port}}
173
+ local port_range=($(shuf -i ${min_port}-${max_port}))
174
+ local retries=1 # number of retries over the port range if first attempt fails
175
+ for ((attempt=0; attempt<=$retries; attempt++)); do
176
+ for port in "${port_range[@]}"; do
177
+ if port_used "${host}:${port}"; then
178
+ continue
179
+ fi
180
+ echo "${port}"
181
+ return 0 # success
182
+ done
171
183
  done
172
- echo "${port}"
184
+
185
+ echo "error: failed to find available port in range ${min_port}..${max_port}" >&2
186
+ return 1 # failure
173
187
  }
174
188
  export -f find_port
175
189
 
@@ -134,10 +134,45 @@ module OodCore
134
134
  <<-EOT.gsub(/^ {14}/, "")
135
135
  #{super}
136
136
 
137
+ # launches websockify in the background; waiting until the process
138
+ # has started proxying successfully.
139
+ start_websockify() {
140
+ local log_file="./websockify.log"
141
+ # launch websockify in background and redirect all output to a file.
142
+ #{websockify_cmd} $1 $2 &> $log_file &
143
+ local ws_pid=$!
144
+ local counter=0
145
+
146
+ # wait till websockify has successfully started
147
+ echo "[websockify]: pid: $ws_pid (proxying $1 ==> $2)" >&2
148
+ echo "[websockify]: log file: $log_file" >&2
149
+ echo "[websockify]: waiting ..." >&2
150
+ until grep -q -i "proxying from :$1" $log_file
151
+ do
152
+ if ! ps $ws_pid > /dev/null; then
153
+ echo "[websockify]: failed to launch!" >&2
154
+ return 1
155
+ elif [ $counter -ge 5 ]; then
156
+ # timeout after ~5 seconds
157
+ echo "[websockify]: timed-out :(!" >&2
158
+ return 1
159
+ else
160
+ sleep 1
161
+ ((counter=counter+1))
162
+ fi
163
+ done
164
+ echo "[websockify]: started successfully (proxying $1 ==> $2)" >&2
165
+ echo $ws_pid
166
+ return 0
167
+ }
168
+
137
169
  # Launch websockify websocket server
138
170
  echo "Starting websocket server..."
139
171
  websocket=$(find_port)
140
- #{websockify_cmd} -D ${websocket} localhost:${port}
172
+ [ $? -eq 0 ] || clean_up 1 # give up if port not found
173
+
174
+ ws_pid=$(start_websockify ${websocket} localhost:${port})
175
+ [ $? -eq 0 ] || clean_up 1 # give up if websockify launch failed
141
176
 
142
177
  # Set up background process that scans the log file for successful
143
178
  # connections by users, and change the password after every
@@ -173,6 +173,7 @@ module OodCore
173
173
  module load #{container_module}
174
174
  echo "Starting websocket server..."
175
175
  websocket=$(find_port)
176
+ [ $? -eq 0 ] || clean_up 1 # give up if port not found
176
177
  #{container_command} exec instance://#{@instance_name} #{websockify_cmd} -D ${websocket} localhost:${port}
177
178
 
178
179
  # Set up background process that scans the log file for successful
@@ -165,9 +165,9 @@ module OodCore
165
165
  # @return [Boolean, nil] whether cluster supports SSH to batch connect node
166
166
  def batch_connect_ssh_allow?
167
167
  return @batch_connect_ssh_allow if defined?(@batch_connect_ssh_allow)
168
- return @batch_connect_ssh_allow = nil if batch_connect_config.nil?
168
+ return @batch_connect_ssh_allow = true if batch_connect_config.nil?
169
169
 
170
- @batch_connect_ssh_allow = batch_connect_config.fetch(:ssh_allow, nil)
170
+ @batch_connect_ssh_allow = batch_connect_config.fetch(:ssh_allow, true)
171
171
  end
172
172
 
173
173
  # The comparison operator
@@ -12,11 +12,13 @@ module OodCore
12
12
  # @param config [#to_h] the configuration for job adapter
13
13
  # @option config [Object] :bin (nil) Path to Fujitsu TCS resource manager binaries
14
14
  # @option config [#to_h] :bin_overrides ({}) Optional overrides to Fujitsu TCS resource manager executables
15
+ # @option config [Object] :working_dir (nil) Working directory for submitting a batch script
15
16
  def self.build_fujitsu_tcs(config)
16
17
  c = config.to_h.symbolize_keys
17
18
  bin = c.fetch(:bin, nil)
18
19
  bin_overrides = c.fetch(:bin_overrides, {})
19
- fujitsu_tcs = Adapters::Fujitsu_TCS::Batch.new(bin: bin, bin_overrides: bin_overrides)
20
+ working_dir = c.fetch(:working_dir, nil)
21
+ fujitsu_tcs = Adapters::Fujitsu_TCS::Batch.new(bin: bin, bin_overrides: bin_overrides, working_dir: working_dir)
20
22
  Adapters::Fujitsu_TCS.new(fujitsu_tcs: fujitsu_tcs)
21
23
  end
22
24
  end
@@ -43,6 +45,11 @@ module OodCore
43
45
  # @return Hash<String, String>
44
46
  attr_reader :bin_overrides
45
47
 
48
+ # Working directory for submitting a batch script
49
+ # @example
50
+ # my_batch.working_dir #=> Dir.home or Dir.pwd
51
+ attr_reader :working_dir
52
+
46
53
  # The root exception class that all Fujitsu TCS specific exceptions inherit
47
54
  # from
48
55
  class Error < StandardError; end
@@ -52,9 +59,17 @@ module OodCore
52
59
 
53
60
  # @param bin [#to_s] path to Fujitsu TCS installation binaries
54
61
  # @param bin_overrides [#to_h] a hash of bin overrides to be used in job
55
- def initialize(bin: nil, bin_overrides: {})
62
+ # @param working_dir [String, nil] Working directory for submitting a batch script: nil selects Dir.pwd, "HOME" selects Dir.home, any other value raises
63
+ def initialize(bin: nil, bin_overrides: {}, working_dir: nil)
56
64
  @bin = Pathname.new(bin.to_s)
57
65
  @bin_overrides = bin_overrides
66
+ if working_dir == nil
67
+ @working_dir = Dir.pwd
68
+ elsif working_dir == "HOME"
69
+ @working_dir = Dir.home
70
+ else
71
+ raise(StandardError, "Unknown working_dir")
72
+ end
58
73
  end
59
74
 
60
75
  # Get a list of hashes detailing each of the jobs on the batch server
@@ -79,19 +94,19 @@ module OodCore
79
94
  # @raise [Error] if `pjstat` command exited unsuccessfully
80
95
  # @return [Array<Hash>] list of details for jobs
81
96
  def get_jobs(id: "", owner: nil)
82
- args = ["-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
83
- args.concat ["--filter jid=" + id.to_s] unless id.to_s.empty?
84
- args.concat ["--filter usr=" + owner.to_s] unless owner.to_s.empty?
97
+ args = ["-A", "-s", "--data", "--choose=jid,jnam,rscg,st,std,stde,adt,sdt,nnumr,usr,elpl,elp"]
98
+ args.concat ["--filter", "jid=" + id.to_s] unless id.to_s.empty?
99
+ args.concat ["--filter", "usr=" + owner.to_s] unless owner.to_s.empty?
85
100
 
86
101
  StringIO.open(call("pjstat", *args)) do |output|
87
102
  output.gets() # Skip header
88
103
  jobs = []
89
104
  output.each_line do |line|
90
105
  l = line.split(",")
91
- jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split(" ")[0],
106
+ jobs << {:JOB_ID => l[1], :JOB_NAME => l[2], :RSC_GRP => l[3].split[0],
92
107
  :ST => l[4], :STD => l[5], :STDE => l[6],
93
108
  :ACCEPT => l[7], :START_DATE => l[8], :NODES => l[9].split(":")[0],
94
- :USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split(" ")[0] }
109
+ :USER => l[10], :ELAPSE_LIM => l[11], :ELAPSE_TIM => l[12].split[0] }
95
110
  end
96
111
  jobs
97
112
  end
@@ -136,16 +151,18 @@ module OodCore
136
151
  # @return [String] the id of the job that was created
137
152
  def submit_string(str, args: [])
138
153
  args = args.map(&:to_s)
139
- call("pjsub", *args, stdin: str.to_s).split(" ")[5]
154
+ call("pjsub", *args, stdin: str.to_s).split[5]
140
155
  end
141
156
 
142
157
  private
143
158
  # Call a forked Fujitsu TCS command
144
159
  def call(cmd, *args, stdin: "")
145
160
  cmd = OodCore::Job::Adapters::Helper.bin_path(cmd, bin, bin_overrides)
146
- args = args.map(&:to_s)
147
- o, e, s = Open3.capture3(cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
148
- s.success? ? o : raise(Error, e)
161
+ args = args.map(&:to_s)
162
+ Dir.chdir(working_dir) do
163
+ o, e, s = Open3.capture3(cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
164
+ s.success? ? o : raise(Error, e)
165
+ end
149
166
  end
150
167
  end
151
168
 
@@ -221,12 +238,12 @@ module OodCore
221
238
  else
222
239
  args.concat ["-e", script.error_path]
223
240
  end
224
- args.concat ["-L rscgrp=" + script.queue_name] unless script.queue_name.nil?
241
+ args.concat ["-L", "rscgrp=" + script.queue_name] unless script.queue_name.nil?
225
242
  args.concat ["-p", script.priority] unless script.priority.nil?
226
243
 
227
244
  # start_time: <%= Time.local(2023,11,22,13,4).to_i %> in form.yml.erb
228
245
  args.concat ["--at", script.start_time.localtime.strftime("%C%y%m%d%H%M")] unless script.start_time.nil?
229
- args.concat ["-L elapse=" + seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
246
+ args.concat ["-L", "elapse=" + seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
230
247
  args.concat ["--bulk", "--sparam", script.job_array_request] unless script.job_array_request.nil?
231
248
 
232
249
  # Set environment variables
@@ -368,7 +385,7 @@ module OodCore
368
385
  private
369
386
  # Convert duration to seconds
370
387
  def duration_in_seconds(time)
371
- return 0 if time.nil?
388
+ return 0 if time.nil? or time == "-"
372
389
  time, days = time.split("-").reverse
373
390
  days.to_i * 24 * 3600 +
374
391
  time.split(':').map { |v| v.to_i }.inject(0) { |total, v| total * 60 + v }
@@ -436,7 +436,8 @@ module OodCore
436
436
  'SE' => :completed, # SPECIAL_EXIT
437
437
  'ST' => :running, # STOPPED
438
438
  'S' => :suspended, # SUSPENDED
439
- 'TO' => :completed # TIMEOUT
439
+ 'TO' => :completed, # TIMEOUT
440
+ 'OOM' => :completed # OUT_OF_MEMORY
440
441
  }
441
442
 
442
443
  # @api private
@@ -1,4 +1,4 @@
1
1
  module OodCore
2
2
  # The current version of {OodCore}
3
- VERSION = "0.23.5"
3
+ VERSION = "0.24.0"
4
4
  end
data/ood_core.gemspec CHANGED
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
27
27
  spec.add_runtime_dependency "rexml", "~> 3.2"
28
28
  spec.add_development_dependency "bundler", "~> 2.1"
29
- spec.add_development_dependency "rake", "~> 13.0.1"
29
+ spec.add_development_dependency "rake", "~> 13.1.0"
30
30
  spec.add_development_dependency "rspec", "~> 3.0"
31
31
  spec.add_development_dependency "pry", "~> 0.10"
32
32
  spec.add_development_dependency "timecop", "~> 0.8"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ood_core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.23.5
4
+ version: 0.24.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2023-04-10 00:00:00.000000000 Z
13
+ date: 2023-11-28 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ood_support
@@ -80,14 +80,14 @@ dependencies:
80
80
  requirements:
81
81
  - - "~>"
82
82
  - !ruby/object:Gem::Version
83
- version: 13.0.1
83
+ version: 13.1.0
84
84
  type: :development
85
85
  prerelease: false
86
86
  version_requirements: !ruby/object:Gem::Requirement
87
87
  requirements:
88
88
  - - "~>"
89
89
  - !ruby/object:Gem::Version
90
- version: 13.0.1
90
+ version: 13.1.0
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: rspec
93
93
  requirement: !ruby/object:Gem::Requirement