ood_core 0.19.0 → 0.20.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -1
- data/lib/ood_core/batch_connect/templates/vnc.rb +6 -1
- data/lib/ood_core/job/adapter.rb +9 -0
- data/lib/ood_core/job/adapters/ccq.rb +2 -2
- data/lib/ood_core/job/adapters/kubernetes/batch.rb +1 -1
- data/lib/ood_core/job/adapters/kubernetes/helper.rb +1 -1
- data/lib/ood_core/job/adapters/kubernetes/k8s_job_info.rb +3 -3
- data/lib/ood_core/job/adapters/lsf.rb +1 -1
- data/lib/ood_core/job/adapters/slurm.rb +31 -1
- data/lib/ood_core/job/cluster_info.rb +32 -0
- data/lib/ood_core/job/info.rb +13 -2
- data/lib/ood_core/version.rb +1 -1
- data/lib/ood_core.rb +1 -0
- data/ood_core.gemspec +3 -2
- metadata +20 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c4e013f80e987d4d1cefbc78cc76bcff52e4083e0b84192b42807ae46806946
|
4
|
+
data.tar.gz: c4a1607904baccc1b063916ecf8e5a9692a9c0102a0d8cda3a9edf0ae760191f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ab3333366fc7802d59a15dead3b21e863d0017385053eea629a109a076c6e768ed1575378a34e68bb6c163b050be87a9cf323f087d02e4e2be4d349550bf5531
|
7
|
+
data.tar.gz: 234c13fbbc428717532bd93ba4e977cfd825e480c69daf66c737165ed7c5d8a951c329ced0312d525efc0b70cb4d11234c016c6216c2bb7f74573de854340889
|
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
7
7
|
|
8
8
|
## [Unreleased]
|
9
9
|
|
10
|
+
## [0.20.2] - 07-28-2022
|
11
|
+
|
12
|
+
- Fixed an issue with Slurm's `cluster_info` in [762](https://github.com/OSC/ood_core/pull/762).
|
13
|
+
- Relaxed Ruby requirement down to 2.5 in [771](https://github.com/OSC/ood_core/pull/771).
|
14
|
+
|
15
|
+
## [0.20.1] - 07-21-2022
|
16
|
+
|
17
|
+
- Fixed turbovnc compatibility issue with the -nohttpd flag in [767](https://github.com/OSC/ood_core/pull/767).
|
18
|
+
|
19
|
+
## [0.20.0] - 06-03-2022
|
20
|
+
|
21
|
+
- Adapters can now respond to `cluster_info` in [752](https://github.com/OSC/ood_core/pull/752). This returns information about the cluster like how many nodes are available and so on. Only Slurm support in this release.
|
22
|
+
- `OodCore::Job::Info` now has a `gpus` attribute in [753](https://github.com/OSC/ood_core/pull/753). Only Slurm support in this release.
|
23
|
+
- Support Ruby 3 in [759](https://github.com/OSC/ood_core/pull/759)
|
24
|
+
|
10
25
|
## [0.19.0] - 02-03-2022
|
11
26
|
|
12
27
|
### Added
|
@@ -422,7 +437,10 @@ Functionally the same as [0.17.3] but with some CI updates.
|
|
422
437
|
### Added
|
423
438
|
- Initial release!
|
424
439
|
|
425
|
-
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.
|
440
|
+
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.20.2...HEAD
|
441
|
+
[0.20.2]: https://github.com/OSC/ood_core/compare/v0.20.1...v0.20.2
|
442
|
+
[0.20.1]: https://github.com/OSC/ood_core/compare/v0.20.0...v0.20.1
|
443
|
+
[0.20.0]: https://github.com/OSC/ood_core/compare/v0.19.0...v0.20.0
|
426
444
|
[0.19.0]: https://github.com/OSC/ood_core/compare/v0.18.1...v0.19.0
|
427
445
|
[0.18.1]: https://github.com/OSC/ood_core/compare/v0.18.0...v0.18.1
|
428
446
|
[0.18.0]: https://github.com/OSC/ood_core/compare/v0.17.8...v0.18.0
|
@@ -86,8 +86,13 @@ module OodCore
|
|
86
86
|
# Clean up any old VNC sessions that weren't cleaned before
|
87
87
|
#{vnc_clean}
|
88
88
|
|
89
|
+
# for turbovnc 3.0 compatability.
|
90
|
+
if timeout 2 vncserver --help 2>&1 | grep 'nohttpd' >/dev/null 2>&1; then
|
91
|
+
HTTPD_OPT='-nohttpd'
|
92
|
+
fi
|
93
|
+
|
89
94
|
# Attempt to start VNC server
|
90
|
-
VNC_OUT=$(vncserver -log "#{vnc_log}" -rfbauth "#{vnc_passwd}"
|
95
|
+
VNC_OUT=$(vncserver -log "#{vnc_log}" -rfbauth "#{vnc_passwd}" $HTTPD_OPT -noxstartup #{vnc_args} 2>&1)
|
91
96
|
VNC_PID=$(pgrep -s 0 Xvnc) # the script above will daemonize the Xvnc process
|
92
97
|
echo "${VNC_OUT}"
|
93
98
|
|
data/lib/ood_core/job/adapter.rb
CHANGED
@@ -33,6 +33,15 @@ module OodCore
|
|
33
33
|
raise NotImplementedError, "subclass did not define #submit"
|
34
34
|
end
|
35
35
|
|
36
|
+
# Retrieve the number of active and total cpus, nodes, and gpus
|
37
|
+
# @abstract Subclass is expected to implement {#cluster_info}
|
38
|
+
# @raise [NotImplementedError] if subclass did not define {#cluster_info}
|
39
|
+
# @return [ClusterInfo] Object containing quantified statistics about the
|
40
|
+
# cluster's active/total cpus, nodes, and gpus
|
41
|
+
def cluster_info
|
42
|
+
raise NotImplementedError, "subclass did not define #cluster_stats"
|
43
|
+
end
|
44
|
+
|
36
45
|
# Retrieve info for all jobs from the resource manager
|
37
46
|
# @abstract Subclass is expected to implement {#info_all}
|
38
47
|
# @raise [NotImplementedError] if subclass did not define {#info_all}
|
@@ -228,7 +228,7 @@ module OodCore
|
|
228
228
|
data_hash[:submission_time] = raw['dateSubmitted'].to_i
|
229
229
|
data_hash[:queue_name] = raw['criteriaPriority']
|
230
230
|
|
231
|
-
Info.new(data_hash)
|
231
|
+
Info.new(**data_hash)
|
232
232
|
end
|
233
233
|
|
234
234
|
# extended data is just lines of 'key: value' value, so parse
|
@@ -242,7 +242,7 @@ module OodCore
|
|
242
242
|
|
243
243
|
data.to_s.lines.drop(1).each do |line|
|
244
244
|
match_data = ccqstat_regex.match(line)
|
245
|
-
infos << Info.new(ccqstat_match_to_hash(match_data)) if valid_ccqstat_match?(match_data)
|
245
|
+
infos << Info.new(**ccqstat_match_to_hash(match_data)) if valid_ccqstat_match?(match_data)
|
246
246
|
end
|
247
247
|
|
248
248
|
infos
|
@@ -93,7 +93,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
93
93
|
|
94
94
|
def info(id)
|
95
95
|
pod_json = safe_call('get', 'pod', id)
|
96
|
-
return OodCore::Job::Info.new({ id: id, status: 'completed' }) if pod_json.empty?
|
96
|
+
return OodCore::Job::Info.new(**{ id: id, status: 'completed' }) if pod_json.empty?
|
97
97
|
|
98
98
|
service_json = safe_call('get', 'service', service_name(id))
|
99
99
|
secret_json = safe_call('get', 'secret', secret_name(id))
|
@@ -31,7 +31,7 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
31
31
|
|
32
32
|
pod_hash.deep_merge!(service_hash)
|
33
33
|
pod_hash.deep_merge!(secret_hash)
|
34
|
-
OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(pod_hash)
|
34
|
+
OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(**pod_hash)
|
35
35
|
rescue NoMethodError
|
36
36
|
raise K8sDataError, "unable to read data correctly from json"
|
37
37
|
end
|
@@ -2,8 +2,8 @@
|
|
2
2
|
class OodCore::Job::Adapters::Kubernetes::K8sJobInfo < OodCore::Job::Info
|
3
3
|
attr_reader :ood_connection_info
|
4
4
|
|
5
|
-
def initialize(
|
6
|
-
super(options)
|
7
|
-
@ood_connection_info = ood_connection_info
|
5
|
+
def initialize(options)
|
6
|
+
super(**options)
|
7
|
+
@ood_connection_info = options[:ood_connection_info]
|
8
8
|
end
|
9
9
|
end
|
@@ -16,7 +16,7 @@ module OodCore
|
|
16
16
|
# @option config [#to_h] :bin_overrides ({}) Optional overrides to LSF client executables
|
17
17
|
# @option config [#to_s] :submit_host ('') Host to submit commands to
|
18
18
|
def self.build_lsf(config)
|
19
|
-
batch = Adapters::Lsf::Batch.new(config.to_h.symbolize_keys)
|
19
|
+
batch = Adapters::Lsf::Batch.new(**config.to_h.symbolize_keys)
|
20
20
|
Adapters::Lsf.new(batch: batch)
|
21
21
|
end
|
22
22
|
end
|
@@ -36,6 +36,13 @@ module OodCore
|
|
36
36
|
using Refinements::HashExtensions
|
37
37
|
using Refinements::ArrayExtensions
|
38
38
|
|
39
|
+
# Get integer representing the number of gpus used by a node or job,
|
40
|
+
# calculated from gres string
|
41
|
+
# @return [Integer] the number of gpus in gres
|
42
|
+
def self.gpus_from_gres(gres)
|
43
|
+
gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum
|
44
|
+
end
|
45
|
+
|
39
46
|
# Object used for simplified communication with a Slurm batch server
|
40
47
|
# @api private
|
41
48
|
class Batch
|
@@ -98,6 +105,22 @@ module OodCore
|
|
98
105
|
@strict_host_checking = strict_host_checking
|
99
106
|
end
|
100
107
|
|
108
|
+
# Get a ClusterInfo object containing information about the given cluster
|
109
|
+
# @return [ClusterInfo] object containing cluster details
|
110
|
+
def get_cluster_info
|
111
|
+
node_cpu_info = call("sinfo", "-aho %A/%D/%C").strip.split('/')
|
112
|
+
gres_length = call("sinfo", "-o %G").lines.map(&:strip).map(&:length).max + 2
|
113
|
+
gres_lines = call("sinfo", "-ahNO ,nodehost,gres:#{gres_length},gresused:#{gres_length}")
|
114
|
+
.lines.uniq.map(&:split)
|
115
|
+
ClusterInfo.new(active_nodes: node_cpu_info[0].to_i,
|
116
|
+
total_nodes: node_cpu_info[2].to_i,
|
117
|
+
active_processors: node_cpu_info[3].to_i,
|
118
|
+
total_processors: node_cpu_info[6].to_i,
|
119
|
+
active_gpus: gres_lines.sum { |line| Slurm.gpus_from_gres(line[2]) },
|
120
|
+
total_gpus: gres_lines.sum { |line| Slurm.gpus_from_gres(line[1]) }
|
121
|
+
)
|
122
|
+
end
|
123
|
+
|
101
124
|
# Get a list of hashes detailing each of the jobs on the batch server
|
102
125
|
# @example Status info for all jobs
|
103
126
|
# my_batch.get_jobs
|
@@ -454,6 +477,12 @@ module OodCore
|
|
454
477
|
raise JobAdapterError, e.message
|
455
478
|
end
|
456
479
|
|
480
|
+
# Retrieve info about active and total cpus, gpus, and nodes
|
481
|
+
# @return [ClusterInfo] information about cluster usage
|
482
|
+
def cluster_info
|
483
|
+
@slurm.get_cluster_info
|
484
|
+
end
|
485
|
+
|
457
486
|
# Retrieve info for all jobs from the resource manager
|
458
487
|
# @raise [JobAdapterError] if something goes wrong getting job info
|
459
488
|
# @return [Array<Info>] information describing submitted jobs
|
@@ -643,7 +672,8 @@ module OodCore
|
|
643
672
|
cpu_time: nil,
|
644
673
|
submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
|
645
674
|
dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
|
646
|
-
native: v
|
675
|
+
native: v,
|
676
|
+
gpus: self.class.gpus_from_gres(v[:gres])
|
647
677
|
)
|
648
678
|
end
|
649
679
|
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module OodCore
|
2
|
+
module Job
|
3
|
+
# An object that contains details about the cluster's active and total nodes, processors, and gpus
|
4
|
+
class ClusterInfo
|
5
|
+
using Refinements::HashExtensions
|
6
|
+
|
7
|
+
attr_reader :active_nodes, :total_nodes, :active_processors, :total_processors, :active_gpu_nodes,
|
8
|
+
:total_gpu_nodes, :active_gpus, :total_gpus
|
9
|
+
|
10
|
+
def initialize(opts = {})
|
11
|
+
opts = opts.transform_keys(&:to_sym)
|
12
|
+
@active_nodes = opts.fetch(:active_nodes, nil).to_i
|
13
|
+
@total_nodes = opts.fetch(:total_nodes, nil).to_i
|
14
|
+
@active_processors = opts.fetch(:active_processors, nil).to_i
|
15
|
+
@total_processors = opts.fetch(:total_processors, nil).to_i
|
16
|
+
@active_gpus = opts.fetch(:active_gpus, nil).to_i
|
17
|
+
@total_gpus = opts.fetch(:total_gpus, nil).to_i
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_h
|
21
|
+
{
|
22
|
+
active_nodes: active_nodes,
|
23
|
+
total_nodes: total_nodes,
|
24
|
+
active_processors: active_processors,
|
25
|
+
total_processors: total_processors,
|
26
|
+
active_gpus: active_gpus,
|
27
|
+
total_gpus: total_gpus
|
28
|
+
}
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/lib/ood_core/job/info.rb
CHANGED
@@ -65,6 +65,10 @@ module OodCore
|
|
65
65
|
# @return [Object] native info
|
66
66
|
attr_reader :native
|
67
67
|
|
68
|
+
# Number of gpus allocated for job
|
69
|
+
# @return [Integer, nil] allocated total number of gpus
|
70
|
+
attr_reader :gpus
|
71
|
+
|
68
72
|
# List of job array child task statuses
|
69
73
|
# @note only relevant for job arrays
|
70
74
|
# @return [Array<Task>] tasks
|
@@ -86,15 +90,16 @@ module OodCore
|
|
86
90
|
# @param dispatch_time [#to_i, nil] dispatch time
|
87
91
|
# @param tasks [Array<Hash>] tasks e.g. { id: '12345.owens-batch', status: :running }
|
88
92
|
# @param native [Object] native info
|
93
|
+
# @param gpus [#to_i, nil] allocated total number of gpus (defaults to 0)
|
89
94
|
def initialize(id:, status:, allocated_nodes: [], submit_host: nil,
|
90
95
|
job_name: nil, job_owner: nil, accounting_id: nil,
|
91
96
|
procs: nil, queue_name: nil, wallclock_time: nil,
|
92
97
|
wallclock_limit: nil, cpu_time: nil, submission_time: nil,
|
93
|
-
dispatch_time: nil, native: nil, tasks: [],
|
98
|
+
dispatch_time: nil, native: nil, gpus: 0, tasks: [],
|
94
99
|
**_)
|
95
100
|
@id = id.to_s
|
96
101
|
@status = Status.new(state: status.to_sym)
|
97
|
-
@allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(n.to_h) }
|
102
|
+
@allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(**n.to_h) }
|
98
103
|
@submit_host = submit_host && submit_host.to_s
|
99
104
|
@job_name = job_name && job_name.to_s
|
100
105
|
@job_owner = job_owner && job_owner.to_s
|
@@ -111,6 +116,7 @@ module OodCore
|
|
111
116
|
@status = job_array_aggregate_status unless @tasks.empty?
|
112
117
|
|
113
118
|
@native = native
|
119
|
+
@gpus = gpus && gpus.to_i
|
114
120
|
end
|
115
121
|
|
116
122
|
# Create a new Info for a child task
|
@@ -147,10 +153,15 @@ module OodCore
|
|
147
153
|
submission_time: submission_time,
|
148
154
|
dispatch_time: dispatch_time,
|
149
155
|
native: native,
|
156
|
+
gpus: gpus,
|
150
157
|
tasks: tasks
|
151
158
|
}
|
152
159
|
end
|
153
160
|
|
161
|
+
def gpu?
|
162
|
+
gpus.positive?
|
163
|
+
end
|
164
|
+
|
154
165
|
# The comparison operator
|
155
166
|
# @param other [#to_h] object to compare against
|
156
167
|
# @return [Boolean] whether objects are equivalent
|
data/lib/ood_core/version.rb
CHANGED
data/lib/ood_core.rb
CHANGED
data/ood_core.gemspec
CHANGED
@@ -20,14 +20,15 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.bindir = "exe"
|
21
21
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
22
22
|
spec.require_paths = ["lib"]
|
23
|
-
spec.required_ruby_version = ">= 2.
|
23
|
+
spec.required_ruby_version = ">= 2.5.0"
|
24
24
|
|
25
25
|
spec.add_runtime_dependency "ood_support", "~> 0.0.2"
|
26
26
|
spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
|
27
|
+
spec.add_runtime_dependency "rexml", "~> 3.2"
|
27
28
|
spec.add_development_dependency "bundler", "~> 2.1"
|
28
29
|
spec.add_development_dependency "rake", "~> 13.0.1"
|
29
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
30
31
|
spec.add_development_dependency "pry", "~> 0.10"
|
31
32
|
spec.add_development_dependency "timecop", "~> 0.8"
|
32
|
-
spec.add_development_dependency "climate_control", "~> 1.
|
33
|
+
spec.add_development_dependency "climate_control", "~> 1.1.1"
|
33
34
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ood_core
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.20.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Franz
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date: 2022-
|
13
|
+
date: 2022-07-28 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: ood_support
|
@@ -46,6 +46,20 @@ dependencies:
|
|
46
46
|
- - ">="
|
47
47
|
- !ruby/object:Gem::Version
|
48
48
|
version: 1.9.6
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: rexml
|
51
|
+
requirement: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - "~>"
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '3.2'
|
56
|
+
type: :runtime
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - "~>"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '3.2'
|
49
63
|
- !ruby/object:Gem::Dependency
|
50
64
|
name: bundler
|
51
65
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,14 +136,14 @@ dependencies:
|
|
122
136
|
requirements:
|
123
137
|
- - "~>"
|
124
138
|
- !ruby/object:Gem::Version
|
125
|
-
version: 1.
|
139
|
+
version: 1.1.1
|
126
140
|
type: :development
|
127
141
|
prerelease: false
|
128
142
|
version_requirements: !ruby/object:Gem::Requirement
|
129
143
|
requirements:
|
130
144
|
- - "~>"
|
131
145
|
- !ruby/object:Gem::Version
|
132
|
-
version: 1.
|
146
|
+
version: 1.1.1
|
133
147
|
description: Open OnDemand core library that provides support for an HPC Center to
|
134
148
|
globally define HPC services that web applications can then take advantage of.
|
135
149
|
email:
|
@@ -196,6 +210,7 @@ files:
|
|
196
210
|
- lib/ood_core/job/adapters/torque/error.rb
|
197
211
|
- lib/ood_core/job/adapters/torque/ffi.rb
|
198
212
|
- lib/ood_core/job/array_ids.rb
|
213
|
+
- lib/ood_core/job/cluster_info.rb
|
199
214
|
- lib/ood_core/job/factory.rb
|
200
215
|
- lib/ood_core/job/info.rb
|
201
216
|
- lib/ood_core/job/node_info.rb
|
@@ -219,7 +234,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
219
234
|
requirements:
|
220
235
|
- - ">="
|
221
236
|
- !ruby/object:Gem::Version
|
222
|
-
version: 2.
|
237
|
+
version: 2.5.0
|
223
238
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
224
239
|
requirements:
|
225
240
|
- - ">="
|