ood_core 0.19.0 → 0.20.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/ood_core/job/adapter.rb +9 -0
- data/lib/ood_core/job/adapters/ccq.rb +2 -2
- data/lib/ood_core/job/adapters/kubernetes/batch.rb +1 -1
- data/lib/ood_core/job/adapters/kubernetes/helper.rb +1 -1
- data/lib/ood_core/job/adapters/kubernetes/k8s_job_info.rb +3 -3
- data/lib/ood_core/job/adapters/lsf.rb +1 -1
- data/lib/ood_core/job/adapters/slurm.rb +31 -1
- data/lib/ood_core/job/cluster_info.rb +32 -0
- data/lib/ood_core/job/info.rb +13 -2
- data/lib/ood_core/version.rb +1 -1
- data/lib/ood_core.rb +1 -0
- data/ood_core.gemspec +2 -1
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6c4dd83d1f69c0bf61d4ddc57b1b7aef23e3309e75988121c5b502dbc35f5208
|
4
|
+
data.tar.gz: ec1a80f736557d1648c11b4cb3339c337d7ae9c0183117f14a74ab88dc7cc9c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8df8478fb9fb591e1c69174c620895027c9fd73a4d4a87b4db9bc064b024679f65d3799a23407b9c363962aed529398596a4fdfbbd61f5f06f0a19f3188cd9be
|
7
|
+
data.tar.gz: 5eab3b6b13b80fac696ce5417f5f61e05952351572a09e57ee196cdbeec043d280d491a3525ad1e53c940f6cb9ac9f58480c03071dddeb4a373f0dff4ad3d527
|
data/CHANGELOG.md
CHANGED
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
7
7
|
|
8
8
|
## [Unreleased]
|
9
9
|
|
10
|
+
## [0.20.0] - 06-03-2022
|
11
|
+
|
12
|
+
- Adapters can now respond to `cluster_info` in [752](https://github.com/OSC/ood_core/pull/752). This returns information about the cluster, such as how many nodes are available. Only Slurm is supported in this release.
|
13
|
+
- `OodCore::Job::Info` now has a `gpus` attribute in [753](https://github.com/OSC/ood_core/pull/753). Only Slurm is supported in this release.
|
14
|
+
- Support Ruby 3 in [759](https://github.com/OSC/ood_core/pull/759)
|
15
|
+
|
10
16
|
## [0.19.0] - 02-03-2022
|
11
17
|
|
12
18
|
### Added
|
data/lib/ood_core/job/adapter.rb
CHANGED
@@ -33,6 +33,15 @@ module OodCore
|
|
33
33
|
raise NotImplementedError, "subclass did not define #submit"
|
34
34
|
end
|
35
35
|
|
36
|
+
# Retrieve the number of active and total cpus, nodes, and gpus
|
37
|
+
# @abstract Subclass is expected to implement {#cluster_info}
|
38
|
+
# @raise [NotImplementedError] if subclass did not define {#cluster_info}
|
39
|
+
# @return [ClusterInfo] Object containing quantified statistics about the
|
40
|
+
# cluster's active/total cpus, nodes, and gpus
|
41
|
+
def cluster_info
|
42
|
+
raise NotImplementedError, "subclass did not define #cluster_stats"
|
43
|
+
end
|
44
|
+
|
36
45
|
# Retrieve info for all jobs from the resource manager
|
37
46
|
# @abstract Subclass is expected to implement {#info_all}
|
38
47
|
# @raise [NotImplementedError] if subclass did not define {#info_all}
|
@@ -228,7 +228,7 @@ module OodCore
|
|
228
228
|
data_hash[:submission_time] = raw['dateSubmitted'].to_i
|
229
229
|
data_hash[:queue_name] = raw['criteriaPriority']
|
230
230
|
|
231
|
-
Info.new(data_hash)
|
231
|
+
Info.new(**data_hash)
|
232
232
|
end
|
233
233
|
|
234
234
|
# extended data is just lines of 'key: value' pairs, so parse
|
@@ -242,7 +242,7 @@ module OodCore
|
|
242
242
|
|
243
243
|
data.to_s.lines.drop(1).each do |line|
|
244
244
|
match_data = ccqstat_regex.match(line)
|
245
|
-
infos << Info.new(ccqstat_match_to_hash(match_data)) if valid_ccqstat_match?(match_data)
|
245
|
+
infos << Info.new(**ccqstat_match_to_hash(match_data)) if valid_ccqstat_match?(match_data)
|
246
246
|
end
|
247
247
|
|
248
248
|
infos
|
@@ -93,7 +93,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
93
93
|
|
94
94
|
def info(id)
|
95
95
|
pod_json = safe_call('get', 'pod', id)
|
96
|
-
return OodCore::Job::Info.new({ id: id, status: 'completed' }) if pod_json.empty?
|
96
|
+
return OodCore::Job::Info.new(**{ id: id, status: 'completed' }) if pod_json.empty?
|
97
97
|
|
98
98
|
service_json = safe_call('get', 'service', service_name(id))
|
99
99
|
secret_json = safe_call('get', 'secret', secret_name(id))
|
@@ -31,7 +31,7 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
31
31
|
|
32
32
|
pod_hash.deep_merge!(service_hash)
|
33
33
|
pod_hash.deep_merge!(secret_hash)
|
34
|
-
OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(pod_hash)
|
34
|
+
OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(**pod_hash)
|
35
35
|
rescue NoMethodError
|
36
36
|
raise K8sDataError, "unable to read data correctly from json"
|
37
37
|
end
|
@@ -2,8 +2,8 @@
|
|
2
2
|
class OodCore::Job::Adapters::Kubernetes::K8sJobInfo < OodCore::Job::Info
|
3
3
|
attr_reader :ood_connection_info
|
4
4
|
|
5
|
-
def initialize(
|
6
|
-
super(options)
|
7
|
-
@ood_connection_info = ood_connection_info
|
5
|
+
def initialize(options)
|
6
|
+
super(**options)
|
7
|
+
@ood_connection_info = options[:ood_connection_info]
|
8
8
|
end
|
9
9
|
end
|
@@ -16,7 +16,7 @@ module OodCore
|
|
16
16
|
# @option config [#to_h] :bin_overrides ({}) Optional overrides to LSF client executables
|
17
17
|
# @option config [#to_s] :submit_host ('') Host to submit commands to
|
18
18
|
def self.build_lsf(config)
|
19
|
-
batch = Adapters::Lsf::Batch.new(config.to_h.symbolize_keys)
|
19
|
+
batch = Adapters::Lsf::Batch.new(**config.to_h.symbolize_keys)
|
20
20
|
Adapters::Lsf.new(batch: batch)
|
21
21
|
end
|
22
22
|
end
|
@@ -36,6 +36,13 @@ module OodCore
|
|
36
36
|
using Refinements::HashExtensions
|
37
37
|
using Refinements::ArrayExtensions
|
38
38
|
|
39
|
+
# Get integer representing the number of gpus used by a node or job,
|
40
|
+
# calculated from gres string
|
41
|
+
# @return [Integer] the number of gpus in gres
|
42
|
+
def gpus_from_gres(gres)
|
43
|
+
gres.to_s.scan(/gpu:[^,]*(\d+)/).flatten.map(&:to_i).sum
|
44
|
+
end
|
45
|
+
|
39
46
|
# Object used for simplified communication with a Slurm batch server
|
40
47
|
# @api private
|
41
48
|
class Batch
|
@@ -98,6 +105,22 @@ module OodCore
|
|
98
105
|
@strict_host_checking = strict_host_checking
|
99
106
|
end
|
100
107
|
|
108
|
+
# Get a ClusterInfo object containing information about the given cluster
|
109
|
+
# @return [ClusterInfo] object containing cluster details
|
110
|
+
def get_cluster_info
|
111
|
+
node_cpu_info = call("sinfo", "-aho %A/%D/%C").strip.split('/')
|
112
|
+
gres_length = call("sinfo", "-o %G").lines.map(&:strip).map(&:length).max + 2
|
113
|
+
gres_lines = call("sinfo", "-ahNO ,nodehost,gres:#{gres_length},gresused:#{gres_length}")
|
114
|
+
.lines.uniq.map(&:split)
|
115
|
+
ClusterInfo.new(active_nodes: node_cpu_info[0].to_i,
|
116
|
+
total_nodes: node_cpu_info[2].to_i,
|
117
|
+
active_processors: node_cpu_info[3].to_i,
|
118
|
+
total_processors: node_cpu_info[6].to_i,
|
119
|
+
active_gpus: gres_lines.sum { |line| gpus_from_gres(line[2]) },
|
120
|
+
total_gpus: gres_lines.sum { |line| gpus_from_gres(line[1]) }
|
121
|
+
)
|
122
|
+
end
|
123
|
+
|
101
124
|
# Get a list of hashes detailing each of the jobs on the batch server
|
102
125
|
# @example Status info for all jobs
|
103
126
|
# my_batch.get_jobs
|
@@ -454,6 +477,12 @@ module OodCore
|
|
454
477
|
raise JobAdapterError, e.message
|
455
478
|
end
|
456
479
|
|
480
|
+
# Retrieve info about active and total cpus, gpus, and nodes
|
481
|
+
# @return [ClusterInfo] information about cluster usage
|
482
|
+
def cluster_info
|
483
|
+
@slurm.get_cluster_info
|
484
|
+
end
|
485
|
+
|
457
486
|
# Retrieve info for all jobs from the resource manager
|
458
487
|
# @raise [JobAdapterError] if something goes wrong getting job info
|
459
488
|
# @return [Array<Info>] information describing submitted jobs
|
@@ -643,7 +672,8 @@ module OodCore
|
|
643
672
|
cpu_time: nil,
|
644
673
|
submission_time: v[:submit_time] ? Time.parse(v[:submit_time]) : nil,
|
645
674
|
dispatch_time: (v[:start_time].nil? || v[:start_time] == "N/A") ? nil : Time.parse(v[:start_time]),
|
646
|
-
native: v
|
675
|
+
native: v,
|
676
|
+
gpus: gpus_from_gres(v[:gres])
|
647
677
|
)
|
648
678
|
end
|
649
679
|
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module OodCore
|
2
|
+
module Job
|
3
|
+
# An object that contains details about the cluster's active and total nodes, processors, and gpus
|
4
|
+
class ClusterInfo
|
5
|
+
using Refinements::HashExtensions
|
6
|
+
|
7
|
+
attr_reader :active_nodes, :total_nodes, :active_processors, :total_processors, :active_gpu_nodes,
|
8
|
+
:total_gpu_nodes, :active_gpus, :total_gpus
|
9
|
+
|
10
|
+
def initialize(opts = {})
|
11
|
+
opts = opts.transform_keys(&:to_sym)
|
12
|
+
@active_nodes = opts.fetch(:active_nodes, nil).to_i
|
13
|
+
@total_nodes = opts.fetch(:total_nodes, nil).to_i
|
14
|
+
@active_processors = opts.fetch(:active_processors, nil).to_i
|
15
|
+
@total_processors = opts.fetch(:total_processors, nil).to_i
|
16
|
+
@active_gpus = opts.fetch(:active_gpus, nil).to_i
|
17
|
+
@total_gpus = opts.fetch(:total_gpus, nil).to_i
|
18
|
+
end
|
19
|
+
|
20
|
+
def to_h
|
21
|
+
{
|
22
|
+
active_nodes: active_nodes,
|
23
|
+
total_nodes: total_nodes,
|
24
|
+
active_processors: active_processors,
|
25
|
+
total_processors: total_processors,
|
26
|
+
active_gpus: active_gpus,
|
27
|
+
total_gpus: total_gpus
|
28
|
+
}
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
data/lib/ood_core/job/info.rb
CHANGED
@@ -65,6 +65,10 @@ module OodCore
|
|
65
65
|
# @return [Object] native info
|
66
66
|
attr_reader :native
|
67
67
|
|
68
|
+
# Number of gpus allocated for job
|
69
|
+
# @return [Integer, nil] allocated total number of gpus
|
70
|
+
attr_reader :gpus
|
71
|
+
|
68
72
|
# List of job array child task statuses
|
69
73
|
# @note only relevant for job arrays
|
70
74
|
# @return [Array<Task>] tasks
|
@@ -86,15 +90,16 @@ module OodCore
|
|
86
90
|
# @param dispatch_time [#to_i, nil] dispatch time
|
87
91
|
# @param tasks [Array<Hash>] tasks e.g. { id: '12345.owens-batch', status: :running }
|
88
92
|
# @param native [Object] native info
|
93
|
+
# @param gpus [#to_i, nil] allocated total number of gpus (defaults to 0)
|
89
94
|
def initialize(id:, status:, allocated_nodes: [], submit_host: nil,
|
90
95
|
job_name: nil, job_owner: nil, accounting_id: nil,
|
91
96
|
procs: nil, queue_name: nil, wallclock_time: nil,
|
92
97
|
wallclock_limit: nil, cpu_time: nil, submission_time: nil,
|
93
|
-
dispatch_time: nil, native: nil, tasks: [],
|
98
|
+
dispatch_time: nil, native: nil, gpus: 0, tasks: [],
|
94
99
|
**_)
|
95
100
|
@id = id.to_s
|
96
101
|
@status = Status.new(state: status.to_sym)
|
97
|
-
@allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(n.to_h) }
|
102
|
+
@allocated_nodes = allocated_nodes.map { |n| NodeInfo.new(**n.to_h) }
|
98
103
|
@submit_host = submit_host && submit_host.to_s
|
99
104
|
@job_name = job_name && job_name.to_s
|
100
105
|
@job_owner = job_owner && job_owner.to_s
|
@@ -111,6 +116,7 @@ module OodCore
|
|
111
116
|
@status = job_array_aggregate_status unless @tasks.empty?
|
112
117
|
|
113
118
|
@native = native
|
119
|
+
@gpus = gpus && gpus.to_i
|
114
120
|
end
|
115
121
|
|
116
122
|
# Create a new Info for a child task
|
@@ -147,10 +153,15 @@ module OodCore
|
|
147
153
|
submission_time: submission_time,
|
148
154
|
dispatch_time: dispatch_time,
|
149
155
|
native: native,
|
156
|
+
gpus: gpus,
|
150
157
|
tasks: tasks
|
151
158
|
}
|
152
159
|
end
|
153
160
|
|
161
|
+
def gpu?
|
162
|
+
gpus.positive?
|
163
|
+
end
|
164
|
+
|
154
165
|
# The comparison operator
|
155
166
|
# @param other [#to_h] object to compare against
|
156
167
|
# @return [Boolean] whether objects are equivalent
|
data/lib/ood_core/version.rb
CHANGED
data/lib/ood_core.rb
CHANGED
data/ood_core.gemspec
CHANGED
@@ -20,10 +20,11 @@ Gem::Specification.new do |spec|
|
|
20
20
|
spec.bindir = "exe"
|
21
21
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
22
22
|
spec.require_paths = ["lib"]
|
23
|
-
spec.required_ruby_version = ">= 2.
|
23
|
+
spec.required_ruby_version = ">= 2.7.0"
|
24
24
|
|
25
25
|
spec.add_runtime_dependency "ood_support", "~> 0.0.2"
|
26
26
|
spec.add_runtime_dependency "ffi", "~> 1.9", ">= 1.9.6"
|
27
|
+
spec.add_runtime_dependency "rexml", "~> 3.2"
|
27
28
|
spec.add_development_dependency "bundler", "~> 2.1"
|
28
29
|
spec.add_development_dependency "rake", "~> 13.0.1"
|
29
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ood_core
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.19.0
|
4
|
+
version: 0.20.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eric Franz
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: exe
|
12
12
|
cert_chain: []
|
13
|
-
date: 2022-02-03 00:00:00.000000000 Z
|
13
|
+
date: 2022-06-03 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: ood_support
|
@@ -46,6 +46,20 @@ dependencies:
|
|
46
46
|
- - ">="
|
47
47
|
- !ruby/object:Gem::Version
|
48
48
|
version: 1.9.6
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: rexml
|
51
|
+
requirement: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - "~>"
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '3.2'
|
56
|
+
type: :runtime
|
57
|
+
prerelease: false
|
58
|
+
version_requirements: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - "~>"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '3.2'
|
49
63
|
- !ruby/object:Gem::Dependency
|
50
64
|
name: bundler
|
51
65
|
requirement: !ruby/object:Gem::Requirement
|
@@ -196,6 +210,7 @@ files:
|
|
196
210
|
- lib/ood_core/job/adapters/torque/error.rb
|
197
211
|
- lib/ood_core/job/adapters/torque/ffi.rb
|
198
212
|
- lib/ood_core/job/array_ids.rb
|
213
|
+
- lib/ood_core/job/cluster_info.rb
|
199
214
|
- lib/ood_core/job/factory.rb
|
200
215
|
- lib/ood_core/job/info.rb
|
201
216
|
- lib/ood_core/job/node_info.rb
|
@@ -219,7 +234,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
219
234
|
requirements:
|
220
235
|
- - ">="
|
221
236
|
- !ruby/object:Gem::Version
|
222
|
-
version: 2.
|
237
|
+
version: 2.7.0
|
223
238
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
224
239
|
requirements:
|
225
240
|
- - ">="
|