ood_core 0.16.0 → 0.17.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/dependabot.yml +8 -0
- data/CHANGELOG.md +66 -2
- data/lib/ood_core/cluster.rb +20 -5
- data/lib/ood_core/job/adapters/kubernetes/batch.rb +9 -3
- data/lib/ood_core/job/adapters/kubernetes/helper.rb +35 -13
- data/lib/ood_core/job/adapters/kubernetes/resources.rb +30 -3
- data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb +44 -5
- data/lib/ood_core/job/adapters/slurm.rb +1 -0
- data/lib/ood_core/job/adapters/torque.rb +2 -0
- data/lib/ood_core/job/script.rb +8 -1
- data/lib/ood_core/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: dca336fb15a08ad68f556b8d33fb76887f5c0370a0eef63685a5770fbf073110
|
|
4
|
+
data.tar.gz: 410b08fee5e739b7444ca3054483a2758d43062af964168b3f32318489d19fa0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e82540895495b9f09c92f413f8f39a894fb700122da195cd4224d68eb5eae30845f8692c6d440462bac2c4f45b0a3270e7bf5219ba4adecfc63baa3884b53d28
|
|
7
|
+
data.tar.gz: 39a441ede8e9b91e169b1aff0c1345a56c98aac1f25a1b07d873b20e66833ed46a534b5ef23f392f5d0213b7e5c80e942f62b0031f4fd2065c327116975fdf8b
|
data/CHANGELOG.md
CHANGED
|
@@ -6,6 +6,66 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
|
|
6
6
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.17.2] - 7-14-2021
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
|
|
14
|
+
- Fixed k8s adapter to only show Running pods as running in [300](https://github.com/OSC/ood_core/pull/300).
|
|
15
|
+
|
|
16
|
+
## [0.17.1] - 6-14-2021
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
- Fixed [278](https://github.com/OSC/ood_core/pull/278) where unschedulable pods will now show up as
|
|
21
|
+
queued_held status.
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
|
|
25
|
+
- KUBECONFIG now defaults to /dev/null in the kubernetes adapter in [292](https://github.com/OSC/ood_core/pull/292).
|
|
26
|
+
|
|
27
|
+
### Added
|
|
28
|
+
|
|
29
|
+
- Sites can now set `batch_connect.ssh_allow` on the cluster to disable the buttons to start
|
|
30
|
+
a shell session to compute nodes in [289](https://github.com/OSC/ood_core/pull/289).
|
|
31
|
+
- `POD_PORT` is now available to jobs in the kubernetes adapter in [290](https://github.com/OSC/ood_core/pull/290).
|
|
32
|
+
- Kubernetes pods now support a startProbe in [291](https://github.com/OSC/ood_core/pull/291).
|
|
33
|
+
|
|
34
|
+
## [0.17.0] - 5-26-2021
|
|
35
|
+
|
|
36
|
+
### Fixed
|
|
37
|
+
|
|
38
|
+
- All Kubernetes resources now have the same labels in [280](https://github.com/OSC/ood_core/pull/280).
|
|
39
|
+
- Kubernetes does not crash when no configmap is defined in [282](https://github.com/OSC/ood_core/pull/282).
|
|
40
|
+
- Kubernetes will not specify init containers if there are none in
|
|
41
|
+
[284](https://github.com/OSC/ood_core/pull/284).
|
|
42
|
+
|
|
43
|
+
### Added
|
|
44
|
+
|
|
45
|
+
- Kubernetes, Slurm and Torque now support the script option `gpus_per_node` in
|
|
46
|
+
[266](https://github.com/OSC/ood_core/pull/266).
|
|
47
|
+
- Kubernetes will now save the pod.yml into the staged root in
|
|
48
|
+
[277](https://github.com/OSC/ood_core/pull/277).
|
|
49
|
+
- Kubernetes now allows for node selector in [264](https://github.com/OSC/ood_core/pull/264).
|
|
50
|
+
- Kubernetes pods now have access to the environment variable POD_NAMESPACE in
|
|
51
|
+
[275](https://github.com/OSC/ood_core/pull/275).
|
|
52
|
+
- Kubernetes pods can now specify the image pull policy in [272](https://github.com/OSC/ood_core/pull/272).
|
|
53
|
+
- Cluster config's batch_connect now supports `ssh_allow` to disable sshing to compute
|
|
54
|
+
nodes per cluster in [286](https://github.com/OSC/ood_core/pull/286).
|
|
55
|
+
- Kubernetes will now add the templated script content to a configmap in
|
|
56
|
+
[273](https://github.com/OSC/ood_core/pull/273).
|
|
57
|
+
|
|
58
|
+
### Changed
|
|
59
|
+
|
|
60
|
+
- Kubernetes username prefix no longer appends a - in [271](https://github.com/OSC/ood_core/pull/271).
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
## [0.16.1] - 2021-04-23
|
|
65
|
+
### Fixed
|
|
66
|
+
- Memoized some allow? variables to have better support around ACLs in
|
|
67
|
+
[267](https://github.com/OSC/ood_core/pull/267)
|
|
68
|
+
|
|
9
69
|
## [0.16.0] - 2021-04-20
|
|
10
70
|
### Fixed
|
|
11
71
|
- tmux 2.7+ bug in the linux host adapter in [258](https://github.com/OSC/ood_core/pull/258)
|
|
@@ -300,8 +360,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
300
360
|
### Added
|
|
301
361
|
- Initial release!
|
|
302
362
|
|
|
303
|
-
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.
|
|
304
|
-
[0.
|
|
363
|
+
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.17.2...HEAD
|
|
364
|
+
[0.17.2]: https://github.com/OSC/ood_core/compare/v0.17.1...v0.17.2
|
|
365
|
+
[0.17.1]: https://github.com/OSC/ood_core/compare/v0.17.0...v0.17.1
|
|
366
|
+
[0.17.0]: https://github.com/OSC/ood_core/compare/v0.16.1...v0.17.0
|
|
367
|
+
[0.16.1]: https://github.com/OSC/ood_core/compare/v0.16.0...v0.16.1
|
|
368
|
+
[0.16.0]: https://github.com/OSC/ood_core/compare/v0.15.1...v0.16.0
|
|
305
369
|
[0.15.1]: https://github.com/OSC/ood_core/compare/v0.15.0...v0.15.1
|
|
306
370
|
[0.15.0]: https://github.com/OSC/ood_core/compare/v0.14.0...v0.15.0
|
|
307
371
|
[0.14.0]: https://github.com/OSC/ood_core/compare/v0.13.0...v0.14.0
|
data/lib/ood_core/cluster.rb
CHANGED
|
@@ -78,7 +78,9 @@ module OodCore
|
|
|
78
78
|
# Whether the login feature is allowed
|
|
79
79
|
# @return [Boolean] is login allowed
|
|
80
80
|
def login_allow?
|
|
81
|
-
|
|
81
|
+
return @login_allow if defined?(@login_allow)
|
|
82
|
+
|
|
83
|
+
@login_allow = (allow? && !login_config.empty?)
|
|
82
84
|
end
|
|
83
85
|
|
|
84
86
|
# Build a job adapter from the job configuration
|
|
@@ -90,9 +92,11 @@ module OodCore
|
|
|
90
92
|
# Whether the job feature is allowed based on the ACLs
|
|
91
93
|
# @return [Boolean] is the job feature allowed
|
|
92
94
|
def job_allow?
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
95
|
+
return @job_allow if defined?(@job_allow)
|
|
96
|
+
|
|
97
|
+
@job_allow = (allow? && ! job_config.empty? && build_acls(
|
|
98
|
+
job_config.fetch(:acls, []).map(&:to_h)
|
|
99
|
+
).all?(&:allow?))
|
|
96
100
|
end
|
|
97
101
|
|
|
98
102
|
# The batch connect template configuration used for this cluster
|
|
@@ -138,7 +142,18 @@ module OodCore
|
|
|
138
142
|
# Whether this cluster is allowed to be used
|
|
139
143
|
# @return [Boolean] whether cluster is allowed
|
|
140
144
|
def allow?
|
|
141
|
-
|
|
145
|
+
return @allow if defined?(@allow)
|
|
146
|
+
|
|
147
|
+
@allow = acls.all?(&:allow?)
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Whether this cluster supports SSH to batch connect nodes
|
|
151
|
+
# @return [Boolean, nil] whether cluster supports SSH to batch connect node
|
|
152
|
+
def batch_connect_ssh_allow?
|
|
153
|
+
return @batch_connect_ssh_allow if defined?(@batch_connect_ssh_allow)
|
|
154
|
+
return @batch_connect_ssh_allow = nil if batch_connect_config.nil?
|
|
155
|
+
|
|
156
|
+
@batch_connect_ssh_allow = batch_connect_config.fetch(:ssh_allow, nil)
|
|
142
157
|
end
|
|
143
158
|
|
|
144
159
|
# The comparison operator
|
|
@@ -23,7 +23,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
23
23
|
@cluster = options.fetch(:cluster, 'open-ondemand')
|
|
24
24
|
@mounts = options.fetch(:mounts, []).map { |m| m.to_h.symbolize_keys }
|
|
25
25
|
@all_namespaces = options.fetch(:all_namespaces, false)
|
|
26
|
-
@username_prefix = options.fetch(:username_prefix,
|
|
26
|
+
@username_prefix = options.fetch(:username_prefix, '')
|
|
27
27
|
@namespace_prefix = options.fetch(:namespace_prefix, '')
|
|
28
28
|
|
|
29
29
|
@using_context = false
|
|
@@ -45,6 +45,9 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
45
45
|
raise ArgumentError, 'Must specify the script' if script.nil?
|
|
46
46
|
|
|
47
47
|
resource_yml, id = generate_id_yml(script)
|
|
48
|
+
if !script.workdir.nil? && Dir.exist?(script.workdir)
|
|
49
|
+
File.open(File.join(script.workdir, 'pod.yml'), 'w') { |f| f.write resource_yml }
|
|
50
|
+
end
|
|
48
51
|
call("#{formatted_ns_cmd} create -f -", stdin: resource_yml)
|
|
49
52
|
|
|
50
53
|
id
|
|
@@ -146,7 +149,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
146
149
|
end
|
|
147
150
|
|
|
148
151
|
def k8s_username
|
|
149
|
-
|
|
152
|
+
"#{username_prefix}#{username}"
|
|
150
153
|
end
|
|
151
154
|
|
|
152
155
|
def user
|
|
@@ -180,6 +183,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
180
183
|
HOME: home_dir,
|
|
181
184
|
GROUP: group,
|
|
182
185
|
GID: run_as_group,
|
|
186
|
+
KUBECONFIG: '/dev/null',
|
|
183
187
|
}
|
|
184
188
|
end
|
|
185
189
|
|
|
@@ -189,10 +193,12 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
189
193
|
native_data = script.native
|
|
190
194
|
container = helper.container_from_native(native_data[:container], default_env)
|
|
191
195
|
id = generate_id(container.name)
|
|
192
|
-
configmap = helper.configmap_from_native(native_data, id)
|
|
196
|
+
configmap = helper.configmap_from_native(native_data, id, script.content)
|
|
193
197
|
init_containers = helper.init_ctrs_from_native(native_data[:init_containers], container.env)
|
|
194
198
|
spec = OodCore::Job::Adapters::Kubernetes::Resources::PodSpec.new(container, init_containers: init_containers)
|
|
195
199
|
all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]
|
|
200
|
+
node_selector = native_data[:node_selector].nil? ? {} : native_data[:node_selector]
|
|
201
|
+
gpu_type = native_data[:gpu_type].nil? ? "nvidia.com/gpu" : native_data[:gpu_type]
|
|
196
202
|
|
|
197
203
|
template = ERB.new(File.read(resource_file), nil, '-')
|
|
198
204
|
|
|
@@ -53,7 +53,9 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
53
53
|
cpu: container[:cpu],
|
|
54
54
|
working_dir: container[:working_dir],
|
|
55
55
|
restart_policy: container[:restart_policy],
|
|
56
|
-
|
|
56
|
+
image_pull_policy: container[:image_pull_policy],
|
|
57
|
+
image_pull_secret: container[:image_pull_secret],
|
|
58
|
+
startup_probe: container[:startup_probe],
|
|
57
59
|
)
|
|
58
60
|
end
|
|
59
61
|
|
|
@@ -80,10 +82,18 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
80
82
|
# the input configmap hash
|
|
81
83
|
# @param id [#to_s]
|
|
82
84
|
# the id to use for giving the configmap a name
|
|
85
|
+
# @param script_content [#to_s]
|
|
86
|
+
# the batch script content
|
|
83
87
|
# @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
|
|
84
|
-
def configmap_from_native(native, id)
|
|
85
|
-
configmap = native.fetch(:configmap,
|
|
86
|
-
|
|
88
|
+
def configmap_from_native(native, id, script_content)
|
|
89
|
+
configmap = native.fetch(:configmap, {})
|
|
90
|
+
configmap[:files] ||= []
|
|
91
|
+
configmap[:files] << {
|
|
92
|
+
filename: 'script.sh',
|
|
93
|
+
data: script_content,
|
|
94
|
+
mount_path: '/ood/script.sh',
|
|
95
|
+
sub_path: 'script.sh',
|
|
96
|
+
} unless configmap[:files].any? { |f| f[:filename] == 'script.sh' }
|
|
87
97
|
|
|
88
98
|
OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap.new(
|
|
89
99
|
configmap_name(id),
|
|
@@ -140,7 +150,7 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
140
150
|
{
|
|
141
151
|
id: json_data.dig(:metadata, :name).to_s,
|
|
142
152
|
job_name: name_from_metadata(json_data.dig(:metadata)),
|
|
143
|
-
status: pod_status_from_json(json_data),
|
|
153
|
+
status: OodCore::Job::Status.new(state: pod_status_from_json(json_data)),
|
|
144
154
|
job_owner: job_owner_from_json(json_data, ns_prefix),
|
|
145
155
|
submission_time: submission_time(json_data),
|
|
146
156
|
dispatch_time: dispatch_time(json_data),
|
|
@@ -230,15 +240,21 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
230
240
|
def submission_time(json_data)
|
|
231
241
|
status = json_data.dig(:status)
|
|
232
242
|
start = status.dig(:startTime)
|
|
243
|
+
creation = json_data.dig(:metadata, :creationTimestamp)
|
|
233
244
|
|
|
234
|
-
if
|
|
245
|
+
if !creation.nil?
|
|
246
|
+
str = creation
|
|
247
|
+
elsif !start.nil?
|
|
248
|
+
str = start
|
|
249
|
+
else
|
|
235
250
|
# the pod is in some pending state limbo
|
|
236
251
|
conditions = status.dig(:conditions)
|
|
252
|
+
return nil if conditions.nil?
|
|
253
|
+
return nil if conditions.size == 0
|
|
237
254
|
# best guess to start time is just the first condition's
|
|
238
255
|
# transition time
|
|
239
256
|
str = conditions[0].dig(:lastTransitionTime)
|
|
240
|
-
|
|
241
|
-
str = start
|
|
257
|
+
return nil if str.nil?
|
|
242
258
|
end
|
|
243
259
|
|
|
244
260
|
DateTime.parse(str).to_time.to_i
|
|
@@ -246,11 +262,19 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
246
262
|
|
|
247
263
|
def pod_status_from_json(json_data)
|
|
248
264
|
phase = json_data.dig(:status, :phase)
|
|
265
|
+
conditions = json_data.dig(:status, :conditions)
|
|
266
|
+
container_statuses = json_data.dig(:status, :containerStatuses)
|
|
267
|
+
unschedulable = conditions.to_a.any? { |c| c.dig(:reason) == "Unschedulable" }
|
|
268
|
+
ready = !container_statuses.to_a.empty? && container_statuses.to_a.all? { |s| s.dig(:ready) == true }
|
|
269
|
+
return "running" if ready
|
|
270
|
+
|
|
249
271
|
state = case phase
|
|
250
|
-
when "Running"
|
|
251
|
-
"running"
|
|
252
272
|
when "Pending"
|
|
253
|
-
|
|
273
|
+
if unschedulable
|
|
274
|
+
"queued_held"
|
|
275
|
+
else
|
|
276
|
+
"queued"
|
|
277
|
+
end
|
|
254
278
|
when "Failed"
|
|
255
279
|
"suspended"
|
|
256
280
|
when "Succeeded"
|
|
@@ -260,8 +284,6 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
260
284
|
else
|
|
261
285
|
"undetermined"
|
|
262
286
|
end
|
|
263
|
-
|
|
264
|
-
OodCore::Job::Status.new(state: state)
|
|
265
287
|
end
|
|
266
288
|
|
|
267
289
|
def terminated_state(status)
|
|
@@ -33,13 +33,36 @@ module OodCore::Job::Adapters::Kubernetes::Resources
|
|
|
33
33
|
end
|
|
34
34
|
end
|
|
35
35
|
|
|
36
|
+
class TCPProbe
|
|
37
|
+
attr_accessor :port, :initial_delay_seconds, :failure_threshold, :period_seconds
|
|
38
|
+
|
|
39
|
+
def initialize(port, data)
|
|
40
|
+
data ||= {}
|
|
41
|
+
@port = port
|
|
42
|
+
@initial_delay_seconds = data[:initial_delay_seconds] || 2
|
|
43
|
+
@failure_threshold = data[:failure_threshold] || 5
|
|
44
|
+
@period_seconds = data[:period_seconds] || 5
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def to_h
|
|
48
|
+
{
|
|
49
|
+
port: port,
|
|
50
|
+
initial_delay_seconds: initial_delay_seconds,
|
|
51
|
+
failure_threshold: failure_threshold,
|
|
52
|
+
period_seconds: period_seconds,
|
|
53
|
+
}
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
36
57
|
class Container
|
|
37
58
|
attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
|
|
38
|
-
:restart_policy, :image_pull_secret, :supplemental_groups
|
|
59
|
+
:restart_policy, :image_pull_policy, :image_pull_secret, :supplemental_groups,
|
|
60
|
+
:startup_probe
|
|
39
61
|
|
|
40
62
|
def initialize(
|
|
41
63
|
name, image, command: [], port: nil, env: {}, memory: "4Gi", cpu: "1",
|
|
42
|
-
working_dir: "", restart_policy: "Never", image_pull_secret: nil, supplemental_groups: []
|
|
64
|
+
working_dir: "", restart_policy: "Never", image_pull_policy: nil, image_pull_secret: nil, supplemental_groups: [],
|
|
65
|
+
startup_probe: {}
|
|
43
66
|
)
|
|
44
67
|
raise ArgumentError, "containers need valid names and images" unless name && image
|
|
45
68
|
|
|
@@ -52,8 +75,10 @@ module OodCore::Job::Adapters::Kubernetes::Resources
|
|
|
52
75
|
@cpu = cpu.nil? ? "1" : cpu
|
|
53
76
|
@working_dir = working_dir.nil? ? "" : working_dir
|
|
54
77
|
@restart_policy = restart_policy.nil? ? "Never" : restart_policy
|
|
78
|
+
@image_pull_policy = image_pull_policy.nil? ? "IfNotPresent" : image_pull_policy
|
|
55
79
|
@image_pull_secret = image_pull_secret
|
|
56
80
|
@supplemental_groups = supplemental_groups.nil? ? [] : supplemental_groups
|
|
81
|
+
@startup_probe = TCPProbe.new(@port, startup_probe)
|
|
57
82
|
end
|
|
58
83
|
|
|
59
84
|
def ==(other)
|
|
@@ -66,8 +91,10 @@ module OodCore::Job::Adapters::Kubernetes::Resources
|
|
|
66
91
|
cpu == other.cpu &&
|
|
67
92
|
working_dir == other.working_dir &&
|
|
68
93
|
restart_policy == other.restart_policy &&
|
|
94
|
+
image_pull_policy == other.image_pull_policy &&
|
|
69
95
|
image_pull_secret == other.image_pull_secret &&
|
|
70
|
-
supplemental_groups == other.supplemental_groups
|
|
96
|
+
supplemental_groups == other.supplemental_groups &&
|
|
97
|
+
startup_probe.to_h == other.startup_probe.to_h
|
|
71
98
|
end
|
|
72
99
|
end
|
|
73
100
|
|
|
@@ -39,7 +39,7 @@ spec:
|
|
|
39
39
|
containers:
|
|
40
40
|
- name: "<%= spec.container.name %>"
|
|
41
41
|
image: <%= spec.container.image %>
|
|
42
|
-
imagePullPolicy:
|
|
42
|
+
imagePullPolicy: <%= spec.container.image_pull_policy %>
|
|
43
43
|
<%- unless spec.container.working_dir.empty? -%>
|
|
44
44
|
workingDir: "<%= spec.container.working_dir %>"
|
|
45
45
|
<%- end -%>
|
|
@@ -48,6 +48,14 @@ spec:
|
|
|
48
48
|
valueFrom:
|
|
49
49
|
fieldRef:
|
|
50
50
|
fieldPath: metadata.name
|
|
51
|
+
- name: POD_NAMESPACE
|
|
52
|
+
valueFrom:
|
|
53
|
+
fieldRef:
|
|
54
|
+
fieldPath: metadata.namespace
|
|
55
|
+
<%- unless spec.container.port.nil? -%>
|
|
56
|
+
- name: POD_PORT
|
|
57
|
+
value: "<%= spec.container.port %>"
|
|
58
|
+
<%- end -%>
|
|
51
59
|
<%- spec.container.env.each_pair do |name, value| -%>
|
|
52
60
|
- name: <%= name %>
|
|
53
61
|
value: "<%= value %>"
|
|
@@ -61,9 +69,16 @@ spec:
|
|
|
61
69
|
<%- unless spec.container.port.nil? -%>
|
|
62
70
|
ports:
|
|
63
71
|
- containerPort: <%= spec.container.port %>
|
|
72
|
+
startupProbe:
|
|
73
|
+
tcpSocket:
|
|
74
|
+
port: <%= spec.container.startup_probe.port %>
|
|
75
|
+
initialDelaySeconds: <%= spec.container.startup_probe.initial_delay_seconds %>
|
|
76
|
+
failureThreshold: <%= spec.container.startup_probe.failure_threshold %>
|
|
77
|
+
periodSeconds: <%= spec.container.startup_probe.period_seconds %>
|
|
64
78
|
<%- end -%>
|
|
65
|
-
<%- if
|
|
79
|
+
<%- if !all_mounts.empty? || (!configmap.nil? && configmap.mounts?) -%>
|
|
66
80
|
volumeMounts:
|
|
81
|
+
<%- unless configmap.nil? -%>
|
|
67
82
|
<%- configmap.files.each do |file| -%>
|
|
68
83
|
<%- next if file.mount_path.nil? -%>
|
|
69
84
|
- name: configmap-volume
|
|
@@ -72,6 +87,7 @@ spec:
|
|
|
72
87
|
subPath: <%= file.sub_path %>
|
|
73
88
|
<%- end # end unless file.sub_path.nil? -%>
|
|
74
89
|
<%- end # end configmap.files.each -%>
|
|
90
|
+
<%- end # unless configmap.nil? -%>
|
|
75
91
|
<%- all_mounts.each do |mount| -%>
|
|
76
92
|
- name: <%= mount[:name] %>
|
|
77
93
|
mountPath: <%= mount[:destination_path] %>
|
|
@@ -81,25 +97,36 @@ spec:
|
|
|
81
97
|
limits:
|
|
82
98
|
memory: "<%= spec.container.memory %>"
|
|
83
99
|
cpu: "<%= spec.container.cpu %>"
|
|
100
|
+
<%- unless script.gpus_per_node.nil? -%>
|
|
101
|
+
<%= gpu_type %>: <%= script.gpus_per_node %>
|
|
102
|
+
<%- end -%>
|
|
84
103
|
requests:
|
|
85
104
|
memory: "<%= spec.container.memory %>"
|
|
86
105
|
cpu: "<%= spec.container.cpu %>"
|
|
106
|
+
<%- unless script.gpus_per_node.nil? -%>
|
|
107
|
+
<%= gpu_type %>: <%= script.gpus_per_node %>
|
|
108
|
+
<%- end -%>
|
|
87
109
|
securityContext:
|
|
88
110
|
allowPrivilegeEscalation: false
|
|
89
111
|
capabilities:
|
|
90
112
|
drop:
|
|
91
113
|
- all
|
|
92
114
|
privileged: false
|
|
93
|
-
<%- unless spec.init_containers.
|
|
115
|
+
<%- unless spec.init_containers.empty? -%>
|
|
94
116
|
initContainers:
|
|
95
117
|
<%- spec.init_containers.each do |ctr| -%>
|
|
96
118
|
- name: "<%= ctr.name %>"
|
|
97
119
|
image: "<%= ctr.image %>"
|
|
120
|
+
imagePullPolicy: <%= ctr.image_pull_policy %>
|
|
98
121
|
env:
|
|
99
122
|
- name: POD_NAME
|
|
100
123
|
valueFrom:
|
|
101
124
|
fieldRef:
|
|
102
125
|
fieldPath: metadata.name
|
|
126
|
+
- name: POD_NAMESPACE
|
|
127
|
+
valueFrom:
|
|
128
|
+
fieldRef:
|
|
129
|
+
fieldPath: metadata.namespace
|
|
103
130
|
<%- ctr.env.each_pair do |name, value| -%>
|
|
104
131
|
- name: <%= name %>
|
|
105
132
|
value: "<%= value %>"
|
|
@@ -108,8 +135,9 @@ spec:
|
|
|
108
135
|
<%- ctr.command.each do |cmd| -%>
|
|
109
136
|
- "<%= cmd %>"
|
|
110
137
|
<%- end # command loop -%>
|
|
111
|
-
<%- if
|
|
138
|
+
<%- if !all_mounts.empty? || (!configmap.nil? && configmap.init_mounts?) -%>
|
|
112
139
|
volumeMounts:
|
|
140
|
+
<%- unless configmap.nil? -%>
|
|
113
141
|
<%- configmap.files.each do |file| -%>
|
|
114
142
|
<%- next if file.init_mount_path.nil? -%>
|
|
115
143
|
- name: configmap-volume
|
|
@@ -118,6 +146,7 @@ spec:
|
|
|
118
146
|
subPath: <%= file.init_sub_path %>
|
|
119
147
|
<%- end # end unless file.sub_path.nil? -%>
|
|
120
148
|
<%- end # end configmap.files.each -%>
|
|
149
|
+
<%- end # unless configmap.nil? -%>
|
|
121
150
|
<%- all_mounts.each do |mount| -%>
|
|
122
151
|
- name: <%= mount[:name] %>
|
|
123
152
|
mountPath: <%= mount[:destination_path] %>
|
|
@@ -152,6 +181,12 @@ spec:
|
|
|
152
181
|
<%- end # if mount is [host,nfs] -%>
|
|
153
182
|
<%- end # for each mount -%>
|
|
154
183
|
<%- end # (configmap.to_s.empty? || all_mounts.empty?) -%>
|
|
184
|
+
<%- unless node_selector.empty? -%>
|
|
185
|
+
nodeSelector:
|
|
186
|
+
<%- node_selector.each_pair do |key, value| -%>
|
|
187
|
+
<%= key %>: "<%= value %>"
|
|
188
|
+
<%- end # node_selector.each_pair -%>
|
|
189
|
+
<%- end #unless node_selector.empty? -%>
|
|
155
190
|
---
|
|
156
191
|
<%- unless spec.container.port.nil? -%>
|
|
157
192
|
apiVersion: v1
|
|
@@ -161,6 +196,8 @@ metadata:
|
|
|
161
196
|
namespace: <%= namespace %>
|
|
162
197
|
labels:
|
|
163
198
|
job: <%= id %>
|
|
199
|
+
app.kubernetes.io/name: <%= container.name %>
|
|
200
|
+
app.kubernetes.io/managed-by: open-ondemand
|
|
164
201
|
spec:
|
|
165
202
|
selector:
|
|
166
203
|
job: <%= id %>
|
|
@@ -170,8 +207,8 @@ spec:
|
|
|
170
207
|
targetPort: <%= spec.container.port %>
|
|
171
208
|
type: NodePort
|
|
172
209
|
<%- end # end for service -%>
|
|
173
|
-
---
|
|
174
210
|
<%- unless configmap.nil? -%>
|
|
211
|
+
---
|
|
175
212
|
apiVersion: v1
|
|
176
213
|
kind: ConfigMap
|
|
177
214
|
metadata:
|
|
@@ -179,6 +216,8 @@ metadata:
|
|
|
179
216
|
namespace: <%= namespace %>
|
|
180
217
|
labels:
|
|
181
218
|
job: <%= id %>
|
|
219
|
+
app.kubernetes.io/name: <%= container.name %>
|
|
220
|
+
app.kubernetes.io/managed-by: open-ondemand
|
|
182
221
|
data:
|
|
183
222
|
<%- configmap.files.each do |file| -%>
|
|
184
223
|
<%- next if file.data.nil? || file.filename.nil? -%>
|
|
@@ -423,6 +423,7 @@ module OodCore
|
|
|
423
423
|
args.concat ["-t", seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
|
|
424
424
|
args.concat ['-a', script.job_array_request] unless script.job_array_request.nil?
|
|
425
425
|
args.concat ['--qos', script.qos] unless script.qos.nil?
|
|
426
|
+
args.concat ['--gpus-per-node', script.gpus_per_node] unless script.gpus_per_node.nil?
|
|
426
427
|
# ignore nodes, don't know how to do this for slurm
|
|
427
428
|
|
|
428
429
|
# Set dependencies
|
|
@@ -159,6 +159,8 @@ module OodCore
|
|
|
159
159
|
args.concat ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
|
|
160
160
|
args.concat ['-t', script.job_array_request] unless script.job_array_request.nil?
|
|
161
161
|
args.concat ['-l', "qos=#{script.qos}"] unless script.qos.nil?
|
|
162
|
+
args.concat ['-l', "gpus=#{script.gpus_per_node}"] unless script.gpus_per_node.nil?
|
|
163
|
+
|
|
162
164
|
# Set environment variables
|
|
163
165
|
env = script.job_environment.to_h
|
|
164
166
|
args.concat ["-v", env.keys.join(",")] unless env.empty?
|
data/lib/ood_core/job/script.rb
CHANGED
|
@@ -103,6 +103,10 @@ module OodCore
|
|
|
103
103
|
# @return [String, nil] qos
|
|
104
104
|
attr_reader :qos
|
|
105
105
|
|
|
106
|
+
# The GPUs per node for the job
|
|
107
|
+
# @return [Integer, nil] gpus per node
|
|
108
|
+
attr_reader :gpus_per_node
|
|
109
|
+
|
|
106
110
|
# Object detailing any native specifications that are implementation specific
|
|
107
111
|
# @note Should not be used at all costs.
|
|
108
112
|
# @return [Object, nil] native specifications
|
|
@@ -136,6 +140,7 @@ module OodCore
|
|
|
136
140
|
# @param accounting_id [#to_s, nil] accounting id
|
|
137
141
|
# @param job_array_request [#to_s, nil] job array request
|
|
138
142
|
# @param qos [#to_s, nil] qos
|
|
143
|
+
# @param gpus_per_node [#to_i, nil] gpus per node
|
|
139
144
|
# @param native [Object, nil] native specifications
|
|
140
145
|
# @param copy_environment [Boolean, nil] copy the environment
|
|
141
146
|
def initialize(content:, args: nil, submit_as_hold: nil, rerunnable: nil,
|
|
@@ -145,7 +150,7 @@ module OodCore
|
|
|
145
150
|
output_path: nil, error_path: nil, reservation_id: nil,
|
|
146
151
|
queue_name: nil, priority: nil, start_time: nil,
|
|
147
152
|
wall_time: nil, accounting_id: nil, job_array_request: nil,
|
|
148
|
-
qos: nil, native: nil, copy_environment: nil, **_)
|
|
153
|
+
qos: nil, gpus_per_node: nil, native: nil, copy_environment: nil, **_)
|
|
149
154
|
@content = content.to_s
|
|
150
155
|
|
|
151
156
|
@submit_as_hold = submit_as_hold
|
|
@@ -170,6 +175,7 @@ module OodCore
|
|
|
170
175
|
@accounting_id = accounting_id && accounting_id.to_s
|
|
171
176
|
@job_array_request = job_array_request && job_array_request.to_s
|
|
172
177
|
@qos = qos && qos.to_s
|
|
178
|
+
@gpus_per_node = gpus_per_node && gpus_per_node.to_i
|
|
173
179
|
@native = native
|
|
174
180
|
@copy_environment = (copy_environment.nil?) ? nil : !! copy_environment
|
|
175
181
|
end
|
|
@@ -200,6 +206,7 @@ module OodCore
|
|
|
200
206
|
accounting_id: accounting_id,
|
|
201
207
|
job_array_request: job_array_request,
|
|
202
208
|
qos: qos,
|
|
209
|
+
gpus_per_node: gpus_per_node,
|
|
203
210
|
native: native,
|
|
204
211
|
copy_environment: copy_environment
|
|
205
212
|
}
|
data/lib/ood_core/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ood_core
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.17.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Franz
|
|
@@ -10,7 +10,7 @@ authors:
|
|
|
10
10
|
autorequire:
|
|
11
11
|
bindir: exe
|
|
12
12
|
cert_chain: []
|
|
13
|
-
date: 2021-
|
|
13
|
+
date: 2021-07-16 00:00:00.000000000 Z
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
16
16
|
name: ood_support
|
|
@@ -160,6 +160,7 @@ executables: []
|
|
|
160
160
|
extensions: []
|
|
161
161
|
extra_rdoc_files: []
|
|
162
162
|
files:
|
|
163
|
+
- ".github/dependabot.yml"
|
|
163
164
|
- ".github/workflows/test.yml"
|
|
164
165
|
- ".gitignore"
|
|
165
166
|
- ".rspec"
|