ood_core 0.16.1 → 0.17.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +75 -1
- data/lib/ood_core/cluster.rb +9 -0
- data/lib/ood_core/job/adapters/kubernetes/batch.rb +25 -3
- data/lib/ood_core/job/adapters/kubernetes/helper.rb +38 -13
- data/lib/ood_core/job/adapters/kubernetes/resources.rb +30 -3
- data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb +45 -13
- data/lib/ood_core/job/adapters/slurm.rb +1 -0
- data/lib/ood_core/job/adapters/torque.rb +2 -0
- data/lib/ood_core/job/script.rb +8 -1
- data/lib/ood_core/version.rb +1 -1
- data/ood_core.gemspec +1 -1
- metadata +2 -3
- data/.github/workflows/test.yml +0 -30
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4c94c6fbf110564ec2cff9d885d2799566a70e1e98759f44fec00fde6eb0cdec
|
|
4
|
+
data.tar.gz: 7ed0326c52582dbd8b15a272706e8aa58e36a2be0555a78c65037bb74517d0a6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b6af308bf4acb767e6c3128ce753714ebcee4f33a17b5114a1196d73ec7df63be5d5007ad985c752329463e2533ed1bbfa8951426a2a035ef08ce9b3704b5984
|
|
7
|
+
data.tar.gz: 76b07812da52479c3d5c834c51dcb6c5af328721436474197a77bd0423f5061361d333d233c235da7a699e31ac772104eb65902e74ee25afb4640c6e5adc4add
|
data/CHANGELOG.md
CHANGED
|
@@ -6,6 +6,75 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
|
|
|
6
6
|
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.17.4] - 7-29-2021
|
|
11
|
+
|
|
12
|
+
Functionally the same as [0.17.3] but with some CI updates.
|
|
13
|
+
|
|
14
|
+
## [0.17.3] - 7-29-2021
|
|
15
|
+
|
|
16
|
+
### Fixed
|
|
17
|
+
|
|
18
|
+
- Fixed handling of pods in a startup phase in [303](https://github.com/OSC/ood_core/pull/303).
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- Enable automatic population of supplemental groups in [305](https://github.com/OSC/ood_core/pull/305).
|
|
23
|
+
|
|
24
|
+
## [0.17.2] - 7-14-2021
|
|
25
|
+
|
|
26
|
+
### Fixed
|
|
27
|
+
|
|
28
|
+
- Fixed k8s adapter to only show Running pods as running in [300](https://github.com/OSC/ood_core/pull/300).
|
|
29
|
+
|
|
30
|
+
## [0.17.1] - 6-14-2021
|
|
31
|
+
|
|
32
|
+
### Fixed
|
|
33
|
+
|
|
34
|
+
- Fixed [278](https://github.com/OSC/ood_core/pull/278) where unschedulable pods will now show up as
|
|
35
|
+
queued_held status.
|
|
36
|
+
|
|
37
|
+
### Changed
|
|
38
|
+
|
|
39
|
+
- KUBECONFIG now defaults to /dev/null in the kubernetes adapter in [292](https://github.com/OSC/ood_core/pull/292).
|
|
40
|
+
|
|
41
|
+
### Added
|
|
42
|
+
|
|
43
|
+
- Sites can now set `batch_connect.ssh_allow` on the cluster to disable the buttons to start
|
|
44
|
+
a shell session to compute nodes in [289](https://github.com/OSC/ood_core/pull/289).
|
|
45
|
+
- `POD_PORT` is now available to jobs in the kubernetes adapter in [290](https://github.com/OSC/ood_core/pull/290).
|
|
46
|
+
- Kubernetes pods now support a startProbe in [291](https://github.com/OSC/ood_core/pull/291).
|
|
47
|
+
|
|
48
|
+
## [0.17.0] - 5-26-2021
|
|
49
|
+
|
|
50
|
+
### Fixed
|
|
51
|
+
|
|
52
|
+
- All Kubernetes resources now have the same labels in [280](https://github.com/OSC/ood_core/pull/280).
|
|
53
|
+
- Kubernetes does not crash when no configmap is defined in [282](https://github.com/OSC/ood_core/pull/282).
|
|
54
|
+
- Kubernetes will not specify init containers if there are none in
|
|
55
|
+
[284](https://github.com/OSC/ood_core/pull/284).
|
|
56
|
+
|
|
57
|
+
### Added
|
|
58
|
+
|
|
59
|
+
- Kubernetes, Slurm and Torque now support the script option `gpus_per_node` in
|
|
60
|
+
[266](https://github.com/OSC/ood_core/pull/266).
|
|
61
|
+
- Kubernetes will now save the pod.yml into the staged root in
|
|
62
|
+
[277](https://github.com/OSC/ood_core/pull/277).
|
|
63
|
+
- Kubernetes now allows for node selector in [264](https://github.com/OSC/ood_core/pull/264).
|
|
64
|
+
- Kubernetes pods now have access to the environment variable POD_NAMESPACE in
|
|
65
|
+
[275](https://github.com/OSC/ood_core/pull/275).
|
|
66
|
+
- Kubernetes pods can now specify the image pull policy in [272](https://github.com/OSC/ood_core/pull/272).
|
|
67
|
+
- Cluster config's batch_connect now supports `ssh_allow` to disable sshing to compute
|
|
68
|
+
nodes per cluster in [286](https://github.com/OSC/ood_core/pull/286).
|
|
69
|
+
- Kubernetes will now add the templated script content to a configmap in
|
|
70
|
+
[273](https://github.com/OSC/ood_core/pull/273).
|
|
71
|
+
|
|
72
|
+
### Changed
|
|
73
|
+
|
|
74
|
+
- Kubernetes username prefix no longer appends a - in [271](https://github.com/OSC/ood_core/pull/271).
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
9
78
|
## [0.16.1] - 2021-04-23
|
|
10
79
|
### Fixed
|
|
11
80
|
- memoized some allow? variables to have better support around ACLS in
|
|
@@ -305,7 +374,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
305
374
|
### Added
|
|
306
375
|
- Initial release!
|
|
307
376
|
|
|
308
|
-
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.
|
|
377
|
+
[Unreleased]: https://github.com/OSC/ood_core/compare/v0.17.4...HEAD
|
|
378
|
+
[0.17.4]: https://github.com/OSC/ood_core/compare/v0.17.3...v0.17.4
|
|
379
|
+
[0.17.3]: https://github.com/OSC/ood_core/compare/v0.17.2...v0.17.3
|
|
380
|
+
[0.17.2]: https://github.com/OSC/ood_core/compare/v0.17.1...v0.17.2
|
|
381
|
+
[0.17.1]: https://github.com/OSC/ood_core/compare/v0.17.0...v0.17.1
|
|
382
|
+
[0.17.0]: https://github.com/OSC/ood_core/compare/v0.16.1...v0.17.0
|
|
309
383
|
[0.16.1]: https://github.com/OSC/ood_core/compare/v0.16.0...v0.16.1
|
|
310
384
|
[0.16.0]: https://github.com/OSC/ood_core/compare/v0.15.1...v0.16.0
|
|
311
385
|
[0.15.1]: https://github.com/OSC/ood_core/compare/v0.15.0...v0.15.1
|
data/lib/ood_core/cluster.rb
CHANGED
|
@@ -147,6 +147,15 @@ module OodCore
|
|
|
147
147
|
@allow = acls.all?(&:allow?)
|
|
148
148
|
end
|
|
149
149
|
|
|
150
|
+
# Whether this cluster supports SSH to batch connect nodes
|
|
151
|
+
# @return [Boolean, nil] whether cluster supports SSH to batch connect node
|
|
152
|
+
def batch_connect_ssh_allow?
|
|
153
|
+
return @batch_connect_ssh_allow if defined?(@batch_connect_ssh_allow)
|
|
154
|
+
return @batch_connect_ssh_allow = nil if batch_connect_config.nil?
|
|
155
|
+
|
|
156
|
+
@batch_connect_ssh_allow = batch_connect_config.fetch(:ssh_allow, nil)
|
|
157
|
+
end
|
|
158
|
+
|
|
150
159
|
# The comparison operator
|
|
151
160
|
# @param other [#to_sym] object to compare against
|
|
152
161
|
# @return [Boolean] whether objects are equivalent
|
|
@@ -14,6 +14,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
14
14
|
attr_reader :config_file, :bin, :cluster, :mounts
|
|
15
15
|
attr_reader :all_namespaces, :using_context, :helper
|
|
16
16
|
attr_reader :username_prefix, :namespace_prefix
|
|
17
|
+
attr_reader :auto_supplemental_groups
|
|
17
18
|
|
|
18
19
|
def initialize(options = {})
|
|
19
20
|
options = options.to_h.symbolize_keys
|
|
@@ -23,8 +24,9 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
23
24
|
@cluster = options.fetch(:cluster, 'open-ondemand')
|
|
24
25
|
@mounts = options.fetch(:mounts, []).map { |m| m.to_h.symbolize_keys }
|
|
25
26
|
@all_namespaces = options.fetch(:all_namespaces, false)
|
|
26
|
-
@username_prefix = options.fetch(:username_prefix,
|
|
27
|
+
@username_prefix = options.fetch(:username_prefix, '')
|
|
27
28
|
@namespace_prefix = options.fetch(:namespace_prefix, '')
|
|
29
|
+
@auto_supplemental_groups = options.fetch(:auto_supplemental_groups, false)
|
|
28
30
|
|
|
29
31
|
@using_context = false
|
|
30
32
|
@helper = OodCore::Job::Adapters::Kubernetes::Helper.new
|
|
@@ -45,6 +47,9 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
45
47
|
raise ArgumentError, 'Must specify the script' if script.nil?
|
|
46
48
|
|
|
47
49
|
resource_yml, id = generate_id_yml(script)
|
|
50
|
+
if !script.workdir.nil? && Dir.exist?(script.workdir)
|
|
51
|
+
File.open(File.join(script.workdir, 'pod.yml'), 'w') { |f| f.write resource_yml }
|
|
52
|
+
end
|
|
48
53
|
call("#{formatted_ns_cmd} create -f -", stdin: resource_yml)
|
|
49
54
|
|
|
50
55
|
id
|
|
@@ -146,7 +151,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
146
151
|
end
|
|
147
152
|
|
|
148
153
|
def k8s_username
|
|
149
|
-
|
|
154
|
+
"#{username_prefix}#{username}"
|
|
150
155
|
end
|
|
151
156
|
|
|
152
157
|
def user
|
|
@@ -173,6 +178,19 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
173
178
|
Etc.getgrgid(run_as_group).name
|
|
174
179
|
end
|
|
175
180
|
|
|
181
|
+
def default_supplemental_groups
|
|
182
|
+
OodSupport::User.new.groups.sort_by(&:id).map(&:id).reject { |id| id < 1000 }
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def supplemental_groups(groups = [])
|
|
186
|
+
sgroups = []
|
|
187
|
+
if auto_supplemental_groups
|
|
188
|
+
sgroups.concat(default_supplemental_groups)
|
|
189
|
+
end
|
|
190
|
+
sgroups.concat(groups.to_a)
|
|
191
|
+
sgroups.uniq.sort
|
|
192
|
+
end
|
|
193
|
+
|
|
176
194
|
def default_env
|
|
177
195
|
{
|
|
178
196
|
USER: username,
|
|
@@ -180,6 +198,7 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
180
198
|
HOME: home_dir,
|
|
181
199
|
GROUP: group,
|
|
182
200
|
GID: run_as_group,
|
|
201
|
+
KUBECONFIG: '/dev/null',
|
|
183
202
|
}
|
|
184
203
|
end
|
|
185
204
|
|
|
@@ -187,12 +206,15 @@ class OodCore::Job::Adapters::Kubernetes::Batch
|
|
|
187
206
|
# create an id.
|
|
188
207
|
def generate_id_yml(script)
|
|
189
208
|
native_data = script.native
|
|
209
|
+
native_data[:container][:supplemental_groups] = supplemental_groups(native_data[:container][:supplemental_groups])
|
|
190
210
|
container = helper.container_from_native(native_data[:container], default_env)
|
|
191
211
|
id = generate_id(container.name)
|
|
192
|
-
configmap = helper.configmap_from_native(native_data, id)
|
|
212
|
+
configmap = helper.configmap_from_native(native_data, id, script.content)
|
|
193
213
|
init_containers = helper.init_ctrs_from_native(native_data[:init_containers], container.env)
|
|
194
214
|
spec = OodCore::Job::Adapters::Kubernetes::Resources::PodSpec.new(container, init_containers: init_containers)
|
|
195
215
|
all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]
|
|
216
|
+
node_selector = native_data[:node_selector].nil? ? {} : native_data[:node_selector]
|
|
217
|
+
gpu_type = native_data[:gpu_type].nil? ? "nvidia.com/gpu" : native_data[:gpu_type]
|
|
196
218
|
|
|
197
219
|
template = ERB.new(File.read(resource_file), nil, '-')
|
|
198
220
|
|
|
@@ -53,7 +53,10 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
53
53
|
cpu: container[:cpu],
|
|
54
54
|
working_dir: container[:working_dir],
|
|
55
55
|
restart_policy: container[:restart_policy],
|
|
56
|
-
|
|
56
|
+
image_pull_policy: container[:image_pull_policy],
|
|
57
|
+
image_pull_secret: container[:image_pull_secret],
|
|
58
|
+
supplemental_groups: container[:supplemental_groups],
|
|
59
|
+
startup_probe: container[:startup_probe],
|
|
57
60
|
)
|
|
58
61
|
end
|
|
59
62
|
|
|
@@ -80,10 +83,18 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
80
83
|
# the input configmap hash
|
|
81
84
|
# @param id [#to_s]
|
|
82
85
|
# the id to use for giving the configmap a name
|
|
86
|
+
# @param script_content [#to_s]
|
|
87
|
+
# the batch script content
|
|
83
88
|
# @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
|
|
84
|
-
def configmap_from_native(native, id)
|
|
85
|
-
configmap = native.fetch(:configmap,
|
|
86
|
-
|
|
89
|
+
def configmap_from_native(native, id, script_content)
|
|
90
|
+
configmap = native.fetch(:configmap, {})
|
|
91
|
+
configmap[:files] ||= []
|
|
92
|
+
configmap[:files] << {
|
|
93
|
+
filename: 'script.sh',
|
|
94
|
+
data: script_content,
|
|
95
|
+
mount_path: '/ood/script.sh',
|
|
96
|
+
sub_path: 'script.sh',
|
|
97
|
+
} unless configmap[:files].any? { |f| f[:filename] == 'script.sh' }
|
|
87
98
|
|
|
88
99
|
OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap.new(
|
|
89
100
|
configmap_name(id),
|
|
@@ -140,7 +151,7 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
140
151
|
{
|
|
141
152
|
id: json_data.dig(:metadata, :name).to_s,
|
|
142
153
|
job_name: name_from_metadata(json_data.dig(:metadata)),
|
|
143
|
-
status: pod_status_from_json(json_data),
|
|
154
|
+
status: OodCore::Job::Status.new(state: pod_status_from_json(json_data)),
|
|
144
155
|
job_owner: job_owner_from_json(json_data, ns_prefix),
|
|
145
156
|
submission_time: submission_time(json_data),
|
|
146
157
|
dispatch_time: dispatch_time(json_data),
|
|
@@ -230,15 +241,21 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
230
241
|
def submission_time(json_data)
|
|
231
242
|
status = json_data.dig(:status)
|
|
232
243
|
start = status.dig(:startTime)
|
|
244
|
+
creation = json_data.dig(:metadata, :creationTimestamp)
|
|
233
245
|
|
|
234
|
-
if
|
|
246
|
+
if !creation.nil?
|
|
247
|
+
str = creation
|
|
248
|
+
elsif !start.nil?
|
|
249
|
+
str = start
|
|
250
|
+
else
|
|
235
251
|
# the pod is in some pending state limbo
|
|
236
252
|
conditions = status.dig(:conditions)
|
|
253
|
+
return nil if conditions.nil?
|
|
254
|
+
return nil if conditions.size == 0
|
|
237
255
|
# best guess to start time is just the first condition's
|
|
238
256
|
# transition time
|
|
239
257
|
str = conditions[0].dig(:lastTransitionTime)
|
|
240
|
-
|
|
241
|
-
str = start
|
|
258
|
+
return nil if str.nil?
|
|
242
259
|
end
|
|
243
260
|
|
|
244
261
|
DateTime.parse(str).to_time.to_i
|
|
@@ -246,11 +263,21 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
246
263
|
|
|
247
264
|
def pod_status_from_json(json_data)
|
|
248
265
|
phase = json_data.dig(:status, :phase)
|
|
266
|
+
conditions = json_data.dig(:status, :conditions)
|
|
267
|
+
container_statuses = json_data.dig(:status, :containerStatuses)
|
|
268
|
+
unschedulable = conditions.to_a.any? { |c| c.dig(:reason) == "Unschedulable" }
|
|
269
|
+
ready = !container_statuses.to_a.empty? && container_statuses.to_a.all? { |s| s.dig(:ready) == true }
|
|
270
|
+
started = !container_statuses.to_a.empty? && container_statuses.to_a.any? { |s| s.fetch(:state, {}).key?(:running) }
|
|
271
|
+
return "running" if ready
|
|
272
|
+
return "queued" if phase == "Running" && started
|
|
273
|
+
|
|
249
274
|
state = case phase
|
|
250
|
-
when "Running"
|
|
251
|
-
"running"
|
|
252
275
|
when "Pending"
|
|
253
|
-
|
|
276
|
+
if unschedulable
|
|
277
|
+
"queued_held"
|
|
278
|
+
else
|
|
279
|
+
"queued"
|
|
280
|
+
end
|
|
254
281
|
when "Failed"
|
|
255
282
|
"suspended"
|
|
256
283
|
when "Succeeded"
|
|
@@ -260,8 +287,6 @@ class OodCore::Job::Adapters::Kubernetes::Helper
|
|
|
260
287
|
else
|
|
261
288
|
"undetermined"
|
|
262
289
|
end
|
|
263
|
-
|
|
264
|
-
OodCore::Job::Status.new(state: state)
|
|
265
290
|
end
|
|
266
291
|
|
|
267
292
|
def terminated_state(status)
|
|
@@ -33,13 +33,36 @@ module OodCore::Job::Adapters::Kubernetes::Resources
|
|
|
33
33
|
end
|
|
34
34
|
end
|
|
35
35
|
|
|
36
|
+
class TCPProbe
|
|
37
|
+
attr_accessor :port, :initial_delay_seconds, :failure_threshold, :period_seconds
|
|
38
|
+
|
|
39
|
+
def initialize(port, data)
|
|
40
|
+
data ||= {}
|
|
41
|
+
@port = port
|
|
42
|
+
@initial_delay_seconds = data[:initial_delay_seconds] || 2
|
|
43
|
+
@failure_threshold = data[:failure_threshold] || 5
|
|
44
|
+
@period_seconds = data[:period_seconds] || 5
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def to_h
|
|
48
|
+
{
|
|
49
|
+
port: port,
|
|
50
|
+
initial_delay_seconds: initial_delay_seconds,
|
|
51
|
+
failure_threshold: failure_threshold,
|
|
52
|
+
period_seconds: period_seconds,
|
|
53
|
+
}
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
|
|
36
57
|
class Container
|
|
37
58
|
attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
|
|
38
|
-
:restart_policy, :image_pull_secret, :supplemental_groups
|
|
59
|
+
:restart_policy, :image_pull_policy, :image_pull_secret, :supplemental_groups,
|
|
60
|
+
:startup_probe
|
|
39
61
|
|
|
40
62
|
def initialize(
|
|
41
63
|
name, image, command: [], port: nil, env: {}, memory: "4Gi", cpu: "1",
|
|
42
|
-
working_dir: "", restart_policy: "Never", image_pull_secret: nil, supplemental_groups: []
|
|
64
|
+
working_dir: "", restart_policy: "Never", image_pull_policy: nil, image_pull_secret: nil, supplemental_groups: [],
|
|
65
|
+
startup_probe: {}
|
|
43
66
|
)
|
|
44
67
|
raise ArgumentError, "containers need valid names and images" unless name && image
|
|
45
68
|
|
|
@@ -52,8 +75,10 @@ module OodCore::Job::Adapters::Kubernetes::Resources
|
|
|
52
75
|
@cpu = cpu.nil? ? "1" : cpu
|
|
53
76
|
@working_dir = working_dir.nil? ? "" : working_dir
|
|
54
77
|
@restart_policy = restart_policy.nil? ? "Never" : restart_policy
|
|
78
|
+
@image_pull_policy = image_pull_policy.nil? ? "IfNotPresent" : image_pull_policy
|
|
55
79
|
@image_pull_secret = image_pull_secret
|
|
56
80
|
@supplemental_groups = supplemental_groups.nil? ? [] : supplemental_groups
|
|
81
|
+
@startup_probe = TCPProbe.new(@port, startup_probe)
|
|
57
82
|
end
|
|
58
83
|
|
|
59
84
|
def ==(other)
|
|
@@ -66,8 +91,10 @@ module OodCore::Job::Adapters::Kubernetes::Resources
|
|
|
66
91
|
cpu == other.cpu &&
|
|
67
92
|
working_dir == other.working_dir &&
|
|
68
93
|
restart_policy == other.restart_policy &&
|
|
94
|
+
image_pull_policy == other.image_pull_policy &&
|
|
69
95
|
image_pull_secret == other.image_pull_secret &&
|
|
70
|
-
supplemental_groups == other.supplemental_groups
|
|
96
|
+
supplemental_groups == other.supplemental_groups &&
|
|
97
|
+
startup_probe.to_h == other.startup_probe.to_h
|
|
71
98
|
end
|
|
72
99
|
end
|
|
73
100
|
|
|
@@ -20,14 +20,7 @@ spec:
|
|
|
20
20
|
runAsUser: <%= run_as_user %>
|
|
21
21
|
runAsGroup: <%= run_as_group %>
|
|
22
22
|
runAsNonRoot: true
|
|
23
|
-
|
|
24
|
-
supplementalGroups: []
|
|
25
|
-
<%- else -%>
|
|
26
|
-
supplementalGroups:
|
|
27
|
-
<%- spec.container.supplemental_groups.each do |supplemental_group| -%>
|
|
28
|
-
- "<%= supplemental_group %>"
|
|
29
|
-
<%- end -%>
|
|
30
|
-
<%- end -%>
|
|
23
|
+
supplementalGroups: <%= spec.container.supplemental_groups %>
|
|
31
24
|
fsGroup: <%= fs_group %>
|
|
32
25
|
hostNetwork: false
|
|
33
26
|
hostIPC: false
|
|
@@ -39,7 +32,7 @@ spec:
|
|
|
39
32
|
containers:
|
|
40
33
|
- name: "<%= spec.container.name %>"
|
|
41
34
|
image: <%= spec.container.image %>
|
|
42
|
-
imagePullPolicy:
|
|
35
|
+
imagePullPolicy: <%= spec.container.image_pull_policy %>
|
|
43
36
|
<%- unless spec.container.working_dir.empty? -%>
|
|
44
37
|
workingDir: "<%= spec.container.working_dir %>"
|
|
45
38
|
<%- end -%>
|
|
@@ -48,6 +41,14 @@ spec:
|
|
|
48
41
|
valueFrom:
|
|
49
42
|
fieldRef:
|
|
50
43
|
fieldPath: metadata.name
|
|
44
|
+
- name: POD_NAMESPACE
|
|
45
|
+
valueFrom:
|
|
46
|
+
fieldRef:
|
|
47
|
+
fieldPath: metadata.namespace
|
|
48
|
+
<%- unless spec.container.port.nil? -%>
|
|
49
|
+
- name: POD_PORT
|
|
50
|
+
value: "<%= spec.container.port %>"
|
|
51
|
+
<%- end -%>
|
|
51
52
|
<%- spec.container.env.each_pair do |name, value| -%>
|
|
52
53
|
- name: <%= name %>
|
|
53
54
|
value: "<%= value %>"
|
|
@@ -61,9 +62,16 @@ spec:
|
|
|
61
62
|
<%- unless spec.container.port.nil? -%>
|
|
62
63
|
ports:
|
|
63
64
|
- containerPort: <%= spec.container.port %>
|
|
65
|
+
startupProbe:
|
|
66
|
+
tcpSocket:
|
|
67
|
+
port: <%= spec.container.startup_probe.port %>
|
|
68
|
+
initialDelaySeconds: <%= spec.container.startup_probe.initial_delay_seconds %>
|
|
69
|
+
failureThreshold: <%= spec.container.startup_probe.failure_threshold %>
|
|
70
|
+
periodSeconds: <%= spec.container.startup_probe.period_seconds %>
|
|
64
71
|
<%- end -%>
|
|
65
|
-
<%- if
|
|
72
|
+
<%- if !all_mounts.empty? || (!configmap.nil? && configmap.mounts?) -%>
|
|
66
73
|
volumeMounts:
|
|
74
|
+
<%- unless configmap.nil? -%>
|
|
67
75
|
<%- configmap.files.each do |file| -%>
|
|
68
76
|
<%- next if file.mount_path.nil? -%>
|
|
69
77
|
- name: configmap-volume
|
|
@@ -72,6 +80,7 @@ spec:
|
|
|
72
80
|
subPath: <%= file.sub_path %>
|
|
73
81
|
<%- end # end unless file.sub_path.nil? -%>
|
|
74
82
|
<%- end # end configmap.files.each -%>
|
|
83
|
+
<%- end # unless configmap.nil? -%>
|
|
75
84
|
<%- all_mounts.each do |mount| -%>
|
|
76
85
|
- name: <%= mount[:name] %>
|
|
77
86
|
mountPath: <%= mount[:destination_path] %>
|
|
@@ -81,25 +90,36 @@ spec:
|
|
|
81
90
|
limits:
|
|
82
91
|
memory: "<%= spec.container.memory %>"
|
|
83
92
|
cpu: "<%= spec.container.cpu %>"
|
|
93
|
+
<%- unless script.gpus_per_node.nil? -%>
|
|
94
|
+
<%= gpu_type %>: <%= script.gpus_per_node %>
|
|
95
|
+
<%- end -%>
|
|
84
96
|
requests:
|
|
85
97
|
memory: "<%= spec.container.memory %>"
|
|
86
98
|
cpu: "<%= spec.container.cpu %>"
|
|
99
|
+
<%- unless script.gpus_per_node.nil? -%>
|
|
100
|
+
<%= gpu_type %>: <%= script.gpus_per_node %>
|
|
101
|
+
<%- end -%>
|
|
87
102
|
securityContext:
|
|
88
103
|
allowPrivilegeEscalation: false
|
|
89
104
|
capabilities:
|
|
90
105
|
drop:
|
|
91
106
|
- all
|
|
92
107
|
privileged: false
|
|
93
|
-
<%- unless spec.init_containers.
|
|
108
|
+
<%- unless spec.init_containers.empty? -%>
|
|
94
109
|
initContainers:
|
|
95
110
|
<%- spec.init_containers.each do |ctr| -%>
|
|
96
111
|
- name: "<%= ctr.name %>"
|
|
97
112
|
image: "<%= ctr.image %>"
|
|
113
|
+
imagePullPolicy: <%= ctr.image_pull_policy %>
|
|
98
114
|
env:
|
|
99
115
|
- name: POD_NAME
|
|
100
116
|
valueFrom:
|
|
101
117
|
fieldRef:
|
|
102
118
|
fieldPath: metadata.name
|
|
119
|
+
- name: POD_NAMESPACE
|
|
120
|
+
valueFrom:
|
|
121
|
+
fieldRef:
|
|
122
|
+
fieldPath: metadata.namespace
|
|
103
123
|
<%- ctr.env.each_pair do |name, value| -%>
|
|
104
124
|
- name: <%= name %>
|
|
105
125
|
value: "<%= value %>"
|
|
@@ -108,8 +128,9 @@ spec:
|
|
|
108
128
|
<%- ctr.command.each do |cmd| -%>
|
|
109
129
|
- "<%= cmd %>"
|
|
110
130
|
<%- end # command loop -%>
|
|
111
|
-
<%- if
|
|
131
|
+
<%- if !all_mounts.empty? || (!configmap.nil? && configmap.init_mounts?) -%>
|
|
112
132
|
volumeMounts:
|
|
133
|
+
<%- unless configmap.nil? -%>
|
|
113
134
|
<%- configmap.files.each do |file| -%>
|
|
114
135
|
<%- next if file.init_mount_path.nil? -%>
|
|
115
136
|
- name: configmap-volume
|
|
@@ -118,6 +139,7 @@ spec:
|
|
|
118
139
|
subPath: <%= file.init_sub_path %>
|
|
119
140
|
<%- end # end unless file.sub_path.nil? -%>
|
|
120
141
|
<%- end # end configmap.files.each -%>
|
|
142
|
+
<%- end # unless configmap.nil? -%>
|
|
121
143
|
<%- all_mounts.each do |mount| -%>
|
|
122
144
|
- name: <%= mount[:name] %>
|
|
123
145
|
mountPath: <%= mount[:destination_path] %>
|
|
@@ -152,6 +174,12 @@ spec:
|
|
|
152
174
|
<%- end # if mount is [host,nfs] -%>
|
|
153
175
|
<%- end # for each mount -%>
|
|
154
176
|
<%- end # (configmap.to_s.empty? || all_mounts.empty?) -%>
|
|
177
|
+
<%- unless node_selector.empty? -%>
|
|
178
|
+
nodeSelector:
|
|
179
|
+
<%- node_selector.each_pair do |key, value| -%>
|
|
180
|
+
<%= key %>: "<%= value %>"
|
|
181
|
+
<%- end # node_selector.each_pair -%>
|
|
182
|
+
<%- end #unless node_selector.empty? -%>
|
|
155
183
|
---
|
|
156
184
|
<%- unless spec.container.port.nil? -%>
|
|
157
185
|
apiVersion: v1
|
|
@@ -161,6 +189,8 @@ metadata:
|
|
|
161
189
|
namespace: <%= namespace %>
|
|
162
190
|
labels:
|
|
163
191
|
job: <%= id %>
|
|
192
|
+
app.kubernetes.io/name: <%= container.name %>
|
|
193
|
+
app.kubernetes.io/managed-by: open-ondemand
|
|
164
194
|
spec:
|
|
165
195
|
selector:
|
|
166
196
|
job: <%= id %>
|
|
@@ -170,8 +200,8 @@ spec:
|
|
|
170
200
|
targetPort: <%= spec.container.port %>
|
|
171
201
|
type: NodePort
|
|
172
202
|
<%- end # end for service -%>
|
|
173
|
-
---
|
|
174
203
|
<%- unless configmap.nil? -%>
|
|
204
|
+
---
|
|
175
205
|
apiVersion: v1
|
|
176
206
|
kind: ConfigMap
|
|
177
207
|
metadata:
|
|
@@ -179,6 +209,8 @@ metadata:
|
|
|
179
209
|
namespace: <%= namespace %>
|
|
180
210
|
labels:
|
|
181
211
|
job: <%= id %>
|
|
212
|
+
app.kubernetes.io/name: <%= container.name %>
|
|
213
|
+
app.kubernetes.io/managed-by: open-ondemand
|
|
182
214
|
data:
|
|
183
215
|
<%- configmap.files.each do |file| -%>
|
|
184
216
|
<%- next if file.data.nil? || file.filename.nil? -%>
|
|
@@ -423,6 +423,7 @@ module OodCore
|
|
|
423
423
|
args.concat ["-t", seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
|
|
424
424
|
args.concat ['-a', script.job_array_request] unless script.job_array_request.nil?
|
|
425
425
|
args.concat ['--qos', script.qos] unless script.qos.nil?
|
|
426
|
+
args.concat ['--gpus-per-node', script.gpus_per_node] unless script.gpus_per_node.nil?
|
|
426
427
|
# ignore nodes, don't know how to do this for slurm
|
|
427
428
|
|
|
428
429
|
# Set dependencies
|
|
@@ -159,6 +159,8 @@ module OodCore
|
|
|
159
159
|
args.concat ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
|
|
160
160
|
args.concat ['-t', script.job_array_request] unless script.job_array_request.nil?
|
|
161
161
|
args.concat ['-l', "qos=#{script.qos}"] unless script.qos.nil?
|
|
162
|
+
args.concat ['-l', "gpus=#{script.gpus_per_node}"] unless script.gpus_per_node.nil?
|
|
163
|
+
|
|
162
164
|
# Set environment variables
|
|
163
165
|
env = script.job_environment.to_h
|
|
164
166
|
args.concat ["-v", env.keys.join(",")] unless env.empty?
|
data/lib/ood_core/job/script.rb
CHANGED
|
@@ -103,6 +103,10 @@ module OodCore
|
|
|
103
103
|
# @return [String, nil] qos
|
|
104
104
|
attr_reader :qos
|
|
105
105
|
|
|
106
|
+
# The GPUs per node for the job
|
|
107
|
+
# @return [Integer, nil] gpus per node
|
|
108
|
+
attr_reader :gpus_per_node
|
|
109
|
+
|
|
106
110
|
# Object detailing any native specifications that are implementation specific
|
|
107
111
|
# @note Should be avoided at all costs.
|
|
108
112
|
# @return [Object, nil] native specifications
|
|
@@ -136,6 +140,7 @@ module OodCore
|
|
|
136
140
|
# @param accounting_id [#to_s, nil] accounting id
|
|
137
141
|
# @param job_array_request [#to_s, nil] job array request
|
|
138
142
|
# @param qos [#to_s, nil] qos
|
|
143
|
+
# @param gpus_per_node [#to_i, nil] gpus per node
|
|
139
144
|
# @param native [Object, nil] native specifications
|
|
140
145
|
# @param copy_environment [Boolean, nil] copy the environment
|
|
141
146
|
def initialize(content:, args: nil, submit_as_hold: nil, rerunnable: nil,
|
|
@@ -145,7 +150,7 @@ module OodCore
|
|
|
145
150
|
output_path: nil, error_path: nil, reservation_id: nil,
|
|
146
151
|
queue_name: nil, priority: nil, start_time: nil,
|
|
147
152
|
wall_time: nil, accounting_id: nil, job_array_request: nil,
|
|
148
|
-
qos: nil, native: nil, copy_environment: nil, **_)
|
|
153
|
+
qos: nil, gpus_per_node: nil, native: nil, copy_environment: nil, **_)
|
|
149
154
|
@content = content.to_s
|
|
150
155
|
|
|
151
156
|
@submit_as_hold = submit_as_hold
|
|
@@ -170,6 +175,7 @@ module OodCore
|
|
|
170
175
|
@accounting_id = accounting_id && accounting_id.to_s
|
|
171
176
|
@job_array_request = job_array_request && job_array_request.to_s
|
|
172
177
|
@qos = qos && qos.to_s
|
|
178
|
+
@gpus_per_node = gpus_per_node && gpus_per_node.to_i
|
|
173
179
|
@native = native
|
|
174
180
|
@copy_environment = (copy_environment.nil?) ? nil : !! copy_environment
|
|
175
181
|
end
|
|
@@ -200,6 +206,7 @@ module OodCore
|
|
|
200
206
|
accounting_id: accounting_id,
|
|
201
207
|
job_array_request: job_array_request,
|
|
202
208
|
qos: qos,
|
|
209
|
+
gpus_per_node: gpus_per_node,
|
|
203
210
|
native: native,
|
|
204
211
|
copy_environment: copy_environment
|
|
205
212
|
}
|
data/lib/ood_core/version.rb
CHANGED
data/ood_core.gemspec
CHANGED
|
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
|
|
|
15
15
|
spec.license = "MIT"
|
|
16
16
|
|
|
17
17
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
|
18
|
-
f.match(%r{^(test|spec|features)/})
|
|
18
|
+
f.match(%r{^(test|spec|features|.github)/})
|
|
19
19
|
end
|
|
20
20
|
spec.bindir = "exe"
|
|
21
21
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ood_core
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.17.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Eric Franz
|
|
@@ -10,7 +10,7 @@ authors:
|
|
|
10
10
|
autorequire:
|
|
11
11
|
bindir: exe
|
|
12
12
|
cert_chain: []
|
|
13
|
-
date: 2021-
|
|
13
|
+
date: 2021-07-29 00:00:00.000000000 Z
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
16
16
|
name: ood_support
|
|
@@ -160,7 +160,6 @@ executables: []
|
|
|
160
160
|
extensions: []
|
|
161
161
|
extra_rdoc_files: []
|
|
162
162
|
files:
|
|
163
|
-
- ".github/workflows/test.yml"
|
|
164
163
|
- ".gitignore"
|
|
165
164
|
- ".rspec"
|
|
166
165
|
- CHANGELOG.md
|
data/.github/workflows/test.yml
DELETED
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
name: Unit Tests
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
push:
|
|
5
|
-
branches:
|
|
6
|
-
- master
|
|
7
|
-
pull_request:
|
|
8
|
-
branches:
|
|
9
|
-
- master
|
|
10
|
-
|
|
11
|
-
jobs:
|
|
12
|
-
tests:
|
|
13
|
-
runs-on: ubuntu-latest
|
|
14
|
-
|
|
15
|
-
steps:
|
|
16
|
-
- name: checkout
|
|
17
|
-
uses: actions/checkout@v2
|
|
18
|
-
|
|
19
|
-
- name: Setup Ruby using Bundler
|
|
20
|
-
uses: ruby/setup-ruby@v1
|
|
21
|
-
with:
|
|
22
|
-
ruby-version: "2.7.1"
|
|
23
|
-
bundler-cache: true
|
|
24
|
-
bundler: "2.1.4"
|
|
25
|
-
|
|
26
|
-
- name: install gems
|
|
27
|
-
run: bundle install
|
|
28
|
-
|
|
29
|
-
- name: test
|
|
30
|
-
run: bundle exec rake spec
|