ood_core 0.15.0 → 0.17.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 766b778b98f189dee73ff1cb70a2b0acf53d628a897288260a1b8cf7cb80c0c6
4
- data.tar.gz: 03052a68c57de5fe76b795dfd76b66639520d3e9b055c324fbece05db71dd331
3
+ metadata.gz: 71272779185c8dee0b38f361c14419b77a9a497d46fc74a30b31735882f2f99b
4
+ data.tar.gz: 827b6d710a0a31f279cc370bd00d49b2d56c3cd31d009d155ee6ad8d2967e552
5
5
  SHA512:
6
- metadata.gz: c678069d0a37762a706a020c5a7ad7a7354ed3f1edb01fdd25b915065b50754d43c60df48c9ea3b773f3dc4ddb4e12604e9dda02a1ad30b0482e7ab050804181
7
- data.tar.gz: 8416227140b6d761f6246f0cce50e41f4462f83b1913ad9e72b511685412810b7b2c7c9951f8c712b127572b421e8a57cd6cdb4dedb7cab9c242eba0d057ad45
6
+ metadata.gz: e42c8b974608a23ac3973fd68d40042e002e61c0b5a2d17877626578b7ceb178ae94ba3881c26003a8b69800075669410b226cd12c7cfe8350ba21fdd75a9329
7
+ data.tar.gz: 2f149dedb64a5806626a827d43ceab368cdd1f026a789bb9ece6189596e0e7cc2a5009d98053eafdcd090c49befa688fcf8ccc1f70593635a2a3a4bf5535b0e3
data/CHANGELOG.md CHANGED
@@ -6,6 +6,87 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
6
6
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
7
7
 
8
8
  ## [Unreleased]
9
+
10
+ ## [0.17.1] - 2021-06-14
11
+
12
+ ### Fixed
13
+
14
+ - Fixed [278](https://github.com/OSC/ood_core/pull/278) where unschedulable pods will now show up as
15
+ queued_held status.
16
+
17
+ ### Changed
18
+
19
+ - KUBECONFIG now defaults to /dev/null in the kubernetes adapter in [292](https://github.com/OSC/ood_core/pull/292).
20
+
21
+ ### Added
22
+
23
+ - Sites can now set `batch_connect.ssh_allow` on the cluster to disable the buttons to start
24
+ a shell session to compute nodes in [289](https://github.com/OSC/ood_core/pull/289).
25
+ - `POD_PORT` is now available to jobs in the kubernetes adapter in [290](https://github.com/OSC/ood_core/pull/290).
26
+ - Kubernetes pods now support a startupProbe in [291](https://github.com/OSC/ood_core/pull/291).
27
+
28
+ ## [0.17.0] - 2021-05-26
29
+
30
+ ### Fixed
31
+
32
+ - All Kubernetes resources now have the same labels in [280](https://github.com/OSC/ood_core/pull/280).
33
+ - Kubernetes does not crash when no configmap is defined in [282](https://github.com/OSC/ood_core/pull/282).
34
+ - Kubernetes will not specify init containers if there are none in
35
+ [284](https://github.com/OSC/ood_core/pull/284).
36
+
37
+ ### Added
38
+
39
+ - Kubernetes, Slurm and Torque now support the script option `gpus_per_node` in
40
+ [266](https://github.com/OSC/ood_core/pull/266).
41
+ - Kubernetes will now save the pod.yml into the staged root in
42
+ [277](https://github.com/OSC/ood_core/pull/277).
43
+ - Kubernetes now allows for node selector in [264](https://github.com/OSC/ood_core/pull/264).
44
+ - Kubernetes pods now have access to the environment variable POD_NAMESPACE in
45
+ [275](https://github.com/OSC/ood_core/pull/275).
46
+ - Kubernetes pods can now specify the image pull policy in [272](https://github.com/OSC/ood_core/pull/272).
47
+ - Cluster config's batch_connect now supports `ssh_allow` to disable sshing to compute
48
+ nodes per cluster in [286](https://github.com/OSC/ood_core/pull/286).
49
+ - Kubernetes will now add the templated script content to a configmap in
50
+ [273](https://github.com/OSC/ood_core/pull/273).
51
+
52
+ ### Changed
53
+
54
+ - Kubernetes username prefix no longer appends a - in [271](https://github.com/OSC/ood_core/pull/271).
55
+
56
+
57
+
58
+ ## [0.16.1] - 2021-04-23
59
+ ### Fixed
60
+ - memoized some allow? variables to have better support around ACLs in
61
+ [267](https://github.com/OSC/ood_core/pull/267)
62
+
63
+ ## [0.16.0] - 2021-04-20
64
+ ### Fixed
65
+ - tmux 2.7+ bug in the linux host adapter in [258](https://github.com/OSC/ood_core/pull/258)
66
+ and [259](https://github.com/OSC/ood_core/pull/259).
67
+
68
+ ### Changed
69
+
70
+ - Changed how k8s configmaps are defined in [251](https://github.com/OSC/ood_core/pull/251).
71
+ The data structure now expects a key called files which is an array of objects that hold
72
+ filename, data, mount_path, sub_path and init_mount_path.
73
+ [255](https://github.com/OSC/ood_core/pull/255) also relates to this interface change.
74
+
75
+ ### Added
76
+
77
+ - The k8s adapter can now specify environment variables and creates defaults
78
+ in [252](https://github.com/OSC/ood_core/pull/252).
79
+ - The k8s adapter can now specify image pull secrets in [253](https://github.com/OSC/ood_core/pull/253).
80
+
81
+ ## [0.15.1] - 2021-02-25
82
+ ### Fixed
83
+ - kubernetes adapter uses the full module for helpers in [245](https://github.com/OSC/ood_core/pull/245).
84
+
85
+ ### Changed
86
+ - kubernetes pods spawn with runAsNonRoot set to true in [247](https://github.com/OSC/ood_core/pull/247).
87
+ - kubernetes pods can spawn with supplemental groups along with some other security defaults in
88
+ [246](https://github.com/OSC/ood_core/pull/246).
89
+
9
90
  ## [0.15.0] - 2021-01-26
10
91
  ### Fixed
11
92
  - ccq adapter now accepts job names with spaces in [210](https://github.com/OSC/ood_core/pull/209)
@@ -273,7 +354,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
273
354
  ### Added
274
355
  - Initial release!
275
356
 
276
- [Unreleased]: https://github.com/OSC/ood_core/compare/v0.15.0...HEAD
357
+ [Unreleased]: https://github.com/OSC/ood_core/compare/v0.17.1...HEAD
358
+ [0.17.1]: https://github.com/OSC/ood_core/compare/v0.17.0...v0.17.1
359
+ [0.17.0]: https://github.com/OSC/ood_core/compare/v0.16.1...v0.17.0
360
+ [0.16.1]: https://github.com/OSC/ood_core/compare/v0.16.0...v0.16.1
361
+ [0.16.0]: https://github.com/OSC/ood_core/compare/v0.15.1...v0.16.0
362
+ [0.15.1]: https://github.com/OSC/ood_core/compare/v0.15.0...v0.15.1
277
363
  [0.15.0]: https://github.com/OSC/ood_core/compare/v0.14.0...v0.15.0
278
364
  [0.14.0]: https://github.com/OSC/ood_core/compare/v0.13.0...v0.14.0
279
365
  [0.13.0]: https://github.com/OSC/ood_core/compare/v0.12.0...v0.13.0
@@ -78,7 +78,9 @@ module OodCore
78
78
  # Whether the login feature is allowed
79
79
  # @return [Boolean] is login allowed
80
80
  def login_allow?
81
- allow? && !login_config.empty?
81
+ return @login_allow if defined?(@login_allow)
82
+
83
+ @login_allow = (allow? && !login_config.empty?)
82
84
  end
83
85
 
84
86
  # Build a job adapter from the job configuration
@@ -90,9 +92,11 @@ module OodCore
90
92
  # Whether the job feature is allowed based on the ACLs
91
93
  # @return [Boolean] is the job feature allowed
92
94
  def job_allow?
93
- allow? &&
94
- !job_config.empty? &&
95
- build_acls(job_config.fetch(:acls, []).map(&:to_h)).all?(&:allow?)
95
+ return @job_allow if defined?(@job_allow)
96
+
97
+ @job_allow = (allow? && ! job_config.empty? && build_acls(
98
+ job_config.fetch(:acls, []).map(&:to_h)
99
+ ).all?(&:allow?))
96
100
  end
97
101
 
98
102
  # The batch connect template configuration used for this cluster
@@ -138,7 +142,18 @@ module OodCore
138
142
  # Whether this cluster is allowed to be used
139
143
  # @return [Boolean] whether cluster is allowed
140
144
  def allow?
141
- acls.all?(&:allow?)
145
+ return @allow if defined?(@allow)
146
+
147
+ @allow = acls.all?(&:allow?)
148
+ end
149
+
150
+ # Whether this cluster supports SSH to batch connect nodes
151
+ # @return [Boolean, nil] whether cluster supports SSH to batch connect node
152
+ def batch_connect_ssh_allow?
153
+ return @batch_connect_ssh_allow if defined?(@batch_connect_ssh_allow)
154
+ return @batch_connect_ssh_allow = nil if batch_connect_config.nil?
155
+
156
+ @batch_connect_ssh_allow = batch_connect_config.fetch(:ssh_allow, nil)
142
157
  end
143
158
 
144
159
  # The comparison operator
@@ -23,11 +23,11 @@ class OodCore::Job::Adapters::Kubernetes::Batch
23
23
  @cluster = options.fetch(:cluster, 'open-ondemand')
24
24
  @mounts = options.fetch(:mounts, []).map { |m| m.to_h.symbolize_keys }
25
25
  @all_namespaces = options.fetch(:all_namespaces, false)
26
- @username_prefix = options.fetch(:username_prefix, nil)
26
+ @username_prefix = options.fetch(:username_prefix, '')
27
27
  @namespace_prefix = options.fetch(:namespace_prefix, '')
28
28
 
29
29
  @using_context = false
30
- @helper = Helper.new
30
+ @helper = OodCore::Job::Adapters::Kubernetes::Helper.new
31
31
 
32
32
  begin
33
33
  make_kubectl_config(options)
@@ -45,6 +45,9 @@ class OodCore::Job::Adapters::Kubernetes::Batch
45
45
  raise ArgumentError, 'Must specify the script' if script.nil?
46
46
 
47
47
  resource_yml, id = generate_id_yml(script)
48
+ if !script.workdir.nil? && Dir.exist?(script.workdir)
49
+ File.open(File.join(script.workdir, 'pod.yml'), 'w') { |f| f.write resource_yml }
50
+ end
48
51
  call("#{formatted_ns_cmd} create -f -", stdin: resource_yml)
49
52
 
50
53
  id
@@ -112,10 +115,6 @@ class OodCore::Job::Adapters::Kubernetes::Batch
112
115
  safe_call("delete", "configmap", configmap_name(id))
113
116
  end
114
117
 
115
- def configmap_mount_path
116
- '/ood'
117
- end
118
-
119
118
  private
120
119
 
121
120
  def safe_call(verb, resource, id)
@@ -150,31 +149,56 @@ class OodCore::Job::Adapters::Kubernetes::Batch
150
149
  end
151
150
 
152
151
  def k8s_username
153
- username_prefix.nil? ? username : "#{username_prefix}-#{username}"
152
+ "#{username_prefix}#{username}"
153
+ end
154
+
155
+ def user
156
+ @user ||= Etc.getpwnam(username)
157
+ end
158
+
159
+ def home_dir
160
+ user.dir
154
161
  end
155
162
 
156
163
  def run_as_user
157
- Etc.getpwnam(username).uid
164
+ user.uid
158
165
  end
159
166
 
160
167
  def run_as_group
161
- Etc.getpwnam(username).gid
168
+ user.gid
162
169
  end
163
170
 
164
171
  def fs_group
165
172
  run_as_group
166
173
  end
167
174
 
175
+ def group
176
+ Etc.getgrgid(run_as_group).name
177
+ end
178
+
179
+ def default_env
180
+ {
181
+ USER: username,
182
+ UID: run_as_user,
183
+ HOME: home_dir,
184
+ GROUP: group,
185
+ GID: run_as_group,
186
+ KUBECONFIG: '/dev/null',
187
+ }
188
+ end
189
+
168
190
  # helper to template resource yml you're going to submit and
169
191
  # create an id.
170
192
  def generate_id_yml(script)
171
193
  native_data = script.native
172
- container = helper.container_from_native(native_data[:container])
194
+ container = helper.container_from_native(native_data[:container], default_env)
173
195
  id = generate_id(container.name)
174
- configmap = helper.configmap_from_native(native_data, id)
175
- init_containers = helper.init_ctrs_from_native(native_data[:init_containers])
176
- spec = Kubernetes::Resources::PodSpec.new(container, init_containers: init_containers)
196
+ configmap = helper.configmap_from_native(native_data, id, script.content)
197
+ init_containers = helper.init_ctrs_from_native(native_data[:init_containers], container.env)
198
+ spec = OodCore::Job::Adapters::Kubernetes::Resources::PodSpec.new(container, init_containers: init_containers)
177
199
  all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]
200
+ node_selector = native_data[:node_selector].nil? ? {} : native_data[:node_selector]
201
+ gpu_type = native_data[:gpu_type].nil? ? "nvidia.com/gpu" : native_data[:gpu_type]
178
202
 
179
203
  template = ERB.new(File.read(resource_file), nil, '-')
180
204
 
@@ -29,7 +29,7 @@ class OodCore::Job::Adapters::Kubernetes::Helper
29
29
 
30
30
  pod_hash.deep_merge!(service_hash)
31
31
  pod_hash.deep_merge!(secret_hash)
32
- K8sJobInfo.new(pod_hash)
32
+ OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(pod_hash)
33
33
  rescue NoMethodError
34
34
  raise K8sDataError, "unable to read data correctly from json"
35
35
  end
@@ -38,18 +38,24 @@ class OodCore::Job::Adapters::Kubernetes::Helper
38
38
  #
39
39
  # @param container [#to_h]
40
40
  # the input container hash
41
+ # @param default_env [#to_h]
42
+ # Default env to merge with defined env
41
43
  # @return [OodCore::Job::Adapters::Kubernetes::Resources::Container]
42
- def container_from_native(container)
43
- Kubernetes::Resources::Container.new(
44
+ def container_from_native(container, default_env)
45
+ env = container.fetch(:env, {}).to_h.symbolize_keys
46
+ OodCore::Job::Adapters::Kubernetes::Resources::Container.new(
44
47
  container[:name],
45
48
  container[:image],
46
49
  command: parse_command(container[:command]),
47
50
  port: container[:port],
48
- env: container.fetch(:env, []),
51
+ env: default_env.merge(env),
49
52
  memory: container[:memory],
50
53
  cpu: container[:cpu],
51
54
  working_dir: container[:working_dir],
52
- restart_policy: container[:restart_policy]
55
+ restart_policy: container[:restart_policy],
56
+ image_pull_policy: container[:image_pull_policy],
57
+ image_pull_secret: container[:image_pull_secret],
58
+ startup_probe: container[:startup_probe],
53
59
  )
54
60
  end
55
61
 
@@ -76,15 +82,22 @@ class OodCore::Job::Adapters::Kubernetes::Helper
76
82
  # the input configmap hash
77
83
  # @param id [#to_s]
78
84
  # the id to use for giving the configmap a name
85
+ # @param script_content [#to_s]
86
+ # the batch script content
79
87
  # @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
80
- def configmap_from_native(native, id)
81
- configmap = native.fetch(:configmap, nil)
82
- return nil if configmap.nil?
83
-
84
- Kubernetes::Resources::ConfigMap.new(
88
+ def configmap_from_native(native, id, script_content)
89
+ configmap = native.fetch(:configmap, {})
90
+ configmap[:files] ||= []
91
+ configmap[:files] << {
92
+ filename: 'script.sh',
93
+ data: script_content,
94
+ mount_path: '/ood/script.sh',
95
+ sub_path: 'script.sh',
96
+ } unless configmap[:files].any? { |f| f[:filename] == 'script.sh' }
97
+
98
+ OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap.new(
85
99
  configmap_name(id),
86
- configmap[:filename],
87
- configmap[:data]
100
+ (configmap[:files] || [])
88
101
  )
89
102
  end
90
103
 
@@ -93,13 +106,15 @@ class OodCore::Job::Adapters::Kubernetes::Helper
93
106
  # @param native_data [#to_h]
94
107
  # the native data to parse. Expected key init_ctrs and for that
95
108
  # key to be an array of hashes.
109
+ # @param default_env [#to_h]
110
+ # Default env to merge with defined env
96
111
  # @return [Array<OodCore::Job::Adapters::Kubernetes::Resources::Container>]
97
112
  # the array of init containers
98
- def init_ctrs_from_native(ctrs)
113
+ def init_ctrs_from_native(ctrs, default_env)
99
114
  init_ctrs = []
100
115
 
101
116
  ctrs&.each do |ctr_raw|
102
- ctr = container_from_native(ctr_raw)
117
+ ctr = container_from_native(ctr_raw, default_env)
103
118
  init_ctrs.push(ctr)
104
119
  end
105
120
 
@@ -225,15 +240,21 @@ class OodCore::Job::Adapters::Kubernetes::Helper
225
240
  def submission_time(json_data)
226
241
  status = json_data.dig(:status)
227
242
  start = status.dig(:startTime)
243
+ creation = json_data.dig(:metadata, :creationTimestamp)
228
244
 
229
- if start.nil?
245
+ if !creation.nil?
246
+ str = creation
247
+ elsif !start.nil?
248
+ str = start
249
+ else
230
250
  # the pod is in some pending state limbo
231
251
  conditions = status.dig(:conditions)
252
+ return nil if conditions.nil?
253
+ return nil if conditions.size == 0
232
254
  # best guess to start time is just the first condition's
233
255
  # transition time
234
256
  str = conditions[0].dig(:lastTransitionTime)
235
- else
236
- str = start
257
+ return nil if str.nil?
237
258
  end
238
259
 
239
260
  DateTime.parse(str).to_time.to_i
@@ -241,11 +262,17 @@ class OodCore::Job::Adapters::Kubernetes::Helper
241
262
 
242
263
  def pod_status_from_json(json_data)
243
264
  phase = json_data.dig(:status, :phase)
265
+ conditions = json_data.dig(:status, :conditions)
266
+ unschedulable = conditions.to_a.any? { |c| c.dig(:reason) == "Unschedulable" }
244
267
  state = case phase
245
268
  when "Running"
246
269
  "running"
247
270
  when "Pending"
248
- "queued"
271
+ if unschedulable
272
+ "queued_held"
273
+ else
274
+ "queued"
275
+ end
249
276
  when "Failed"
250
277
  "suspended"
251
278
  when "Succeeded"
@@ -1,22 +1,68 @@
1
1
  module OodCore::Job::Adapters::Kubernetes::Resources
2
2
 
3
3
  class ConfigMap
4
- attr_accessor :name, :filename, :data
4
+ attr_accessor :name, :files
5
5
 
6
- def initialize(name, filename, data)
6
+ def initialize(name, files)
7
7
  @name = name
8
- @filename = filename
9
- @data = data
8
+ @files = []
9
+ files.each do |f|
10
+ @files << ConfigMapFile.new(f)
11
+ end
12
+ end
13
+
14
+ def mounts?
15
+ @files.any? { |f| f.mount_path }
16
+ end
17
+
18
+ def init_mounts?
19
+ @files.any? { |f| f.init_mount_path }
20
+ end
21
+ end
22
+
23
+ class ConfigMapFile
24
+ attr_accessor :filename, :data, :mount_path, :sub_path, :init_mount_path, :init_sub_path
25
+
26
+ def initialize(data)
27
+ @filename = data[:filename]
28
+ @data = data[:data]
29
+ @mount_path = data[:mount_path]
30
+ @sub_path = data[:sub_path]
31
+ @init_mount_path = data[:init_mount_path]
32
+ @init_sub_path = data[:init_sub_path]
33
+ end
34
+ end
35
+
36
+ class TCPProbe
37
+ attr_accessor :port, :initial_delay_seconds, :failure_threshold, :period_seconds
38
+
39
+ def initialize(port, data)
40
+ data ||= {}
41
+ @port = port
42
+ @initial_delay_seconds = data[:initial_delay_seconds] || 2
43
+ @failure_threshold = data[:failure_threshold] || 5
44
+ @period_seconds = data[:period_seconds] || 5
45
+ end
46
+
47
+ def to_h
48
+ {
49
+ port: port,
50
+ initial_delay_seconds: initial_delay_seconds,
51
+ failure_threshold: failure_threshold,
52
+ period_seconds: period_seconds,
53
+ }
10
54
  end
11
55
  end
12
56
 
13
57
  class Container
14
58
  attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
15
- :restart_policy
59
+ :restart_policy, :image_pull_policy, :image_pull_secret, :supplemental_groups,
60
+ :startup_probe
16
61
 
17
62
  def initialize(
18
- name, image, command: [], port: nil, env: [], memory: "4Gi", cpu: "1",
19
- working_dir: "", restart_policy: "Never"
63
+ name, image, command: [], port: nil, env: {}, memory: "4Gi", cpu: "1",
64
+ working_dir: "", restart_policy: "Never", image_pull_policy: nil, image_pull_secret: nil, supplemental_groups: [],
65
+ startup_probe: {}
20
66
  )
21
67
  raise ArgumentError, "containers need valid names and images" unless name && image
22
68
 
@@ -24,11 +70,15 @@ module OodCore::Job::Adapters::Kubernetes::Resources
24
70
  @image = image
25
71
  @command = command.nil? ? [] : command
26
72
  @port = port&.to_i
27
- @env = env.nil? ? [] : env
73
+ @env = env.nil? ? {} : env
28
74
  @memory = memory.nil? ? "4Gi" : memory
29
75
  @cpu = cpu.nil? ? "1" : cpu
30
76
  @working_dir = working_dir.nil? ? "" : working_dir
31
77
  @restart_policy = restart_policy.nil? ? "Never" : restart_policy
78
+ @image_pull_policy = image_pull_policy.nil? ? "IfNotPresent" : image_pull_policy
79
+ @image_pull_secret = image_pull_secret
80
+ @supplemental_groups = supplemental_groups.nil? ? [] : supplemental_groups
81
+ @startup_probe = TCPProbe.new(@port, startup_probe)
32
82
  end
33
83
 
34
84
  def ==(other)
@@ -40,9 +90,12 @@ module OodCore::Job::Adapters::Kubernetes::Resources
40
90
  memory == other.memory &&
41
91
  cpu == other.cpu &&
42
92
  working_dir == other.working_dir &&
43
- restart_policy == other.restart_policy
93
+ restart_policy == other.restart_policy &&
94
+ image_pull_policy == other.image_pull_policy &&
95
+ image_pull_secret == other.image_pull_secret &&
96
+ supplemental_groups == other.supplemental_groups &&
97
+ startup_probe.to_h == other.startup_probe.to_h
44
98
  end
45
-
46
99
  end
47
100
 
48
101
  class PodSpec
@@ -19,21 +19,47 @@ spec:
19
19
  securityContext:
20
20
  runAsUser: <%= run_as_user %>
21
21
  runAsGroup: <%= run_as_group %>
22
+ runAsNonRoot: true
23
+ <%- if spec.container.supplemental_groups.empty? -%>
24
+ supplementalGroups: []
25
+ <%- else -%>
26
+ supplementalGroups:
27
+ <%- spec.container.supplemental_groups.each do |supplemental_group| -%>
28
+ - "<%= supplemental_group %>"
29
+ <%- end -%>
30
+ <%- end -%>
22
31
  fsGroup: <%= fs_group %>
32
+ hostNetwork: false
33
+ hostIPC: false
34
+ hostPID: false
35
+ <%- unless spec.container.image_pull_secret.nil? -%>
36
+ imagePullSecrets:
37
+ - name: <%= spec.container.image_pull_secret %>
38
+ <%- end -%>
23
39
  containers:
24
40
  - name: "<%= spec.container.name %>"
25
41
  image: <%= spec.container.image %>
26
- imagePullPolicy: IfNotPresent
42
+ imagePullPolicy: <%= spec.container.image_pull_policy %>
27
43
  <%- unless spec.container.working_dir.empty? -%>
28
44
  workingDir: "<%= spec.container.working_dir %>"
29
45
  <%- end -%>
30
- <%- unless spec.container.env.empty? -%>
31
46
  env:
32
- <%- spec.container.env.each do |env| -%>
33
- - name: <%= env[:name] %>
34
- value: "<%= env[:value] %>"
47
+ - name: POD_NAME
48
+ valueFrom:
49
+ fieldRef:
50
+ fieldPath: metadata.name
51
+ - name: POD_NAMESPACE
52
+ valueFrom:
53
+ fieldRef:
54
+ fieldPath: metadata.namespace
55
+ <%- unless spec.container.port.nil? -%>
56
+ - name: POD_PORT
57
+ value: "<%= spec.container.port %>"
58
+ <%- end -%>
59
+ <%- spec.container.env.each_pair do |name, value| -%>
60
+ - name: <%= name %>
61
+ value: "<%= value %>"
35
62
  <%- end # for each env -%>
36
- <%- end # unless env is nil -%>
37
63
  <%- unless spec.container.command.empty? -%>
38
64
  command:
39
65
  <%- spec.container.command.each do |cmd| -%>
@@ -43,41 +69,95 @@ spec:
43
69
  <%- unless spec.container.port.nil? -%>
44
70
  ports:
45
71
  - containerPort: <%= spec.container.port %>
72
+ startupProbe:
73
+ tcpSocket:
74
+ port: <%= spec.container.startup_probe.port %>
75
+ initialDelaySeconds: <%= spec.container.startup_probe.initial_delay_seconds %>
76
+ failureThreshold: <%= spec.container.startup_probe.failure_threshold %>
77
+ periodSeconds: <%= spec.container.startup_probe.period_seconds %>
46
78
  <%- end -%>
79
+ <%- if !all_mounts.empty? || (!configmap.nil? && configmap.mounts?) -%>
47
80
  volumeMounts:
48
81
  <%- unless configmap.nil? -%>
82
+ <%- configmap.files.each do |file| -%>
83
+ <%- next if file.mount_path.nil? -%>
49
84
  - name: configmap-volume
50
- mountPath: <%= configmap_mount_path %>
51
- <%- end -%>
85
+ mountPath: <%= file.mount_path %>
86
+ <%- unless file.sub_path.nil? -%>
87
+ subPath: <%= file.sub_path %>
88
+ <%- end # end unless file.sub_path.nil? -%>
89
+ <%- end # end configmap.files.each -%>
90
+ <%- end # unless configmap.nil? -%>
52
91
  <%- all_mounts.each do |mount| -%>
53
92
  - name: <%= mount[:name] %>
54
93
  mountPath: <%= mount[:destination_path] %>
55
94
  <%- end # for each mount -%>
95
+ <%- end # configmap mounts? and all_mounts not empty -%>
56
96
  resources:
57
97
  limits:
58
98
  memory: "<%= spec.container.memory %>"
59
99
  cpu: "<%= spec.container.cpu %>"
100
+ <%- unless script.gpus_per_node.nil? -%>
101
+ <%= gpu_type %>: <%= script.gpus_per_node %>
102
+ <%- end -%>
60
103
  requests:
61
104
  memory: "<%= spec.container.memory %>"
62
105
  cpu: "<%= spec.container.cpu %>"
63
- <%- unless spec.init_containers.nil? -%>
106
+ <%- unless script.gpus_per_node.nil? -%>
107
+ <%= gpu_type %>: <%= script.gpus_per_node %>
108
+ <%- end -%>
109
+ securityContext:
110
+ allowPrivilegeEscalation: false
111
+ capabilities:
112
+ drop:
113
+ - all
114
+ privileged: false
115
+ <%- unless spec.init_containers.empty? -%>
64
116
  initContainers:
65
117
  <%- spec.init_containers.each do |ctr| -%>
66
118
  - name: "<%= ctr.name %>"
67
119
  image: "<%= ctr.image %>"
120
+ imagePullPolicy: <%= ctr.image_pull_policy %>
121
+ env:
122
+ - name: POD_NAME
123
+ valueFrom:
124
+ fieldRef:
125
+ fieldPath: metadata.name
126
+ - name: POD_NAMESPACE
127
+ valueFrom:
128
+ fieldRef:
129
+ fieldPath: metadata.namespace
130
+ <%- ctr.env.each_pair do |name, value| -%>
131
+ - name: <%= name %>
132
+ value: "<%= value %>"
133
+ <%- end # for each env -%>
68
134
  command:
69
135
  <%- ctr.command.each do |cmd| -%>
70
136
  - "<%= cmd %>"
71
137
  <%- end # command loop -%>
138
+ <%- if !all_mounts.empty? || (!configmap.nil? && configmap.init_mounts?) -%>
72
139
  volumeMounts:
73
140
  <%- unless configmap.nil? -%>
141
+ <%- configmap.files.each do |file| -%>
142
+ <%- next if file.init_mount_path.nil? -%>
74
143
  - name: configmap-volume
75
- mountPath: <%= configmap_mount_path %>
76
- <%- end -%>
144
+ mountPath: <%= file.init_mount_path %>
145
+ <%- unless file.init_sub_path.nil? -%>
146
+ subPath: <%= file.init_sub_path %>
147
+ <%- end # end unless file.sub_path.nil? -%>
148
+ <%- end # end configmap.files.each -%>
149
+ <%- end # unless configmap.nil? -%>
77
150
  <%- all_mounts.each do |mount| -%>
78
151
  - name: <%= mount[:name] %>
79
152
  mountPath: <%= mount[:destination_path] %>
80
153
  <%- end # for each mount -%>
154
+ <%- end # if config_map init mounts and all_mounts not empty -%>
155
+ securityContext:
156
+ allowPrivilegeEscalation: false
157
+ capabilities:
158
+ drop:
159
+ - all
160
+ privileged: false
81
161
  <%- end # init container loop -%>
82
162
  <%- end # if init containers -%>
83
163
  <%- unless (configmap.to_s.empty? && all_mounts.empty?) -%>
@@ -101,6 +181,12 @@ spec:
101
181
  <%- end # if mount is [host,nfs] -%>
102
182
  <%- end # for each mount -%>
103
183
  <%- end # (configmap.to_s.empty? || all_mounts.empty?) -%>
184
+ <%- unless node_selector.empty? -%>
185
+ nodeSelector:
186
+ <%- node_selector.each_pair do |key, value| -%>
187
+ <%= key %>: "<%= value %>"
188
+ <%- end # node_selector.each_pair -%>
189
+ <%- end #unless node_selector.empty? -%>
104
190
  ---
105
191
  <%- unless spec.container.port.nil? -%>
106
192
  apiVersion: v1
@@ -110,6 +196,8 @@ metadata:
110
196
  namespace: <%= namespace %>
111
197
  labels:
112
198
  job: <%= id %>
199
+ app.kubernetes.io/name: <%= container.name %>
200
+ app.kubernetes.io/managed-by: open-ondemand
113
201
  spec:
114
202
  selector:
115
203
  job: <%= id %>
@@ -119,8 +207,8 @@ spec:
119
207
  targetPort: <%= spec.container.port %>
120
208
  type: NodePort
121
209
  <%- end # end for service -%>
122
- ---
123
210
  <%- unless configmap.nil? -%>
211
+ ---
124
212
  apiVersion: v1
125
213
  kind: ConfigMap
126
214
  metadata:
@@ -128,7 +216,12 @@ metadata:
128
216
  namespace: <%= namespace %>
129
217
  labels:
130
218
  job: <%= id %>
219
+ app.kubernetes.io/name: <%= container.name %>
220
+ app.kubernetes.io/managed-by: open-ondemand
131
221
  data:
132
- <%= configmap.filename %>: |
133
- <% config_data_lines(configmap.data).each do |line| %><%= line %><% end %>
134
- <%- end # end for configmap -%>
222
+ <%- configmap.files.each do |file| -%>
223
+ <%- next if file.data.nil? || file.filename.nil? -%>
224
+ <%= file.filename %>: |
225
+ <% config_data_lines(file.data).each do |line| %><%= line %><% end %>
226
+ <%- end # end for configmap files -%>
227
+ <%- end # end configmap.nil? %>
@@ -16,7 +16,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
16
16
  # from
17
17
  class Error < StandardError; end
18
18
 
19
- UNIT_SEPARATOR = "\x1F"
19
+ UNIT_SEPARATOR = ","
20
20
 
21
21
  # @param debug Whether the adapter should be used in debug mode
22
22
  # @param site_timeout [#to_i] A period after which the job should be killed or nil
@@ -80,12 +80,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
80
80
 
81
81
  call(*cmd, stdin: kill_cmd)
82
82
  rescue Error => e
83
- raise e unless (
84
- # The tmux server not running is not an error
85
- e.message.include?('failed to connect to server') ||
86
- # The session not being found is not an error
87
- e.message.include?("session not found: #{session_name_label}")
88
- )
83
+ interpret_and_raise(e)
89
84
  end
90
85
 
91
86
  def list_remote_sessions(host: nil)
@@ -264,8 +259,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
264
259
  |session_hash| session_hash[:session_name].start_with?(session_name_label)
265
260
  }
266
261
  rescue Error => e
267
- # The tmux server not running is not an error
268
- raise e unless e.message.include?('failed to connect to server')
262
+ interpret_and_raise(e)
269
263
  []
270
264
  end
271
265
 
@@ -280,4 +274,17 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
280
274
 
281
275
  '/dev/null'
282
276
  end
277
+
278
+ # under some conditions tmux returns status code 1 but it's not an actual
279
+ # error. These are when the session is not found or there are no sessions
280
+ # at all.
281
+ def interpret_and_raise(error)
282
+ if error.message.include?('failed to connect to server') # no sessions in tmux 1.8
283
+ nil
284
+ elsif error.message.include?('no server running on') # no sessions in tmux 2.7+ message
285
+ nil
286
+ else
287
+ raise error
288
+ end
289
+ end
283
290
  end
@@ -423,6 +423,7 @@ module OodCore
423
423
  args.concat ["-t", seconds_to_duration(script.wall_time)] unless script.wall_time.nil?
424
424
  args.concat ['-a', script.job_array_request] unless script.job_array_request.nil?
425
425
  args.concat ['--qos', script.qos] unless script.qos.nil?
426
+ args.concat ['--gpus-per-node', script.gpus_per_node] unless script.gpus_per_node.nil?
426
427
  # ignore nodes, don't know how to do this for slurm
427
428
 
428
429
  # Set dependencies
@@ -159,6 +159,8 @@ module OodCore
159
159
  args.concat ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
160
160
  args.concat ['-t', script.job_array_request] unless script.job_array_request.nil?
161
161
  args.concat ['-l', "qos=#{script.qos}"] unless script.qos.nil?
162
+ args.concat ['-l', "gpus=#{script.gpus_per_node}"] unless script.gpus_per_node.nil?
163
+
162
164
  # Set environment variables
163
165
  env = script.job_environment.to_h
164
166
  args.concat ["-v", env.keys.join(",")] unless env.empty?
@@ -103,6 +103,10 @@ module OodCore
103
103
  # @return [String, nil] qos
104
104
  attr_reader :qos
105
105
 
106
+ # The GPUs per node for the job
107
+ # @return [Integer, nil] gpus per node
108
+ attr_reader :gpus_per_node
109
+
106
110
  # Object detailing any native specifications that are implementation specific
107
111
  # @note Should be avoided at all costs.
108
112
  # @return [Object, nil] native specifications
@@ -136,6 +140,7 @@ module OodCore
136
140
  # @param accounting_id [#to_s, nil] accounting id
137
141
  # @param job_array_request [#to_s, nil] job array request
138
142
  # @param qos [#to_s, nil] qos
143
+ # @param gpus_per_node [#to_i, nil] gpus per node
139
144
  # @param native [Object, nil] native specifications
140
145
  # @param copy_environment [Boolean, nil] copy the environment
141
146
  def initialize(content:, args: nil, submit_as_hold: nil, rerunnable: nil,
@@ -145,7 +150,7 @@ module OodCore
145
150
  output_path: nil, error_path: nil, reservation_id: nil,
146
151
  queue_name: nil, priority: nil, start_time: nil,
147
152
  wall_time: nil, accounting_id: nil, job_array_request: nil,
148
- qos: nil, native: nil, copy_environment: nil, **_)
153
+ qos: nil, gpus_per_node: nil, native: nil, copy_environment: nil, **_)
149
154
  @content = content.to_s
150
155
 
151
156
  @submit_as_hold = submit_as_hold
@@ -170,6 +175,7 @@ module OodCore
170
175
  @accounting_id = accounting_id && accounting_id.to_s
171
176
  @job_array_request = job_array_request && job_array_request.to_s
172
177
  @qos = qos && qos.to_s
178
+ @gpus_per_node = gpus_per_node && gpus_per_node.to_i
173
179
  @native = native
174
180
  @copy_environment = (copy_environment.nil?) ? nil : !! copy_environment
175
181
  end
@@ -200,6 +206,7 @@ module OodCore
200
206
  accounting_id: accounting_id,
201
207
  job_array_request: job_array_request,
202
208
  qos: qos,
209
+ gpus_per_node: gpus_per_node,
203
210
  native: native,
204
211
  copy_environment: copy_environment
205
212
  }
@@ -1,4 +1,4 @@
1
1
  module OodCore
2
2
  # The current version of {OodCore}
3
- VERSION = "0.15.0"
3
+ VERSION = "0.17.1"
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ood_core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.0
4
+ version: 0.17.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
10
10
  autorequire:
11
11
  bindir: exe
12
12
  cert_chain: []
13
- date: 2021-01-26 00:00:00.000000000 Z
13
+ date: 2021-06-14 00:00:00.000000000 Z
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: ood_support