ood_core 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 3296708d7bc47f3379a9e4a6c845d3f25c5ccefb599f4b92406d9dffdaef220b
-   data.tar.gz: b6af9e90b67bc9a7a52203808d849d8800336b30b09bdb8ed204526d01bc92e9
+   metadata.gz: 52ba764b085dedb7eaeb06d95751f1804a50488e1859f980a7836d2d9032b95d
+   data.tar.gz: c2dc5edf395fe158960f33b80c554f3dc745f15e7ec1337b738683a0e1bbdc7f
  SHA512:
-   metadata.gz: 623ac6e6f8081d68a3e925d1150c9f20a0f613ccfb6837519d1b95d04533a72caa403c54327aad85dcea9c0694cc23941f40307d942623c095f53fed7fc32026
-   data.tar.gz: 0d785a9ade36b2f6f62f9ae55672091346aa4fb76bf358e6c00d4bc007623b8d1798813474665fc7b4d850d89e041fae5c2fefc9719fbe9f53a161a76127eaad
+   metadata.gz: 59915bae23a008a923c249d222e50548a7bee3438144068a29ae1cafdd489ca1229ee1a14f4f81e3fd065381f46f920bef24344fe633c7c578cb1f6a4f9a2a77
+   data.tar.gz: 8d2ca42c7f49158c8d321c21b79aff1c636df3c77bb7e71107db70371a34058d79c8a5ec32ca93883e7d3bcc7dc2202375144d23613f167ab089318d6270248c
CHANGELOG.md CHANGED
@@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
  and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

  ## [Unreleased]
+ ## [0.14.0] - 2020-10-01
+ ### Added
+ - Kubernetes adapter in PR [156](https://github.com/OSC/ood_core/pull/156)
+
+ ### Fixed
+ - Catch Slurm socket timeouts. [209](https://github.com/OSC/ood_core/pull/209)
+ - LinuxHost adapter (LHA) race condition in deleting tmp files. [212](https://github.com/OSC/ood_core/pull/212)
+
  ## [0.13.0] - 2020-08-10
  ### Added
  - CloudyCluster CCQ Adapter
@@ -247,7 +255,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  ### Added
  - Initial release!

- [Unreleased]: https://github.com/OSC/ood_core/compare/v0.13.0...HEAD
+ [Unreleased]: https://github.com/OSC/ood_core/compare/v0.14.0...HEAD
+ [0.14.0]: https://github.com/OSC/ood_core/compare/v0.13.0...v0.14.0
  [0.13.0]: https://github.com/OSC/ood_core/compare/v0.12.0...v0.13.0
  [0.12.0]: https://github.com/OSC/ood_core/compare/v0.11.4...v0.12.0
  [0.11.4]: https://github.com/OSC/ood_core/compare/v0.11.3...v0.11.4
data/README.md CHANGED
@@ -6,7 +6,7 @@

  - Website: http://openondemand.org/
  - Website repo with JOSS publication: https://github.com/OSC/Open-OnDemand
- - Documentation: https://osc.github.io/ood-documentation/master/
+ - Documentation: https://osc.github.io/ood-documentation/latest/
  - Main code repo: https://github.com/OSC/ondemand
  - Core library repo: https://github.com/OSC/ood_core

lib/ood_core/job/adapters/kubernetes.rb ADDED
@@ -0,0 +1,193 @@
+ require "ood_core/refinements/hash_extensions"
+ require "ood_core/refinements/array_extensions"
+
+ module OodCore
+   module Job
+     class Factory
+       using Refinements::HashExtensions
+
+       def self.build_kubernetes(config)
+         batch = Adapters::Kubernetes::Batch.new(config.to_h.symbolize_keys, Adapters::Kubernetes::Helper.new)
+         Adapters::Kubernetes.new(batch)
+       end
+     end
+
+     module Adapters
+       class Kubernetes < Adapter
+
+         using Refinements::ArrayExtensions
+         using Refinements::HashExtensions
+
+         require "ood_core/job/adapters/kubernetes/batch"
+
+         attr_reader :batch
+
+         def initialize(batch)
+           @batch = batch
+         end
+
+         # Submit a job with the attributes defined in the job template instance
+         # @abstract Subclass is expected to implement {#submit}
+         # @raise [NotImplementedError] if subclass did not define {#submit}
+         # @example Submit job template to cluster
+         #   solver_id = job_adapter.submit(solver_script)
+         #   #=> "1234.server"
+         # @example Submit job that depends on previous job
+         #   post_id = job_adapter.submit(
+         #     post_script,
+         #     afterok: solver_id
+         #   )
+         #   #=> "1235.server"
+         # @param script [Script] script object that describes the
+         #   script and attributes for the submitted job
+         # @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
+         #   at any point after dependent jobs have started execution
+         # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution only after dependent jobs have terminated with no errors
+         # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution only after dependent jobs have terminated with errors
+         # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+         #   execution after dependent jobs have terminated
+         # @return [String] the job id returned after successfully submitting a job
+         def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+           raise ArgumentError, 'Must specify the script' if script.nil?
+
+           batch.submit(script)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+
+         # Retrieve info for all jobs from the resource manager
+         # @abstract Subclass is expected to implement {#info_all}
+         # @raise [NotImplementedError] if subclass did not define {#info_all}
+         # @param attrs [Array<Symbol>] defaults to nil (and all attrs are provided)
+         #   This array specifies only attrs you want, in addition to id and status.
+         #   If an array, the Info object that is returned to you is not guaranteed
+         #   to have a value for any attr besides the ones specified and id and status.
+         #
+         #   For certain adapters this may speed up the response since
+         #   adapters can get by without populating the entire Info object
+         # @return [Array<Info>] information describing submitted jobs
+         def info_all(attrs: nil)
+           batch.info_all(attrs: attrs)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve info for all jobs for a given owner or owners from the
+         # resource manager
+         # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+         # @param attrs [Array<Symbol>] defaults to nil (and all attrs are provided)
+         #   This array specifies only attrs you want, in addition to id and status.
+         #   If an array, the Info object that is returned to you is not guaranteed
+         #   to have a value for any attr besides the ones specified and id and status.
+         #
+         #   For certain adapters this may speed up the response since
+         #   adapters can get by without populating the entire Info object
+         # @return [Array<Info>] information describing submitted jobs
+         def info_where_owner(owner, attrs: nil)
+           owner = Array.wrap(owner).map(&:to_s)
+
+           # must at least have job_owner to filter by job_owner
+           attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?
+
+           info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
+         end
+
+         # Iterate over each job Info object
+         # @param attrs [Array<Symbol>] defaults to nil (and all attrs are provided)
+         #   This array specifies only attrs you want, in addition to id and status.
+         #   If an array, the Info object that is returned to you is not guaranteed
+         #   to have a value for any attr besides the ones specified and id and status.
+         #
+         #   For certain adapters this may speed up the response since
+         #   adapters can get by without populating the entire Info object
+         # @yield [Info] of each job to block
+         # @return [Enumerator] if no block given
+         def info_all_each(attrs: nil)
+           return to_enum(:info_all_each, attrs: attrs) unless block_given?
+
+           info_all(attrs: attrs).each do |job|
+             yield job
+           end
+         end
+
+         # Iterate over each job Info object
+         # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+         # @param attrs [Array<Symbol>] defaults to nil (and all attrs are provided)
+         #   This array specifies only attrs you want, in addition to id and status.
+         #   If an array, the Info object that is returned to you is not guaranteed
+         #   to have a value for any attr besides the ones specified and id and status.
+         #
+         #   For certain adapters this may speed up the response since
+         #   adapters can get by without populating the entire Info object
+         # @yield [Info] of each job to block
+         # @return [Enumerator] if no block given
+         def info_where_owner_each(owner, attrs: nil)
+           return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
+
+           info_where_owner(owner, attrs: attrs).each do |job|
+             yield job
+           end
+         end
+
+         # Whether the adapter supports job arrays
+         # @return [Boolean] false; this adapter does not support job arrays,
+         #   so the usual default of true is explicitly overridden here
+         def supports_job_arrays?
+           false
+         end
+
+         # Retrieve job info from the resource manager
+         # @abstract Subclass is expected to implement {#info}
+         # @raise [NotImplementedError] if subclass did not define {#info}
+         # @param id [#to_s] the id of the job
+         # @return [Info] information describing submitted job
+         def info(id)
+           batch.info(id.to_s)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+
+         # Retrieve job status from resource manager
+         # @note Optimized slightly over retrieving complete job information from server
+         # @abstract Subclass is expected to implement {#status}
+         # @raise [NotImplementedError] if subclass did not define {#status}
+         # @param id [#to_s] the id of the job
+         # @return [Status] status of job
+         def status(id)
+           info(id).status
+         end
+
+         # Put the submitted job on hold
+         # @abstract Subclass is expected to implement {#hold}
+         # @raise [NotImplementedError] if subclass did not define {#hold}
+         # @param id [#to_s] the id of the job
+         # @return [void]
+         def hold(id)
+           raise NotImplementedError, 'subclass did not define #hold'
+         end
+
+         # Release the job that is on hold
+         # @abstract Subclass is expected to implement {#release}
+         # @raise [NotImplementedError] if subclass did not define {#release}
+         # @param id [#to_s] the id of the job
+         # @return [void]
+         def release(id)
+           raise NotImplementedError, 'subclass did not define #release'
+         end
+
+         # Delete the submitted job.
+         #
+         # @param id [#to_s] the id of the job
+         # @return [void]
+         def delete(id)
+           batch.delete(id.to_s)
+         rescue Batch::Error => e
+           raise JobAdapterError, e.message
+         end
+       end
+     end
+   end
+ end
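For orientation, a minimal usage sketch of the factory hook above. The config keys mirror the options consumed by `Batch#initialize` in the next file; the specific values here are illustrative, not shipped defaults beyond those shown there:

```ruby
require "ood_core"

# Illustrative config; keys mirror Batch#initialize's options in batch.rb.
config = {
  config_file: "#{Dir.home}/.kube/config",
  bin: "/usr/bin/kubectl",
  cluster_name: "open-ondemand",
  all_namespaces: false
}

adapter = OodCore::Job::Factory.build_kubernetes(config)
adapter.info_all.each { |info| puts "#{info.id}: #{info.status}" }
```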
lib/ood_core/job/adapters/kubernetes/batch.rb ADDED
@@ -0,0 +1,353 @@
+ require "ood_core/refinements/hash_extensions"
+ require "erb"
+ require "etc"
+ require "json"
+ require "open3"
+
+ class OodCore::Job::Adapters::Kubernetes::Batch
+
+   require "ood_core/job/adapters/kubernetes/helper"
+
+   Helper = OodCore::Job::Adapters::Kubernetes::Helper
+   Resources = OodCore::Job::Adapters::Kubernetes::Resources
+
+   using OodCore::Refinements::HashExtensions
+
+   class Error < StandardError; end
+
+   attr_reader :config_file, :bin, :cluster_name, :mounts
+   attr_reader :all_namespaces, :using_context, :helper
+   attr_reader :username_prefix
+
+   def initialize(options = {}, helper = Helper.new)
+     options = options.to_h.symbolize_keys
+
+     @config_file = options.fetch(:config_file, default_config_file)
+     @bin = options.fetch(:bin, '/usr/bin/kubectl')
+     @cluster_name = options.fetch(:cluster_name, 'open-ondemand')
+     @mounts = options.fetch(:mounts, []).map { |m| m.to_h.symbolize_keys }
+     @all_namespaces = options.fetch(:all_namespaces, false)
+     @username_prefix = options.fetch(:username_prefix, nil)
+
+     @using_context = false
+     @helper = helper
+
+     begin
+       make_kubectl_config(options)
+     rescue
+       # FIXME: could use a log here
+       # means you couldn't 'kubectl set config'
+     end
+   end
+
+   def resource_file(resource_type = 'pod')
+     File.dirname(__FILE__) + "/templates/#{resource_type}.yml.erb"
+   end
+
+   def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+     raise ArgumentError, 'Must specify the script' if script.nil?
+
+     resource_yml, id = generate_id_yml(script.native)
+     call("#{formatted_ns_cmd} create -f -", stdin: resource_yml)
+
+     id
+   end
+
+   def generate_id(name)
+     # 2_821_109_907_456 = 36**8
+     name.downcase.tr(' ', '-') + '-' + rand(2_821_109_907_456).to_s(36)
+   end
+
+   def info_all(attrs: nil)
+     cmd = if all_namespaces
+             "#{base_cmd} get pods -o json --all-namespaces"
+           else
+             "#{namespaced_cmd} get pods -o json"
+           end
+
+     output = call(cmd)
+     all_pods_to_info(output)
+   end
+
+   def info_where_owner(owner, attrs: nil)
+     owner = Array.wrap(owner).map(&:to_s)
+
+     # must at least have job_owner to filter by job_owner
+     attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?
+
+     info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
+   end
+
+   def info_all_each(attrs: nil)
+     return to_enum(:info_all_each, attrs: attrs) unless block_given?
+
+     info_all(attrs: attrs).each do |job|
+       yield job
+     end
+   end
+
+   def info_where_owner_each(owner, attrs: nil)
+     return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
+
+     info_where_owner(owner, attrs: attrs).each do |job|
+       yield job
+     end
+   end
+
+   def info(id)
+     pod_json = call_json_output('get', 'pod', id)
+
+     begin
+       service_json = call_json_output('get', 'service', service_name(id))
+       secret_json = call_json_output('get', 'secret', secret_name(id))
+     rescue
+       # it's ok if these don't exist
+       service_json ||= nil
+       secret_json ||= nil
+     end
+
+     helper.info_from_json(pod_json: pod_json, service_json: service_json, secret_json: secret_json)
+   end
+
+   def status(id)
+     info(id).status
+   end
+
+   def delete(id)
+     call("#{namespaced_cmd} delete pod #{id}")
+
+     begin
+       call("#{namespaced_cmd} delete service #{service_name(id)}")
+       call("#{namespaced_cmd} delete secret #{secret_name(id)}")
+       call("#{namespaced_cmd} delete configmap #{configmap_name(id)}")
+     rescue
+       # FIXME: retries? delete if exists?
+       # just eat the results of deleting services and secrets
+     end
+   end
+
+   def configmap_mount_path
+     '/ood'
+   end
+
+   private
+
+   # helper to help format multi-line yaml data from the submit.yml into
+   # multi-line yaml in the pod.yml.erb
+   def config_data_lines(data)
+     output = []
+     first = true
+
+     data.to_s.each_line do |line|
+       output.append(first ? line : line.prepend("    "))
+       first = false
+     end
+
+     output
+   end
+
+   def username
+     @username ||= Etc.getlogin
+   end
+
+   def k8s_username
+     username_prefix.nil? ? username : "#{username_prefix}-#{username}"
+   end
+
+   def run_as_user
+     Etc.getpwnam(username).uid
+   end
+
+   def run_as_group
+     Etc.getpwnam(username).gid
+   end
+
+   def fs_group
+     run_as_group
+   end
+
+   # helper to template resource yml you're going to submit and
+   # create an id.
+   def generate_id_yml(native_data)
+     container = helper.container_from_native(native_data[:container])
+     id = generate_id(container.name)
+     configmap = helper.configmap_from_native(native_data, id)
+     init_containers = helper.init_ctrs_from_native(native_data[:init_containers])
+     spec = Resources::PodSpec.new(container, init_containers: init_containers)
+     all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]
+
+     template = ERB.new(File.read(resource_file))
+
+     [template.result(binding), id]
+   end
+
+   # helper to call kubectl and get json data back.
+   # verb, resource and id are the kubernetes parlance terms.
+   # example: 'kubectl get pod my-pod-id' is verb=get, resource=pod
+   # and id=my-pod-id
+   def call_json_output(verb, resource, id, stdin: nil)
+     cmd = "#{formatted_ns_cmd} #{verb} #{resource} #{id}"
+     data = call(cmd, stdin: stdin)
+     data = data.empty? ? '{}' : data
+     json_data = JSON.parse(data, symbolize_names: true)
+
+     json_data
+   end
+
+   def service_name(id)
+     helper.service_name(id)
+   end
+
+   def secret_name(id)
+     helper.secret_name(id)
+   end
+
+   def configmap_name(id)
+     helper.configmap_name(id)
+   end
+
+   def namespace
+     default_namespace
+   end
+
+   def default_namespace
+     username
+   end
+
+   def context
+     cluster_name
+   end
+
+   def default_config_file
+     (ENV['KUBECONFIG'] || "#{Dir.home}/.kube/config")
+   end
+
+   def default_auth
+     {
+       type: 'managed'
+     }.symbolize_keys
+   end
+
+   def default_server
+     {
+       endpoint: 'https://localhost:8080',
+       cert_authority_file: nil
+     }.symbolize_keys
+   end
+
+   def formatted_ns_cmd
+     "#{namespaced_cmd} -o json"
+   end
+
+   def namespaced_cmd
+     "#{base_cmd} --namespace=#{namespace}"
+   end
+
+   def base_cmd
+     base = "#{bin} --kubeconfig=#{config_file}"
+     base << " --context=#{context}" if using_context
+     base
+   end
+
+   def all_pods_to_info(data)
+     json_data = JSON.parse(data, symbolize_names: true)
+     pods = json_data.dig(:items)
+
+     info_array = []
+     pods.each do |pod|
+       info = pod_info_from_json(pod)
+       info_array.push(info) unless info.nil?
+     end
+
+     info_array
+   rescue JSON::ParserError
+     # 'no resources in <namespace>' throws parse error
+     []
+   end
+
+   def pod_info_from_json(pod)
+     hash = helper.pod_info_from_json(pod)
+     OodCore::Job::Info.new(hash)
+   rescue Helper::K8sDataError
+     # FIXME: silently eating error, could probably use a logger
+     nil
+   end
+
+   def make_kubectl_config(config)
+     set_cluster(config.fetch(:server, default_server).to_h.symbolize_keys)
+     configure_auth(config.fetch(:auth, default_auth).to_h.symbolize_keys)
+   end
+
+   def configure_auth(auth)
+     type = auth.fetch(:type)
+     return if managed?(type)
+
+     case type
+     when 'gke'
+       set_gke_config(auth)
+     when 'oidc'
+       set_context
+     end
+   end
+
+   def use_context
+     @using_context = true
+   end
+
+   def managed?(type)
+     if type.nil?
+       true # maybe should be false?
+     else
+       type.to_s == 'managed'
+     end
+   end
+
+   def set_gke_config(auth)
+     cred_file = auth.fetch(:svc_acct_file)
+
+     cmd = "gcloud auth activate-service-account --key-file=#{cred_file}"
+     call(cmd)
+
+     set_gke_credentials(auth)
+   end
+
+   def set_gke_credentials(auth)
+
+     zone = auth.fetch(:zone, nil)
+     region = auth.fetch(:region, nil)
+
+     locale = ''
+     locale = "--zone=#{zone}" unless zone.nil?
+     locale = "--region=#{region}" unless region.nil?
+
+     # gke cluster name can differ from what ood calls the cluster
+     cmd = "gcloud container clusters get-credentials #{locale} #{cluster_name}"
+     env = { 'KUBECONFIG' => config_file }
+     call(cmd, env: env)
+   end
+
+   def set_context
+     cmd = "#{base_cmd} config set-context #{cluster_name}"
+     cmd << " --cluster=#{cluster_name} --namespace=#{namespace}"
+     cmd << " --user=#{k8s_username}"
+
+     call(cmd)
+     use_context
+   end
+
+   def set_cluster(config)
+     server = config.fetch(:endpoint)
+     cert = config.fetch(:cert_authority_file, nil)
+
+     cmd = "#{base_cmd} config set-cluster #{cluster_name}"
+     cmd << " --server=#{server}"
+     cmd << " --certificate-authority=#{cert}" unless cert.nil?
+
+     call(cmd)
+   end
+
+   def call(cmd = '', env: {}, stdin: nil)
+     o, error, s = Open3.capture3(env, cmd, stdin_data: stdin.to_s)
+     s.success? ? o : raise(Error, error)
+   end
+ end
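A hedged sketch of the `native` payload that `generate_id_yml` consumes, inferred from the keys it reads (`:container`, `:init_containers`, `:configmap`, `:mounts`); the field values here are hypothetical:

```ruby
# Given the adapter built in the previous example:
script = OodCore::Job::Script.new(
  content: "", # unused by this adapter; the job is described entirely in :native
  native: {
    container: {
      name: "jupyter",
      image: "jupyter/minimal-notebook",
      command: "start-notebook.sh", # string commands are Shellwords-split by the helper
      port: 8888,
      env: [{ name: "NB_UID", value: "1000" }]
    },
    init_containers: nil,
    configmap: { filename: "config.py", data: "c.NotebookApp.port = 8888\n" },
    mounts: []
  }
)

adapter.submit(script) #=> e.g. "jupyter-0a1b2c3d"
```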
lib/ood_core/job/adapters/kubernetes/helper.rb ADDED
@@ -0,0 +1,300 @@
+ class OodCore::Job::Adapters::Kubernetes::Helper
+
+   require 'ood_core/job/adapters/kubernetes/resources'
+   require 'resolv'
+   require 'base64'
+   require 'date'
+   require 'shellwords'
+
+   class K8sDataError < StandardError; end
+
+   Resources = OodCore::Job::Adapters::Kubernetes::Resources
+
+   # Extract info from json data. The data is expected to be from the kubectl
+   # command and conform to kubernetes' datatype structures.
+   #
+   # Returns { native: { host: localhost, port: 80, password: sshhh } } in the info
+   # object field in lieu of writing a connection.yml
+   #
+   # @param pod_json [#to_h]
+   #   the pod data returned from 'kubectl get pod abc-123'
+   # @param service_json [#to_h]
+   #   the service data returned from 'kubectl get service abc-123-service'
+   # @param secret_json [#to_h]
+   #   the secret data returned from 'kubectl get secret abc-123-secret'
+   # @return [OodCore::Job::Info]
+   def info_from_json(pod_json: nil, service_json: nil, secret_json: nil)
+     pod_hash = pod_info_from_json(pod_json)
+     service_hash = service_info_from_json(service_json)
+     secret_hash = secret_info_from_json(secret_json)
+
+     # can't just use deep_merge bc we don't depend *directly* on rails
+     pod_hash[:native] = pod_hash[:native].merge(service_hash[:native])
+     pod_hash[:native] = pod_hash[:native].merge(secret_hash[:native])
+     OodCore::Job::Info.new(pod_hash)
+   rescue NoMethodError
+     raise K8sDataError, "unable to read data correctly from json"
+   end
+
+   # Turn a container hash into a Kubernetes::Resources::Container
+   #
+   # @param container [#to_h]
+   #   the input container hash
+   # @return [OodCore::Job::Adapters::Kubernetes::Resources::Container]
+   def container_from_native(container)
+     Resources::Container.new(
+       container[:name],
+       container[:image],
+       command: parse_command(container[:command]),
+       port: container[:port],
+       env: container.fetch(:env, []),
+       memory: container[:memory],
+       cpu: container[:cpu],
+       working_dir: container[:working_dir],
+       restart_policy: container[:restart_policy]
+     )
+   end
+
+   # Parse a command string given from a user and return an array.
+   # If given an array, the input is simply returned back.
+   #
+   # @param cmd [#to_s]
+   #   the command to parse
+   # @return [Array<#to_s>]
+   #   the command parsed into an array of arguments
+   def parse_command(cmd)
+     if cmd.is_a?(Array)
+       cmd
+     else
+       Shellwords.split(cmd.to_s)
+     end
+   end
+
+   # Turn a configmap hash into a Kubernetes::Resources::ConfigMap
+   # that can be used in templates. Needs an id so that the resulting
+   # configmap has a known name.
+   #
+   # @param native [#to_h]
+   #   the input configmap hash
+   # @param id [#to_s]
+   #   the id to use for giving the configmap a name
+   # @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
+   def configmap_from_native(native, id)
+     configmap = native.fetch(:configmap, nil)
+     return nil if configmap.nil?
+
+     Resources::ConfigMap.new(
+       configmap_name(id),
+       configmap[:filename],
+       configmap[:data]
+     )
+   end
+
+   # parse initialization containers from native data
+   #
+   # @param ctrs [Array<#to_h>]
+   #   the array of init container hashes taken from the native data's
+   #   init_containers key
+   # @return [Array<OodCore::Job::Adapters::Kubernetes::Resources::Container>]
+   #   the array of init containers
+   def init_ctrs_from_native(ctrs)
+     init_ctrs = []
+
+     ctrs&.each do |ctr_raw|
+       ctr = container_from_native(ctr_raw)
+       init_ctrs.push(ctr)
+     end
+
+     init_ctrs
+   end
+
+   def service_name(id)
+     id + '-service'
+   end
+
+   def secret_name(id)
+     id + '-secret'
+   end
+
+   def configmap_name(id)
+     id + '-configmap'
+   end
+
+   # Extract pod info from json data. The data is expected to be from the kubectl
+   # command and conform to kubernetes' datatype structures.
+   #
+   # @param json_data [#to_h]
+   #   the pod data returned from 'kubectl get pod abc-123'
+   # @return [#to_h]
+   #   the hash of info expected from adapters
+   def pod_info_from_json(json_data)
+     {
+       id: json_data.dig(:metadata, :name).to_s,
+       job_name: name_from_metadata(json_data.dig(:metadata)),
+       status: pod_status_from_json(json_data),
+       job_owner: json_data.dig(:metadata, :namespace).to_s,
+       submission_time: submission_time(json_data),
+       dispatch_time: dispatch_time(json_data),
+       wallclock_time: wallclock_time(json_data),
+       native: {
+         host: get_host(json_data.dig(:status, :hostIP))
+       },
+       procs: procs_from_json(json_data)
+     }
+   rescue NoMethodError
+     # gotta raise an error because Info.new will throw an error if id is undefined
+     raise K8sDataError, "unable to read data correctly from json"
+   end
+
+   private
+
+   def get_host(ip)
+     Resolv.getname(ip)
+   rescue Resolv::ResolvError
+     ip
+   end
+
+   def name_from_metadata(metadata)
+     name = metadata.dig(:labels, :'app.kubernetes.io/name')
+     name = metadata.dig(:labels, :'k8s-app') if name.nil?
+     name = metadata.dig(:name) if name.nil? # pod-id but better than nil?
+     name
+   end
+
+   def service_info_from_json(json_data)
+     # all we need is the port - .spec.ports[0].nodePort
+     ports = json_data.dig(:spec, :ports)
+     {
+       native:
+         {
+           port: ports[0].dig(:nodePort)
+         }
+     }
+   rescue
+     empty_native
+   end
+
+   def secret_info_from_json(json_data)
+     raw = json_data.dig(:data, :password)
+     {
+       native:
+         {
+           password: Base64.decode64(raw)
+         }
+     }
+   rescue
+     empty_native
+   end
+
+   def empty_native
+     {
+       native: {}
+     }
+   end
+
+   def dispatch_time(json_data)
+     status = pod_status_from_json(json_data)
+     return nil if status == 'undetermined'
+
+     state_data = json_data.dig(:status, :containerStatuses)[0].dig(:state)
+     date_string = nil
+
+     if status == 'completed'
+       date_string = state_data.dig(:terminated, :startedAt)
+     elsif status == 'running'
+       date_string = state_data.dig(:running, :startedAt)
+     end
+
+     date_string.nil? ? nil : DateTime.parse(date_string).to_time.to_i
+   end
+
+   def wallclock_time(json_data)
+     status = pod_status_from_json(json_data)
+     return nil if status == 'undetermined'
+
+     state_data = json_data.dig(:status, :containerStatuses)[0].dig(:state)
+     start_time = dispatch_time(json_data)
+     return nil if start_time.nil?
+
+     et = end_time(status, state_data)
+
+     et.nil? ? nil : et - start_time
+   end
+
+   def end_time(status, state_data)
+     if status == 'completed'
+       end_time_string = state_data.dig(:terminated, :finishedAt)
+       et = DateTime.parse(end_time_string).to_time.to_i
+     elsif status == 'running'
+       et = DateTime.now.to_time.to_i
+     else
+       et = nil
+     end
+
+     et
+   end
+
+   def submission_time(json_data)
+     status = json_data.dig(:status)
+     start = status.dig(:startTime)
+
+     if start.nil?
+       # the pod is in some pending state limbo
+       conditions = status.dig(:conditions)
+       # best guess to start time is just the first condition's
+       # transition time
+       str = conditions[0].dig(:lastTransitionTime)
+     else
+       str = start
+     end
+
+     DateTime.parse(str).to_time.to_i
+   end
+
+   def pod_status_from_json(json_data)
+     state = 'undetermined'
+     status = json_data.dig(:status)
+     container_statuses = status.dig(:containerStatuses)
+
+     if container_statuses.nil?
+       # if you're here, it means you're pending, probably unschedulable
+       return OodCore::Job::Status.new(state: state)
+     end
+
+     # only support 1 container/pod
+     json_state = container_statuses[0].dig(:state)
+     state = 'running' unless json_state.dig(:running).nil?
+     state = terminated_state(json_state) unless json_state.dig(:terminated).nil?
+     state = 'queued' unless json_state.dig(:waiting).nil?
+
+     OodCore::Job::Status.new(state: state)
+   end
+
+   def terminated_state(status)
+     reason = status.dig(:terminated, :reason)
+     if reason == 'Error'
+       'suspended'
+     else
+       'completed'
+     end
+   end
+
+   def procs_from_json(json_data)
+     containers = json_data.dig(:spec, :containers)
+     resources = containers[0].dig(:resources)
+
+     cpu = resources.dig(:limits, :cpu)
+     millicores_rex = /(\d+)m/
+
+     # ok to return string bc nil.to_i == 0 and we'd rather return
+     # nil (undefined) than 0 which is confusing.
+     if millicores_rex.match?(cpu)
+       millicores = millicores_rex.match(cpu)[1].to_i
+
+       # round up so fractional CPUs report as at least 1 (200m -> 1, 1500m -> 2)
+       (millicores.to_f / 1000).ceil.to_s
+     else
+       cpu
+     end
+   end
+ end
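Expected behavior of the parsing and naming helpers above, as a brief illustration (values hypothetical):

```ruby
require "ood_core"

helper = OodCore::Job::Adapters::Kubernetes::Helper.new

helper.parse_command("ruby -e 'puts 42'") #=> ["ruby", "-e", "puts 42"]
helper.parse_command(%w[already split])   #=> ["already", "split"]

# The naming helpers derive companion resource names from the pod id:
helper.service_name("my-app-0a1b2c3d")    #=> "my-app-0a1b2c3d-service"
helper.secret_name("my-app-0a1b2c3d")     #=> "my-app-0a1b2c3d-secret"
helper.configmap_name("my-app-0a1b2c3d")  #=> "my-app-0a1b2c3d-configmap"
```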
lib/ood_core/job/adapters/kubernetes/resources.rb ADDED
@@ -0,0 +1,56 @@
+ module OodCore::Job::Adapters::Kubernetes::Resources
+
+   class ConfigMap
+     attr_accessor :name, :filename, :data
+
+     def initialize(name, filename, data)
+       @name = name
+       @filename = filename
+       @data = data
+     end
+   end
+
+   class Container
+     attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
+                   :restart_policy
+
+     def initialize(
+       name, image, command: [], port: nil, env: [], memory: "4Gi", cpu: "1",
+       working_dir: "", restart_policy: "Never"
+     )
+       raise ArgumentError, "containers need valid names and images" unless name && image
+
+       @name = name
+       @image = image
+       @command = command.nil? ? [] : command
+       @port = port&.to_i
+       @env = env.nil? ? [] : env
+       @memory = memory.nil? ? "4Gi" : memory
+       @cpu = cpu.nil? ? "1" : cpu
+       @working_dir = working_dir.nil? ? "" : working_dir
+       @restart_policy = restart_policy.nil? ? "Never" : restart_policy
+     end
+
+     def ==(other)
+       name == other.name &&
+         image == other.image &&
+         command == other.command &&
+         port == other.port &&
+         env == other.env &&
+         memory == other.memory &&
+         cpu == other.cpu &&
+         working_dir == other.working_dir &&
+         restart_policy == other.restart_policy
+     end
+
+   end
+
+   class PodSpec
+     attr_accessor :container, :init_containers
+     def initialize(container, init_containers: nil)
+       @container = container
+       @init_containers = init_containers
+     end
+   end
+
+ end
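A short usage sketch of these value objects (arguments hypothetical). `Container.new` raises unless both a name and an image are given, and explicit nil keyword arguments fall back to the defaults above:

```ruby
require "ood_core"

Res = OodCore::Job::Adapters::Kubernetes::Resources

ctr = Res::Container.new(
  "solver", "ruby:2.7",
  command: ["ruby", "solve.rb"], port: 8080, cpu: nil
)
ctr.cpu #=> "1" (nil fell back to the default)

spec = Res::PodSpec.new(ctr)
spec.init_containers #=> nil, so the pod template below skips initContainers
```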
lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb ADDED
@@ -0,0 +1,123 @@
+ apiVersion: v1
+ kind: Pod
+ metadata:
+   namespace: <%= namespace %>
+   name: <%= id %>
+   labels:
+     job: <%= id %>
+     app.kubernetes.io/name: <%= container.name %>
+     app.kubernetes.io/managed-by: open-ondemand
+ spec:
+   restartPolicy: <%= spec.container.restart_policy %>
+   securityContext:
+     runAsUser: <%= run_as_user %>
+     runAsGroup: <%= run_as_group %>
+     fsGroup: <%= fs_group %>
+   containers:
+   - name: "<%= spec.container.name %>"
+     image: <%= spec.container.image %>
+     imagePullPolicy: IfNotPresent
+     <% unless spec.container.working_dir.empty? %>
+     workingDir: "<%= spec.container.working_dir %>"
+     <% end %>
+     <% unless spec.container.env.empty? %>
+     env:
+     <% spec.container.env.each do |env| %>
+     - name: <%= env[:name] %>
+       value: "<%= env[:value] %>"
+     <% end %> <%# for each env %>
+     <% end %> <%# unless env is nil %>
+     <% unless spec.container.command.empty? %>
+     command:
+     <% spec.container.command.each do |cmd| %>
+     - "<%= cmd %>"
+     <% end %> <%# for each command %>
+     <% end %> <%# unless command is nil %>
+     <% unless spec.container.port.nil? %>
+     ports:
+     - containerPort: <%= spec.container.port %>
+     <% end %>
+     volumeMounts:
+     <% unless configmap.nil? %>
+     - name: configmap-volume
+       mountPath: <%= configmap_mount_path %>
+     <% end %>
+     <% all_mounts.each do |mount| %>
+     - name: <%= mount[:name] %>
+       mountPath: <%= mount[:destination_path] %>
+     <% end %> <%# for each mount %>
+     resources:
+       limits:
+         memory: "<%= spec.container.memory %>"
+         cpu: "<%= spec.container.cpu %>"
+       requests:
+         memory: "<%= spec.container.memory %>"
+         cpu: "<%= spec.container.cpu %>"
+   <% unless spec.init_containers.nil? %>
+   initContainers:
+   <% spec.init_containers.each do |ctr| %>
+   - name: "<%= ctr.name %>"
+     image: "<%= ctr.image %>"
+     command:
+     <% ctr.command.each do |cmd| %>
+     - "<%= cmd %>"
+     <% end %> <%# command loop %>
+     volumeMounts:
+     <% unless configmap.nil? %>
+     - name: configmap-volume
+       mountPath: <%= configmap_mount_path %>
+     <% end %>
+     <% all_mounts.each do |mount| %>
+     - name: <%= mount[:name] %>
+       mountPath: <%= mount[:destination_path] %>
+     <% end %> <%# for each mount %>
+   <% end %> <%# init container loop %>
+   <% end %> <%# if init containers %>
+   <% unless configmap.nil? && all_mounts.empty? %>
+   volumes:
+   <% end %> <%# configmap.nil? && all_mounts.empty? %>
+   <% unless configmap.nil? %>
+   - name: configmap-volume
+     configMap:
+       name: <%= configmap_name(id) %>
+   <% end %>
+   <% all_mounts.each do |mount| %>
+   <% if mount[:type] == 'nfs' %>
+   - name: <%= mount[:name] %>
+     nfs:
+       server: <%= mount[:host] %>
+       path: <%= mount[:path] %>
+   <% elsif mount[:type] == 'host' %>
+   - name: <%= mount[:name] %>
+     hostPath:
+       path: <%= mount[:path] %>
+       type: <%= mount[:host_type] %>
+   <% end %> <%# if mount is [host,nfs] %>
+   <% end %> <%# for each mount %>
+ ---
+ <% unless spec.container.port.nil? %>
+ apiVersion: v1
+ kind: Service
+ metadata:
+   name: <%= service_name(id) %>
+   namespace: <%= namespace %>
+ spec:
+   selector:
+     job: <%= id %>
+   ports:
+   - protocol: TCP
+     port: 80
+     targetPort: <%= spec.container.port %>
+   type: NodePort
+ <% end %> <%# end for service %>
+ ---
+ <% unless configmap.nil? %>
+ apiVersion: v1
+ kind: ConfigMap
+ metadata:
+   name: <%= configmap_name(id) %>
+   namespace: <%= namespace %>
+ data:
+   <%= configmap.filename %>: |
+     <% config_data_lines(configmap.data).each do |line| %><%= line %><% end %>
+ <% end %> <%# end for configmap %>
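The template above is rendered by `Batch#generate_id_yml` (batch.rb) with plain ERB against the Batch instance's binding, so bare names such as `namespace`, `run_as_user`, `spec`, `configmap`, and `all_mounts` resolve to that object's locals and private helpers. Condensed from that method:

```ruby
# Condensed from Batch#generate_id_yml: these locals are exactly the names
# the template dereferences.
container  = helper.container_from_native(native_data[:container])
id         = generate_id(container.name)
configmap  = helper.configmap_from_native(native_data, id)
init_ctrs  = helper.init_ctrs_from_native(native_data[:init_containers])
spec       = Resources::PodSpec.new(container, init_containers: init_ctrs)
all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]

resource_yml = ERB.new(File.read(resource_file)).result(binding)
# resource_yml is then piped to `kubectl create -f -` by Batch#submit
```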
lib/ood_core/job/adapters/linux_host/launcher.rb CHANGED
@@ -166,7 +166,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
        'email_on_terminated' => script_email_on_event(script, 'terminated'),
        'email_on_start' => script_email_on_event(script, 'started'),
        'environment' => export_env(script),
-       'error_path' => (script.error_path) ? script.error_path.to_s : '/dev/null',
+       'error_path' => error_path(script),
        'job_name' => script.job_name.to_s,
        'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
        'script_content' => content,
@@ -176,6 +176,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
        'singularity_image' => singularity_image(script.native),
        'ssh_hosts' => ssh_hosts,
        'tmux_bin' => tmux_bin,
+       'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
      }.each{
        |key, value| bnd.local_variable_set(key, value)
      }
@@ -272,4 +273,11 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
      return false if script.content.empty?
      script.content.split("\n").first.start_with?('#!/')
    end
+
+   def error_path(script)
+     return script.error_path.to_s if script.error_path
+     return script.output_path.to_s if script.output_path
+
+     '/dev/null'
+   end
  end
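The effect of the new `error_path` fallback, illustrated with hypothetical paths: stderr now follows stdout when only an output path is given, instead of being discarded:

```ruby
require "ood_core"

both     = OodCore::Job::Script.new(content: "", output_path: "/tmp/out.log",
                                    error_path: "/tmp/err.log")
only_out = OodCore::Job::Script.new(content: "", output_path: "/tmp/out.log")

# With the private helper above:
#   error_path(both)      #=> "/tmp/err.log"
#   error_path(only_out)  #=> "/tmp/out.log"  (was "/dev/null" in 0.13.0)
#   and with neither set  #=> "/dev/null"
```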
lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh CHANGED
@@ -16,13 +16,9 @@ fi
  echo $hostname

  # Put the script into a temp file on localhost
- <% if debug %>
- singularity_tmp_file=$(mktemp -p "$HOME" --suffix '_sing')
- tmux_tmp_file=$(mktemp -p "$HOME" --suffix "_tmux")
- <% else %>
- singularity_tmp_file=$(mktemp)
- tmux_tmp_file=$(mktemp)
- <% end %>
+ singularity_tmp_file=$(mktemp -p "<%= workdir %>" --suffix '_sing')
+ tmux_tmp_file=$(mktemp -p "<%= workdir %>" --suffix "_tmux")
+

  # Create an executable to run in a tmux session
  # The escaped HEREDOC means that we need to substitute in $singularity_tmp_file ourselves
@@ -69,10 +65,3 @@ SINGULARITY_LAUNCHER
  chmod +x "$singularity_tmp_file"
  chmod +x "$tmux_tmp_file"
  <%= tmux_bin %> new-session -d -s "<%= session_name %>" "$tmux_tmp_file"
-
- # Remove the file
- <% if ! debug %>
- # Wait 1 second to ensure that tmux session has started before the file is removed
- sleep 1
- rm -f "$tmux_tmp_file"; rm -f "$singularity_tmp_file"
- <% end %>
lib/ood_core/job/adapters/slurm.rb CHANGED
@@ -80,6 +80,9 @@ module OodCore
          # from
          class Error < StandardError; end

+         # An error indicating the slurm command timed out
+         class SlurmTimeoutError < Error; end
+
          # @param cluster [#to_s, nil] the cluster name
          # @param conf [#to_s, nil] path to the slurm conf
          # @param bin [#to_s] path to slurm installation binaries
@@ -147,6 +150,9 @@ module OodCore
              end
              jobs
            end
+         rescue SlurmTimeoutError
+           # TODO: could use a log entry here
+           return [{ id: id, state: 'undetermined' }]
          end

          def squeue_fields(attrs)
@@ -303,7 +309,18 @@ module OodCore

            cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
            o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
-           s.success? ? o : raise(Error, e)
+           s.success? ? interpret_and_raise(o, e) : raise(Error, e)
+         end
+
+         # Helper function to raise an error based on the contents of stderr.
+         # Slurm exits 0 even when the command fails, so we need to interpret stderr
+         # to see if the command was actually successful.
+         def interpret_and_raise(stdout, stderr)
+           return stdout if stderr.empty?
+
+           raise SlurmTimeoutError, stderr if /^slurm_load_jobs error: Socket timed out/.match(stderr)
+
+           stdout
          end

          def squeue_attrs_for_info_attrs(attrs)
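How the new stderr interpretation plays out, sketched. `squeue` can exit zero while printing the timeout warning matched above, so `call` now inspects stderr even on a successful exit:

```ruby
stderr = "slurm_load_jobs error: Socket timed out on send/recv operation"

# interpret_and_raise's decision table:
#   empty stderr              -> stdout returned unchanged
#   the timeout line above    -> SlurmTimeoutError raised
#   any other stderr content  -> stdout still returned (treated as a warning)
# The job-listing code rescues SlurmTimeoutError and reports the job as
# 'undetermined' rather than failing the whole request.
/^slurm_load_jobs error: Socket timed out/.match?(stderr) #=> true
```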
lib/ood_core/version.rb CHANGED
@@ -1,4 +1,4 @@
  module OodCore
    # The current version of {OodCore}
-   VERSION = "0.13.0"
+   VERSION = "0.14.0"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: ood_core
  version: !ruby/object:Gem::Version
-   version: 0.13.0
+   version: 0.14.0
  platform: ruby
  authors:
  - Eric Franz
@@ -10,7 +10,7 @@ authors:
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-08-10 00:00:00.000000000 Z
+ date: 2020-10-01 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: ood_support
@@ -166,6 +166,11 @@ files:
  - lib/ood_core/job/adapters/ccq.rb
  - lib/ood_core/job/adapters/drmaa.rb
  - lib/ood_core/job/adapters/helper.rb
+ - lib/ood_core/job/adapters/kubernetes.rb
+ - lib/ood_core/job/adapters/kubernetes/batch.rb
+ - lib/ood_core/job/adapters/kubernetes/helper.rb
+ - lib/ood_core/job/adapters/kubernetes/resources.rb
+ - lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
  - lib/ood_core/job/adapters/linux_host.rb
  - lib/ood_core/job/adapters/linux_host/launcher.rb
  - lib/ood_core/job/adapters/linux_host/templates/email.erb.sh
@@ -216,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.0.3
+ rubygems_version: 3.0.8
  signing_key:
  specification_version: 4
  summary: Open OnDemand core library