ood_core 0.13.0 → 0.14.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3296708d7bc47f3379a9e4a6c845d3f25c5ccefb599f4b92406d9dffdaef220b
-  data.tar.gz: b6af9e90b67bc9a7a52203808d849d8800336b30b09bdb8ed204526d01bc92e9
+  metadata.gz: 52ba764b085dedb7eaeb06d95751f1804a50488e1859f980a7836d2d9032b95d
+  data.tar.gz: c2dc5edf395fe158960f33b80c554f3dc745f15e7ec1337b738683a0e1bbdc7f
 SHA512:
-  metadata.gz: 623ac6e6f8081d68a3e925d1150c9f20a0f613ccfb6837519d1b95d04533a72caa403c54327aad85dcea9c0694cc23941f40307d942623c095f53fed7fc32026
-  data.tar.gz: 0d785a9ade36b2f6f62f9ae55672091346aa4fb76bf358e6c00d4bc007623b8d1798813474665fc7b4d850d89e041fae5c2fefc9719fbe9f53a161a76127eaad
+  metadata.gz: 59915bae23a008a923c249d222e50548a7bee3438144068a29ae1cafdd489ca1229ee1a14f4f81e3fd065381f46f920bef24344fe633c7c578cb1f6a4f9a2a77
+  data.tar.gz: 8d2ca42c7f49158c8d321c21b79aff1c636df3c77bb7e71107db70371a34058d79c8a5ec32ca93883e7d3bcc7dc2202375144d23613f167ab089318d6270248c
CHANGELOG.md CHANGED
@@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+## [0.14.0] - 2020-10-01
+### Added
+- Kubernetes adapter in PR [156](https://github.com/OSC/ood_core/pull/156)
+
+### Fixed
+- Catch Slurm timeouts. [209](https://github.com/OSC/ood_core/pull/209)
+- LinuxHost adapter race condition in deleting tmp files. [212](https://github.com/OSC/ood_core/pull/212)
+
 ## [0.13.0] - 2020-08-10
 ### Added
 - CloudyCluster CCQ Adapter
@@ -247,7 +255,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 - Initial release!
 
-[Unreleased]: https://github.com/OSC/ood_core/compare/v0.13.0...HEAD
+[Unreleased]: https://github.com/OSC/ood_core/compare/v0.14.0...HEAD
+[0.14.0]: https://github.com/OSC/ood_core/compare/v0.13.0...v0.14.0
 [0.13.0]: https://github.com/OSC/ood_core/compare/v0.12.0...v0.13.0
 [0.12.0]: https://github.com/OSC/ood_core/compare/v0.11.4...v0.12.0
 [0.11.4]: https://github.com/OSC/ood_core/compare/v0.11.3...v0.11.4
data/README.md CHANGED
@@ -6,7 +6,7 @@
 
 - Website: http://openondemand.org/
 - Website repo with JOSS publication: https://github.com/OSC/Open-OnDemand
-- Documentation: https://osc.github.io/ood-documentation/master/
+- Documentation: https://osc.github.io/ood-documentation/latest/
 - Main code repo: https://github.com/OSC/ondemand
 - Core library repo: https://github.com/OSC/ood_core
 
data/lib/ood_core/job/adapters/kubernetes.rb ADDED
@@ -0,0 +1,193 @@
+require "ood_core/refinements/hash_extensions"
+require "ood_core/refinements/array_extensions"
+
+module OodCore
+  module Job
+    class Factory
+      using Refinements::HashExtensions
+
+      def self.build_kubernetes(config)
+        batch = Adapters::Kubernetes::Batch.new(config.to_h.symbolize_keys, Adapters::Kubernetes::Helper.new)
+        Adapters::Kubernetes.new(batch)
+      end
+    end
+
+    module Adapters
+      class Kubernetes < Adapter
+
+        using Refinements::ArrayExtensions
+        using Refinements::HashExtensions
+
+        require "ood_core/job/adapters/kubernetes/batch"
+
+        attr_reader :batch
+
+        def initialize(batch)
+          @batch = batch
+        end
+
+        # Submit a job with the attributes defined in the job template instance
+        # @example Submit job template to cluster
+        #   solver_id = job_adapter.submit(solver_script)
+        #   #=> "1234.server"
+        # @example Submit job that depends on previous job
+        #   post_id = job_adapter.submit(
+        #     post_script,
+        #     afterok: solver_id
+        #   )
+        #   #=> "1235.server"
+        # @param script [Script] script object that describes the
+        #   script and attributes for the submitted job
+        # @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
+        #   at any point after dependent jobs have started execution
+        # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution only after dependent jobs have terminated with no errors
+        # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution only after dependent jobs have terminated with errors
+        # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution after dependent jobs have terminated
+        # @return [String] the job id returned after successfully submitting a job
+        def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+          raise ArgumentError, 'Must specify the script' if script.nil?
+
+          batch.submit(script)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve info for all jobs from the resource manager
+        # @param attrs [Array<Symbol>] defaults to nil (and all attrs are provided)
+        #   This array specifies only the attrs you want, in addition to id and
+        #   status. If an array, the Info object that is returned to you is not
+        #   guaranteed to have a value for any attr besides the ones specified,
+        #   id and status.
+        #
+        #   For certain adapters this may speed up the response since
+        #   adapters can get by without populating the entire Info object
+        # @return [Array<Info>] information describing submitted jobs
+        def info_all(attrs: nil)
+          batch.info_all(attrs: attrs)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve info for all jobs for a given owner or owners from the
+        # resource manager
+        # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+        # @param attrs [Array<Symbol>] defaults to nil (and all attrs are provided;
+        #   see #info_all)
+        # @return [Array<Info>] information describing submitted jobs
+        def info_where_owner(owner, attrs: nil)
+          owner = Array.wrap(owner).map(&:to_s)
+
+          # must at least have job_owner to filter by job_owner
+          attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?
+
+          info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
+        end
+
+        # Iterate over each job Info object
+        # @param attrs [Array<Symbol>] defaults to nil (and all attrs are provided;
+        #   see #info_all)
+        # @yield [Info] of each job to block
+        # @return [Enumerator] if no block given
+        def info_all_each(attrs: nil)
+          return to_enum(:info_all_each, attrs: attrs) unless block_given?
+
+          info_all(attrs: attrs).each do |job|
+            yield job
+          end
+        end
+
+        # Iterate over each job Info object for a given owner or owners
+        # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+        # @param attrs [Array<Symbol>] defaults to nil (and all attrs are provided;
+        #   see #info_all)
+        # @yield [Info] of each job to block
+        # @return [Enumerator] if no block given
+        def info_where_owner_each(owner, attrs: nil)
+          return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
+
+          info_where_owner(owner, attrs: attrs).each do |job|
+            yield job
+          end
+        end
+
+        # Whether the adapter supports job arrays
+        # @return [Boolean] always false; this adapter does not support job arrays
+        def supports_job_arrays?
+          false
+        end
+
+        # Retrieve job info from the resource manager
+        # @param id [#to_s] the id of the job
+        # @return [Info] information describing submitted job
+        def info(id)
+          batch.info(id.to_s)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve job status from resource manager
+        # @note Optimized slightly over retrieving complete job information from server
+        # @param id [#to_s] the id of the job
+        # @return [Status] status of job
+        def status(id)
+          info(id).status
+        end
+
+        # Put the submitted job on hold
+        # @raise [NotImplementedError] the Kubernetes adapter does not support holding jobs
+        # @param id [#to_s] the id of the job
+        # @return [void]
+        def hold(id)
+          raise NotImplementedError, 'subclass did not define #hold'
+        end
+
+        # Release the job that is on hold
+        # @raise [NotImplementedError] the Kubernetes adapter does not support releasing jobs
+        # @param id [#to_s] the id of the job
+        # @return [void]
+        def release(id)
+          raise NotImplementedError, 'subclass did not define #release'
+        end
+
+        # Delete the submitted job
+        # @param id [#to_s] the id of the job
+        # @return [void]
+        def delete(id)
+          batch.delete(id.to_s)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+      end
+    end
+  end
+end
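For orientation, a minimal usage sketch of the new adapter. Hedged: the config keys mirror Batch#initialize below, the script.native container layout mirrors Helper#container_from_native, and the image/command values are illustrative rather than part of the gem.

    require "ood_core"

    # Build the adapter the same way Factory.build_kubernetes does above.
    adapter = OodCore::Job::Factory.build_kubernetes(
      {
        config_file: "#{Dir.home}/.kube/config",
        bin: "/usr/bin/kubectl",
        cluster_name: "open-ondemand",
        all_namespaces: false
      }
    )

    # The adapter is driven entirely by script.native; :container is the
    # hash Helper#container_from_native consumes.
    script = OodCore::Job::Script.new(
      content: "",  # unused here; the pod template defines what runs
      native: {
        container: {
          name: "jupyter",
          image: "jupyter/minimal-notebook",  # illustrative image
          command: "start-notebook.sh",
          port: 8888
        }
      }
    )

    id = adapter.submit(script)  # kubectl create -f - with the rendered pod yml
    adapter.status(id)           # queued/running/completed per pod_status_from_json
    adapter.delete(id)           # removes the pod plus its service/secret/configmap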
data/lib/ood_core/job/adapters/kubernetes/batch.rb ADDED
@@ -0,0 +1,350 @@
+require "ood_core/refinements/hash_extensions"
+require "etc"
+require "json"
+require "open3"
+
+class OodCore::Job::Adapters::Kubernetes::Batch
+
+  require "ood_core/job/adapters/kubernetes/helper"
+
+  Helper = OodCore::Job::Adapters::Kubernetes::Helper
+  Resources = OodCore::Job::Adapters::Kubernetes::Resources
+
+  using OodCore::Refinements::HashExtensions
+
+  class Error < StandardError; end
+
+  attr_reader :config_file, :bin, :cluster_name, :mounts
+  attr_reader :all_namespaces, :using_context, :helper
+  attr_reader :username_prefix
+
+  def initialize(options = {}, helper = Helper.new)
+    options = options.to_h.symbolize_keys
+
+    @config_file = options.fetch(:config_file, default_config_file)
+    @bin = options.fetch(:bin, '/usr/bin/kubectl')
+    @cluster_name = options.fetch(:cluster_name, 'open-ondemand')
+    @mounts = options.fetch(:mounts, []).map { |m| m.to_h.symbolize_keys }
+    @all_namespaces = options.fetch(:all_namespaces, false)
+    @username_prefix = options.fetch(:username_prefix, nil)
+
+    @using_context = false
+    @helper = helper
+
+    begin
+      make_kubectl_config(options)
+    rescue
+      # FIXME: could use a log here
+      # means you couldn't 'kubectl set config'
+    end
+  end
+
+  def resource_file(resource_type = 'pod')
+    File.dirname(__FILE__) + "/templates/#{resource_type}.yml.erb"
+  end
+
+  def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+    raise ArgumentError, 'Must specify the script' if script.nil?
+
+    resource_yml, id = generate_id_yml(script.native)
+    call("#{formatted_ns_cmd} create -f -", stdin: resource_yml)
+
+    id
+  end
+
+  def generate_id(name)
+    # 2_821_109_907_456 = 36**8
+    name.downcase.tr(' ', '-') + '-' + rand(2_821_109_907_456).to_s(36)
+  end
+
+  def info_all(attrs: nil)
+    cmd = if all_namespaces
+            "#{base_cmd} get pods -o json --all-namespaces"
+          else
+            "#{namespaced_cmd} get pods -o json"
+          end
+
+    output = call(cmd)
+    all_pods_to_info(output)
+  end
+
+  def info_where_owner(owner, attrs: nil)
+    owner = Array.wrap(owner).map(&:to_s)
+
+    # must at least have job_owner to filter by job_owner
+    attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?
+
+    info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
+  end
+
+  def info_all_each(attrs: nil)
+    return to_enum(:info_all_each, attrs: attrs) unless block_given?
+
+    info_all(attrs: attrs).each do |job|
+      yield job
+    end
+  end
+
+  def info_where_owner_each(owner, attrs: nil)
+    return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
+
+    info_where_owner(owner, attrs: attrs).each do |job|
+      yield job
+    end
+  end
+
+  def info(id)
+    pod_json = call_json_output('get', 'pod', id)
+
+    begin
+      service_json = call_json_output('get', 'service', service_name(id))
+      secret_json = call_json_output('get', 'secret', secret_name(id))
+    rescue
+      # it's ok if these don't exist
+      service_json ||= nil
+      secret_json ||= nil
+    end
+
+    helper.info_from_json(pod_json: pod_json, service_json: service_json, secret_json: secret_json)
+  end
+
+  def status(id)
+    info(id).status
+  end
+
+  def delete(id)
+    call("#{namespaced_cmd} delete pod #{id}")
+
+    begin
+      call("#{namespaced_cmd} delete service #{service_name(id)}")
+      call("#{namespaced_cmd} delete secret #{secret_name(id)}")
+      call("#{namespaced_cmd} delete configmap #{configmap_name(id)}")
+    rescue
+      # FIXME: retries? delete if exists?
+      # just eat the results of deleting services and secrets
+    end
+  end
+
+  def configmap_mount_path
+    '/ood'
+  end
+
+  private
+
+  # helper to format multi-line yaml data from the submit.yml into
+  # multi-line yaml in the pod.yml.erb
+  def config_data_lines(data)
+    output = []
+    first = true
+
+    data.to_s.each_line do |line|
+      output.append(first ? line : line.prepend("    "))
+      first = false
+    end
+
+    output
+  end
+
+  def username
+    @username ||= Etc.getlogin
+  end
+
+  def k8s_username
+    username_prefix.nil? ? username : "#{username_prefix}-#{username}"
+  end
+
+  def run_as_user
+    Etc.getpwnam(username).uid
+  end
+
+  def run_as_group
+    Etc.getpwnam(username).gid
+  end
+
+  def fs_group
+    run_as_group
+  end
+
+  # helper to template the resource yml you're going to submit and
+  # create an id.
+  def generate_id_yml(native_data)
+    container = helper.container_from_native(native_data[:container])
+    id = generate_id(container.name)
+    configmap = helper.configmap_from_native(native_data, id)
+    init_containers = helper.init_ctrs_from_native(native_data[:init_containers])
+    spec = Resources::PodSpec.new(container, init_containers: init_containers)
+    all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]
+
+    template = ERB.new(File.read(resource_file))
+
+    [template.result(binding), id]
+  end
+
+  # helper to call kubectl and get json data back.
+  # verb, resource and id are the kubernetes parlance terms.
+  # example: 'kubectl get pod my-pod-id' is verb=get, resource=pod
+  # and id=my-pod-id
+  def call_json_output(verb, resource, id, stdin: nil)
+    cmd = "#{formatted_ns_cmd} #{verb} #{resource} #{id}"
+    data = call(cmd, stdin: stdin)
+    data = data.empty? ? '{}' : data
+    json_data = JSON.parse(data, symbolize_names: true)
+
+    json_data
+  end
+
+  def service_name(id)
+    helper.service_name(id)
+  end
+
+  def secret_name(id)
+    helper.secret_name(id)
+  end
+
+  def configmap_name(id)
+    helper.configmap_name(id)
+  end
+
+  def namespace
+    default_namespace
+  end
+
+  def default_namespace
+    username
+  end
+
+  def context
+    cluster_name
+  end
+
+  def default_config_file
+    (ENV['KUBECONFIG'] || "#{Dir.home}/.kube/config")
+  end
+
+  def default_auth
+    {
+      type: 'managed'
+    }.symbolize_keys
+  end
+
+  def default_server
+    {
+      endpoint: 'https://localhost:8080',
+      cert_authority_file: nil
+    }.symbolize_keys
+  end
+
+  def formatted_ns_cmd
+    "#{namespaced_cmd} -o json"
+  end
+
+  def namespaced_cmd
+    "#{base_cmd} --namespace=#{namespace}"
+  end
+
+  def base_cmd
+    base = "#{bin} --kubeconfig=#{config_file}"
+    base << " --context=#{context}" if using_context
+    base
+  end
+
+  def all_pods_to_info(data)
+    json_data = JSON.parse(data, symbolize_names: true)
+    pods = json_data.dig(:items)
+
+    info_array = []
+    pods.each do |pod|
+      info = pod_info_from_json(pod)
+      info_array.push(info) unless info.nil?
+    end
+
+    info_array
+  rescue JSON::ParserError
+    # 'no resources in <namespace>' throws a parse error
+    []
+  end
+
+  def pod_info_from_json(pod)
+    hash = helper.pod_info_from_json(pod)
+    OodCore::Job::Info.new(hash)
+  rescue Helper::K8sDataError
+    # FIXME: silently eating the error, could probably use a logger
+    nil
+  end
+
+  def make_kubectl_config(config)
+    set_cluster(config.fetch(:server, default_server).to_h.symbolize_keys)
+    configure_auth(config.fetch(:auth, default_auth).to_h.symbolize_keys)
+  end
+
+  def configure_auth(auth)
+    type = auth.fetch(:type)
+    return if managed?(type)
+
+    case type
+    when 'gke'
+      set_gke_config(auth)
+    when 'oidc'
+      set_context
+    end
+  end
+
+  def use_context
+    @using_context = true
+  end
+
+  def managed?(type)
+    if type.nil?
+      true # maybe should be false?
+    else
+      type.to_s == 'managed'
+    end
+  end
+
+  def set_gke_config(auth)
+    cred_file = auth.fetch(:svc_acct_file)
+
+    cmd = "gcloud auth activate-service-account --key-file=#{cred_file}"
+    call(cmd)
+
+    set_gke_credentials(auth)
+  end
+
+  def set_gke_credentials(auth)
+    zone = auth.fetch(:zone, nil)
+    region = auth.fetch(:region, nil)
+
+    locale = ''
+    locale = "--zone=#{zone}" unless zone.nil?
+    locale = "--region=#{region}" unless region.nil?
+
+    # the gke cluster name can probably differ from what ood calls the cluster
+    cmd = "gcloud container clusters get-credentials #{locale} #{cluster_name}"
+    env = { 'KUBECONFIG' => config_file }
+    call(cmd, env: env)
+  end
+
+  def set_context
+    cmd = "#{base_cmd} config set-context #{cluster_name}"
+    cmd << " --cluster=#{cluster_name} --namespace=#{namespace}"
+    cmd << " --user=#{k8s_username}"
+
+    call(cmd)
+    use_context
+  end
+
+  def set_cluster(config)
+    server = config.fetch(:endpoint)
+    cert = config.fetch(:cert_authority_file, nil)
+
+    cmd = "#{base_cmd} config set-cluster #{cluster_name}"
+    cmd << " --server=#{server}"
+    cmd << " --certificate-authority=#{cert}" unless cert.nil?
+
+    call(cmd)
+  end
+
+  def call(cmd = '', env: {}, stdin: nil)
+    o, error, s = Open3.capture3(env, cmd, stdin_data: stdin.to_s)
+    s.success? ? o : raise(Error, error)
+  end
+end
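Since all of the Batch options come from a cluster's job configuration, here is a hedged sketch of the full recognized shape. The keys and defaults are taken from initialize, default_server, default_auth, and configure_auth above; the values are examples only.

    config = {
      config_file: "#{Dir.home}/.kube/config",  # where kubectl config is read/written
      bin: "/usr/bin/kubectl",
      cluster_name: "open-ondemand",
      all_namespaces: false,                    # true lists pods across namespaces
      username_prefix: nil,                     # prefixes the kubectl --user
      mounts: [                                 # merged with script.native[:mounts]
        { type: "nfs", name: "home", host: "nfs.example.com", path: "/users" }
      ],
      server: { endpoint: "https://localhost:8080", cert_authority_file: nil },
      auth: { type: "managed" }  # or "gke" (+ :svc_acct_file, :zone/:region) or "oidc"
    }

    batch = OodCore::Job::Adapters::Kubernetes::Batch.new(config)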
data/lib/ood_core/job/adapters/kubernetes/helper.rb ADDED
@@ -0,0 +1,298 @@
+class OodCore::Job::Adapters::Kubernetes::Helper
+
+  require 'ood_core/job/adapters/kubernetes/resources'
+  require 'base64'
+  require 'date'
+  require 'resolv'
+  require 'shellwords'
+
+  class K8sDataError < StandardError; end
+
+  Resources = OodCore::Job::Adapters::Kubernetes::Resources
+
+  # Extract info from json data. The data is expected to be from the kubectl
+  # command and conform to kubernetes' datatype structures.
+  #
+  # Returns { native: { host: localhost, port: 80, password: sshhh } } in the
+  # info object's native field in lieu of writing a connection.yml
+  #
+  # @param pod_json [#to_h]
+  #   the pod data returned from 'kubectl get pod abc-123'
+  # @param service_json [#to_h]
+  #   the service data returned from 'kubectl get service abc-123-service'
+  # @param secret_json [#to_h]
+  #   the secret data returned from 'kubectl get secret abc-123-secret'
+  # @return [OodCore::Job::Info]
+  def info_from_json(pod_json: nil, service_json: nil, secret_json: nil)
+    pod_hash = pod_info_from_json(pod_json)
+    service_hash = service_info_from_json(service_json)
+    secret_hash = secret_info_from_json(secret_json)
+
+    # can't just use deep_merge bc we don't depend *directly* on rails
+    pod_hash[:native] = pod_hash[:native].merge(service_hash[:native])
+    pod_hash[:native] = pod_hash[:native].merge(secret_hash[:native])
+    OodCore::Job::Info.new(pod_hash)
+  rescue NoMethodError
+    raise K8sDataError, "unable to read data correctly from json"
+  end
+
+  # Turn a container hash into a Kubernetes::Resources::Container
+  #
+  # @param container [#to_h]
+  #   the input container hash
+  # @return [OodCore::Job::Adapters::Kubernetes::Resources::Container]
+  def container_from_native(container)
+    Resources::Container.new(
+      container[:name],
+      container[:image],
+      command: parse_command(container[:command]),
+      port: container[:port],
+      env: container.fetch(:env, []),
+      memory: container[:memory],
+      cpu: container[:cpu],
+      working_dir: container[:working_dir],
+      restart_policy: container[:restart_policy]
+    )
+  end
+
+  # Parse a command string given from a user and return an array.
+  # If given an array, the input is simply returned back.
+  #
+  # @param cmd [#to_s]
+  #   the command to parse
+  # @return [Array<#to_s>]
+  #   the command parsed into an array of arguments
+  def parse_command(cmd)
+    if cmd.is_a?(Array)
+      cmd
+    else
+      Shellwords.split(cmd.to_s)
+    end
+  end
+
+  # Turn a configmap hash into a Kubernetes::Resources::ConfigMap
+  # that can be used in templates. Needs an id so that the resulting
+  # configmap has a known name.
+  #
+  # @param native [#to_h]
+  #   the input configmap hash
+  # @param id [#to_s]
+  #   the id to use for giving the configmap a name
+  # @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
+  def configmap_from_native(native, id)
+    configmap = native.fetch(:configmap, nil)
+    return nil if configmap.nil?
+
+    Resources::ConfigMap.new(
+      configmap_name(id),
+      configmap[:filename],
+      configmap[:data]
+    )
+  end
+
+  # Parse initialization containers from native data
+  #
+  # @param ctrs [Array<#to_h>]
+  #   the array of init container hashes to parse
+  # @return [Array<OodCore::Job::Adapters::Kubernetes::Resources::Container>]
+  #   the array of init containers
+  def init_ctrs_from_native(ctrs)
+    init_ctrs = []
+
+    ctrs&.each do |ctr_raw|
+      ctr = container_from_native(ctr_raw)
+      init_ctrs.push(ctr)
+    end
+
+    init_ctrs
+  end
+
+  def service_name(id)
+    id + '-service'
+  end
+
+  def secret_name(id)
+    id + '-secret'
+  end
+
+  def configmap_name(id)
+    id + '-configmap'
+  end
+
+  # Extract pod info from json data. The data is expected to be from the kubectl
+  # command and conform to kubernetes' datatype structures.
+  #
+  # @param json_data [#to_h]
+  #   the pod data returned from 'kubectl get pod abc-123'
+  # @return [#to_h]
+  #   the hash of info expected from adapters
+  def pod_info_from_json(json_data)
+    {
+      id: json_data.dig(:metadata, :name).to_s,
+      job_name: name_from_metadata(json_data.dig(:metadata)),
+      status: pod_status_from_json(json_data),
+      job_owner: json_data.dig(:metadata, :namespace).to_s,
+      submission_time: submission_time(json_data),
+      dispatch_time: dispatch_time(json_data),
+      wallclock_time: wallclock_time(json_data),
+      native: {
+        host: get_host(json_data.dig(:status, :hostIP))
+      },
+      procs: procs_from_json(json_data)
+    }
+  rescue NoMethodError
+    # gotta raise an error because Info.new will throw an error if id is undefined
+    raise K8sDataError, "unable to read data correctly from json"
+  end
+
+  private
+
+  def get_host(ip)
+    Resolv.getname(ip)
+  rescue Resolv::ResolvError
+    ip
+  end
+
+  def name_from_metadata(metadata)
+    name = metadata.dig(:labels, :'app.kubernetes.io/name')
+    name = metadata.dig(:labels, :'k8s-app') if name.nil?
+    name = metadata.dig(:name) if name.nil? # pod-id but better than nil?
+    name
+  end
+
+  def service_info_from_json(json_data)
+    # all we need is the port - .spec.ports[0].nodePort
+    ports = json_data.dig(:spec, :ports)
+    {
+      native: {
+        port: ports[0].dig(:nodePort)
+      }
+    }
+  rescue
+    empty_native
+  end
+
+  def secret_info_from_json(json_data)
+    raw = json_data.dig(:data, :password)
+    {
+      native: {
+        password: Base64.decode64(raw)
+      }
+    }
+  rescue
+    empty_native
+  end
+
+  def empty_native
+    {
+      native: {}
+    }
+  end
+
+  def dispatch_time(json_data)
+    status = pod_status_from_json(json_data)
+    return nil if status == 'undetermined'
+
+    state_data = json_data.dig(:status, :containerStatuses)[0].dig(:state)
+    date_string = nil
+
+    if status == 'completed'
+      date_string = state_data.dig(:terminated, :startedAt)
+    elsif status == 'running'
+      date_string = state_data.dig(:running, :startedAt)
+    end
+
+    date_string.nil? ? nil : DateTime.parse(date_string).to_time.to_i
+  end
+
+  def wallclock_time(json_data)
+    status = pod_status_from_json(json_data)
+    return nil if status == 'undetermined'
+
+    state_data = json_data.dig(:status, :containerStatuses)[0].dig(:state)
+    start_time = dispatch_time(json_data)
+    return nil if start_time.nil?
+
+    et = end_time(status, state_data)
+
+    et.nil? ? nil : et - start_time
+  end
+
+  def end_time(status, state_data)
+    if status == 'completed'
+      end_time_string = state_data.dig(:terminated, :finishedAt)
+      et = DateTime.parse(end_time_string).to_time.to_i
+    elsif status == 'running'
+      et = DateTime.now.to_time.to_i
+    else
+      et = nil
+    end
+
+    et
+  end
+
+  def submission_time(json_data)
+    status = json_data.dig(:status)
+    start = status.dig(:startTime)
+
+    if start.nil?
+      # the pod is in some pending state limbo
+      conditions = status.dig(:conditions)
+      # best guess at a start time is just the first condition's
+      # transition time
+      str = conditions[0].dig(:lastTransitionTime)
+    else
+      str = start
+    end
+
+    DateTime.parse(str).to_time.to_i
+  end
+
+  def pod_status_from_json(json_data)
+    state = 'undetermined'
+    status = json_data.dig(:status)
+    container_statuses = status.dig(:containerStatuses)
+
+    if container_statuses.nil?
+      # if you're here, it means the pod is pending, probably unschedulable
+      return OodCore::Job::Status.new(state: state)
+    end
+
+    # only support 1 container/pod
+    json_state = container_statuses[0].dig(:state)
+    state = 'running' unless json_state.dig(:running).nil?
+    state = terminated_state(json_state) unless json_state.dig(:terminated).nil?
+    state = 'queued' unless json_state.dig(:waiting).nil?
+
+    OodCore::Job::Status.new(state: state)
+  end
+
+  def terminated_state(status)
+    reason = status.dig(:terminated, :reason)
+    if reason == 'Error'
+      'suspended'
+    else
+      'completed'
+    end
+  end
+
+  def procs_from_json(json_data)
+    containers = json_data.dig(:spec, :containers)
+    resources = containers[0].dig(:resources)
+
+    cpu = resources.dig(:limits, :cpu)
+    millicores_rex = /(\d+)m/
+
+    # ok to return a string bc nil.to_i == 0 and we'd rather return
+    # nil (undefined) than 0, which is confusing.
+    if millicores_rex.match?(cpu)
+      millicores = millicores_rex.match(cpu)[1].to_i
+
+      # have to return at least 1 bc e.g. 200m would otherwise truncate to 0
+      ((millicores + 1000) / 1000).to_s
+    else
+      cpu
+    end
+  end
+end
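Two of the helpers above benefit from a worked example: parse_command shell-splits strings but passes arrays through untouched, and procs_from_json's millicore arithmetic rounds a fractional CPU limit up to a whole proc count. A sketch (values illustrative):

    helper = OodCore::Job::Adapters::Kubernetes::Helper.new

    helper.parse_command("python -m http.server 8080")
    # => ["python", "-m", "http.server", "8080"]
    helper.parse_command(["already", "split"])
    # => ["already", "split"]

    # procs_from_json millicore math, per the regex branch above:
    #   "200m"  -> (200  + 1000) / 1000 = 1   (integer division; never 0)
    #   "1500m" -> (1500 + 1000) / 1000 = 2
    #   "2"     -> no 'm' suffix, returned as-is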
data/lib/ood_core/job/adapters/kubernetes/resources.rb ADDED
@@ -0,0 +1,56 @@
+module OodCore::Job::Adapters::Kubernetes::Resources
+
+  class ConfigMap
+    attr_accessor :name, :filename, :data
+
+    def initialize(name, filename, data)
+      @name = name
+      @filename = filename
+      @data = data
+    end
+  end
+
+  class Container
+    attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
+                  :restart_policy
+
+    def initialize(
+      name, image, command: [], port: nil, env: [], memory: "4Gi", cpu: "1",
+      working_dir: "", restart_policy: "Never"
+    )
+      raise ArgumentError, "containers need valid names and images" unless name && image
+
+      @name = name
+      @image = image
+      @command = command.nil? ? [] : command
+      @port = port&.to_i
+      @env = env.nil? ? [] : env
+      @memory = memory.nil? ? "4Gi" : memory
+      @cpu = cpu.nil? ? "1" : cpu
+      @working_dir = working_dir.nil? ? "" : working_dir
+      @restart_policy = restart_policy.nil? ? "Never" : restart_policy
+    end
+
+    def ==(other)
+      name == other.name &&
+        image == other.image &&
+        command == other.command &&
+        port == other.port &&
+        env == other.env &&
+        memory == other.memory &&
+        cpu == other.cpu &&
+        working_dir == other.working_dir &&
+        restart_policy == other.restart_policy
+    end
+  end
+
+  class PodSpec
+    attr_accessor :container, :init_containers
+
+    def initialize(container, init_containers: nil)
+      @container = container
+      @init_containers = init_containers
+    end
+  end
+end
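These are plain value objects; inside the adapter they are only built through Helper#container_from_native, but constructing one directly shows the defaults. A sketch (the name and image are illustrative):

    ctr = OodCore::Job::Adapters::Kubernetes::Resources::Container.new(
      "jupyter", "jupyter/minimal-notebook",
      command: ["start-notebook.sh"],
      port: 8888
    )
    ctr.memory  # => "4Gi" (default limit/request)
    ctr.cpu     # => "1"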
data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb ADDED
@@ -0,0 +1,123 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  namespace: <%= namespace %>
+  name: <%= id %>
+  labels:
+    job: <%= id %>
+    app.kubernetes.io/name: <%= container.name %>
+    app.kubernetes.io/managed-by: open-ondemand
+spec:
+  restartPolicy: <%= spec.container.restart_policy %>
+  securityContext:
+    runAsUser: <%= run_as_user %>
+    runAsGroup: <%= run_as_group %>
+    fsGroup: <%= fs_group %>
+  containers:
+  - name: "<%= spec.container.name %>"
+    image: <%= spec.container.image %>
+    imagePullPolicy: IfNotPresent
+    <% unless spec.container.working_dir.empty? %>
+    workingDir: "<%= spec.container.working_dir %>"
+    <% end %>
+    <% unless spec.container.env.empty? %>
+    env:
+    <% spec.container.env.each do |env| %>
+    - name: <%= env[:name] %>
+      value: "<%= env[:value] %>"
+    <% end %> <%# for each env %>
+    <% end %> <%# unless env is nil %>
+    <% unless spec.container.command.empty? %>
+    command:
+    <% spec.container.command.each do |cmd| %>
+    - "<%= cmd %>"
+    <% end %> <%# for each command %>
+    <% end %> <%# unless command is nil %>
+    <% unless spec.container.port.nil? %>
+    ports:
+    - containerPort: <%= spec.container.port %>
+    <% end %>
+    volumeMounts:
+    <% unless configmap.nil? %>
+    - name: configmap-volume
+      mountPath: <%= configmap_mount_path %>
+    <% end %>
+    <% all_mounts.each do |mount| %>
+    - name: <%= mount[:name] %>
+      mountPath: <%= mount[:destination_path] %>
+    <% end %> <%# for each mount %>
+    resources:
+      limits:
+        memory: "<%= spec.container.memory %>"
+        cpu: "<%= spec.container.cpu %>"
+      requests:
+        memory: "<%= spec.container.memory %>"
+        cpu: "<%= spec.container.cpu %>"
+  <% unless spec.init_containers.nil? %>
+  initContainers:
+  <% spec.init_containers.each do |ctr| %>
+  - name: "<%= ctr.name %>"
+    image: "<%= ctr.image %>"
+    command:
+    <% ctr.command.each do |cmd| %>
+    - "<%= cmd %>"
+    <% end %> <%# command loop %>
+    volumeMounts:
+    <% unless configmap.nil? %>
+    - name: configmap-volume
+      mountPath: <%= configmap_mount_path %>
+    <% end %>
+    <% all_mounts.each do |mount| %>
+    - name: <%= mount[:name] %>
+      mountPath: <%= mount[:destination_path] %>
+    <% end %> <%# for each mount %>
+  <% end %> <%# init container loop %>
+  <% end %> <%# if init containers %>
+  <% unless configmap.nil? && all_mounts.empty? %>
+  volumes:
+  <% end %> <%# configmap.nil? && all_mounts.empty? %>
+  <% unless configmap.nil? %>
+  - name: configmap-volume
+    configMap:
+      name: <%= configmap_name(id) %>
+  <% end %>
+  <% all_mounts.each do |mount| %>
+  <% if mount[:type] == 'nfs' %>
+  - name: <%= mount[:name] %>
+    nfs:
+      server: <%= mount[:host] %>
+      path: <%= mount[:path] %>
+  <% elsif mount[:type] == 'host' %>
+  - name: <%= mount[:name] %>
+    hostPath:
+      path: <%= mount[:path] %>
+      type: <%= mount[:host_type] %>
+  <% end %> <%# if mount is [host,nfs] %>
+  <% end %> <%# for each mount %>
+---
+<% unless spec.container.port.nil? %>
+apiVersion: v1
+kind: Service
+metadata:
+  name: <%= service_name(id) %>
+  namespace: <%= namespace %>
+spec:
+  selector:
+    job: <%= id %>
+  ports:
+  - protocol: TCP
+    port: 80
+    targetPort: <%= spec.container.port %>
+  type: NodePort
+<% end %> <%# end for service %>
+---
+<% unless configmap.nil? %>
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: <%= configmap_name(id) %>
+  namespace: <%= namespace %>
+data:
+  <%= configmap.filename %>: |
+    <% config_data_lines(configmap.data).each do |line| %><%= line %><% end %>
+<% end %> <%# end for configmap %>
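The template renders against the binding captured in Batch#generate_id_yml, so every bare name above must resolve there. A hedged inventory of what the ERB consumes, derived by reading the template against generate_id_yml (not documented API):

    # Locals set in generate_id_yml:  container, id, configmap, spec, all_mounts
    # Batch methods reachable via the binding:  namespace, run_as_user,
    #   run_as_group, fs_group, configmap_mount_path, service_name(id),
    #   secret_name(id), configmap_name(id)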
data/lib/ood_core/job/adapters/linux_host/launcher.rb CHANGED
@@ -166,7 +166,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
       'email_on_terminated' => script_email_on_event(script, 'terminated'),
       'email_on_start' => script_email_on_event(script, 'started'),
       'environment' => export_env(script),
-      'error_path' => (script.error_path) ? script.error_path.to_s : '/dev/null',
+      'error_path' => error_path(script),
       'job_name' => script.job_name.to_s,
       'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
       'script_content' => content,
@@ -176,6 +176,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
       'singularity_image' => singularity_image(script.native),
       'ssh_hosts' => ssh_hosts,
       'tmux_bin' => tmux_bin,
+      'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
     }.each{
       |key, value| bnd.local_variable_set(key, value)
     }
@@ -272,4 +273,11 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
     return false if script.content.empty?
     script.content.split("\n").first.start_with?('#!/')
   end
+
+  def error_path(script)
+    return script.error_path.to_s if script.error_path
+    return script.output_path.to_s if script.output_path
+
+    '/dev/null'
+  end
 end
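The net behavior of the extracted helper, spelled out (a summary of the hunk above, not new logic):

    # error_path(script) resolution order:
    #   1. script.error_path set  -> use it
    #   2. script.output_path set -> stderr joins stdout in output_path
    #   3. neither                -> '/dev/null' (the old unconditional default)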
data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh CHANGED
@@ -16,13 +16,9 @@ fi
 echo $hostname
 
 # Put the script into a temp file on localhost
-<% if debug %>
-singularity_tmp_file=$(mktemp -p "$HOME" --suffix '_sing')
-tmux_tmp_file=$(mktemp -p "$HOME" --suffix "_tmux")
-<% else %>
-singularity_tmp_file=$(mktemp)
-tmux_tmp_file=$(mktemp)
-<% end %>
+singularity_tmp_file=$(mktemp -p "<%= workdir %>" --suffix '_sing')
+tmux_tmp_file=$(mktemp -p "<%= workdir %>" --suffix "_tmux")
+
 
 # Create an executable to run in a tmux session
 # The escaped HEREDOC means that we need to substitute in $singularity_tmp_file ourselves
@@ -69,10 +65,3 @@ SINGULARITY_LAUNCHER
 chmod +x "$singularity_tmp_file"
 chmod +x "$tmux_tmp_file"
 <%= tmux_bin %> new-session -d -s "<%= session_name %>" "$tmux_tmp_file"
-
-# Remove the file
-<% if ! debug %>
-# Wait 1 second to ensure that tmux session has started before the file is removed
-sleep 1
-rm -f "$tmux_tmp_file"; rm -f "$singularity_tmp_file"
-<% end %>
data/lib/ood_core/job/adapters/slurm.rb CHANGED
@@ -80,6 +80,9 @@ module OodCore
       # from
       class Error < StandardError; end
 
+      # An error indicating the slurm command timed out
+      class SlurmTimeoutError < Error; end
+
       # @param cluster [#to_s, nil] the cluster name
       # @param conf [#to_s, nil] path to the slurm conf
       # @param bin [#to_s] path to slurm installation binaries
@@ -147,6 +150,9 @@ module OodCore
           end
           jobs
         end
+      rescue SlurmTimeoutError
+        # TODO: could use a log entry here
+        return [{ id: id, state: 'undetermined' }]
       end
 
       def squeue_fields(attrs)
@@ -303,7 +309,18 @@ module OodCore
 
         cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
         o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
-        s.success? ? o : raise(Error, e)
+        s.success? ? interpret_and_raise(o, e) : raise(Error, e)
+      end
+
+      # Helper function to raise an error based on the contents of stderr.
+      # Slurm exits 0 even when the command fails, so we need to interpret stderr
+      # to see if the command was actually successful.
+      def interpret_and_raise(stdout, stderr)
+        return stdout if stderr.empty?
+
+        raise SlurmTimeoutError, stderr if /^slurm_load_jobs error: Socket timed out/.match(stderr)
+
+        stdout
       end
 
       def squeue_attrs_for_info_attrs(attrs)
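Taken together, the three Slurm hunks mean a slurmctld socket timeout no longer masquerades as a successful squeue call. A hedged summary of the new control flow (paraphrasing the code above):

    # call(...) -> interpret_and_raise(stdout, stderr):
    #   stderr empty                                             -> stdout
    #   stderr matches /^slurm_load_jobs error: Socket timed out/ -> SlurmTimeoutError
    #   any other stderr (squeue warnings etc.)                  -> stdout
    #
    # The jobs-fetching method in the second hunk rescues SlurmTimeoutError and
    # returns [{ id: id, state: 'undetermined' }], so callers see an
    # 'undetermined' job instead of a raised error while slurmctld is busy.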
data/lib/ood_core/version.rb CHANGED
@@ -1,4 +1,4 @@
 module OodCore
   # The current version of {OodCore}
-  VERSION = "0.13.0"
+  VERSION = "0.14.0"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ood_core
 version: !ruby/object:Gem::Version
-  version: 0.13.0
+  version: 0.14.0
 platform: ruby
 authors:
 - Eric Franz
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-08-10 00:00:00.000000000 Z
+date: 2020-10-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ood_support
@@ -166,6 +166,11 @@ files:
 - lib/ood_core/job/adapters/ccq.rb
 - lib/ood_core/job/adapters/drmaa.rb
 - lib/ood_core/job/adapters/helper.rb
+- lib/ood_core/job/adapters/kubernetes.rb
+- lib/ood_core/job/adapters/kubernetes/batch.rb
+- lib/ood_core/job/adapters/kubernetes/helper.rb
+- lib/ood_core/job/adapters/kubernetes/resources.rb
+- lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
 - lib/ood_core/job/adapters/linux_host.rb
 - lib/ood_core/job/adapters/linux_host/launcher.rb
 - lib/ood_core/job/adapters/linux_host/templates/email.erb.sh
@@ -216,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.0.3
+rubygems_version: 3.0.8
 signing_key:
 specification_version: 4
 summary: Open OnDemand core library