ood_core 0.13.0 → 0.14.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -1
- data/README.md +1 -1
- data/lib/ood_core/job/adapters/kubernetes.rb +193 -0
- data/lib/ood_core/job/adapters/kubernetes/batch.rb +350 -0
- data/lib/ood_core/job/adapters/kubernetes/helper.rb +298 -0
- data/lib/ood_core/job/adapters/kubernetes/resources.rb +56 -0
- data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb +123 -0
- data/lib/ood_core/job/adapters/linux_host/launcher.rb +9 -1
- data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh +3 -14
- data/lib/ood_core/job/adapters/slurm.rb +18 -1
- data/lib/ood_core/version.rb +1 -1
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 52ba764b085dedb7eaeb06d95751f1804a50488e1859f980a7836d2d9032b95d
+  data.tar.gz: c2dc5edf395fe158960f33b80c554f3dc745f15e7ec1337b738683a0e1bbdc7f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 59915bae23a008a923c249d222e50548a7bee3438144068a29ae1cafdd489ca1229ee1a14f4f81e3fd065381f46f920bef24344fe633c7c578cb1f6a4f9a2a77
+  data.tar.gz: 8d2ca42c7f49158c8d321c21b79aff1c636df3c77bb7e71107db70371a34058d79c8a5ec32ca93883e7d3bcc7dc2202375144d23613f167ab089318d6270248c
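These sums cover the two members of the packaged gem, metadata.gz and data.tar.gz. A minimal sketch for recomputing them locally, assuming the gem has been downloaded as ood_core-0.14.0.gem:

```ruby
require "rubygems/package"
require "digest"

# A .gem file is a tar archive; metadata.gz and data.tar.gz are the members
# whose digests are published in checksums.yaml.
tar = Gem::Package::TarReader.new(File.open("ood_core-0.14.0.gem", "rb"))
tar.each do |entry|
  next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
  puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
end
```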
data/CHANGELOG.md
CHANGED
@@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+## [0.14.0] - 2020-10-01
+### Added
+- Kubernetes adapter in PR [156](https://github.com/OSC/ood_core/pull/156)
+
+### Fixed
+- Catch Slurm timeouts. [209](https://github.com/OSC/ood_core/pull/209)
+- LHA race condition in deleting tmp files. [212](https://github.com/OSC/ood_core/pull/212)
+
 ## [0.13.0] - 2020-08-10
 ### Added
 - CloudyCluster CCQ Adapter
@@ -247,7 +255,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ### Added
 - Initial release!
 
-[Unreleased]: https://github.com/OSC/ood_core/compare/v0.13.0...HEAD
+[Unreleased]: https://github.com/OSC/ood_core/compare/v0.14.0...HEAD
+[0.14.0]: https://github.com/OSC/ood_core/compare/v0.13.0...v0.14.0
 [0.13.0]: https://github.com/OSC/ood_core/compare/v0.12.0...v0.13.0
 [0.12.0]: https://github.com/OSC/ood_core/compare/v0.11.4...v0.12.0
 [0.11.4]: https://github.com/OSC/ood_core/compare/v0.11.3...v0.11.4
data/README.md
CHANGED
@@ -6,7 +6,7 @@
 
 - Website: http://openondemand.org/
 - Website repo with JOSS publication: https://github.com/OSC/Open-OnDemand
-- Documentation: https://osc.github.io/ood-documentation/
+- Documentation: https://osc.github.io/ood-documentation/latest/
 - Main code repo: https://github.com/OSC/ondemand
 - Core library repo: https://github.com/OSC/ood_core
data/lib/ood_core/job/adapters/kubernetes.rb
ADDED

require "ood_core/refinements/hash_extensions"
require "ood_core/refinements/array_extensions"

module OodCore
  module Job
    class Factory
      using Refinements::HashExtensions

      def self.build_kubernetes(config)
        batch = Adapters::Kubernetes::Batch.new(config.to_h.symbolize_keys, Adapters::Kubernetes::Helper.new)
        Adapters::Kubernetes.new(batch)
      end
    end

    module Adapters
      class Kubernetes < Adapter

        using Refinements::ArrayExtensions
        using Refinements::HashExtensions

        require "ood_core/job/adapters/kubernetes/batch"

        attr_reader :batch

        def initialize(batch)
          @batch = batch
        end

        # Submit a job with the attributes defined in the job template instance
        # @example Submit job template to cluster
        #   solver_id = job_adapter.submit(solver_script)
        #   #=> "1234.server"
        # @example Submit job that depends on previous job
        #   post_id = job_adapter.submit(
        #     post_script,
        #     afterok: solver_id
        #   )
        #   #=> "1235.server"
        # @param script [Script] script object that describes the
        #   script and attributes for the submitted job
        # @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
        #   at any point after dependent jobs have started execution
        # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution only after dependent jobs have terminated with no errors
        # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution only after dependent jobs have terminated with errors
        # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
        #   execution after dependent jobs have terminated
        # @return [String] the job id returned after successfully submitting a job
        def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
          raise ArgumentError, 'Must specify the script' if script.nil?

          batch.submit(script)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve info for all jobs from the resource manager
        # @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
        #   This array specifies only attrs you want, in addition to id and status.
        #   If an array, the Info object that is returned to you is not guaranteed
        #   to have a value for any attr besides the ones specified and id and status.
        #
        #   For certain adapters this may speed up the response since
        #   adapters can get by without populating the entire Info object
        # @return [Array<Info>] information describing submitted jobs
        def info_all(attrs: nil)
          batch.info_all(attrs: attrs)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve info for all jobs for a given owner or owners from the
        # resource manager
        # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
        # @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
        #   This array specifies only attrs you want, in addition to id and status.
        #   If an array, the Info object that is returned to you is not guaranteed
        #   to have a value for any attr besides the ones specified and id and status.
        #
        #   For certain adapters this may speed up the response since
        #   adapters can get by without populating the entire Info object
        # @return [Array<Info>] information describing submitted jobs
        def info_where_owner(owner, attrs: nil)
          owner = Array.wrap(owner).map(&:to_s)

          # must at least have job_owner to filter by job_owner
          attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?

          info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
        end

        # Iterate over each job Info object
        # @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
        #   This array specifies only attrs you want, in addition to id and status.
        #   If an array, the Info object that is returned to you is not guaranteed
        #   to have a value for any attr besides the ones specified and id and status.
        #
        #   For certain adapters this may speed up the response since
        #   adapters can get by without populating the entire Info object
        # @yield [Info] of each job to block
        # @return [Enumerator] if no block given
        def info_all_each(attrs: nil)
          return to_enum(:info_all_each, attrs: attrs) unless block_given?

          info_all(attrs: attrs).each do |job|
            yield job
          end
        end

        # Iterate over each job Info object
        # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
        # @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
        #   This array specifies only attrs you want, in addition to id and status.
        #   If an array, the Info object that is returned to you is not guaranteed
        #   to have a value for any attr besides the ones specified and id and status.
        #
        #   For certain adapters this may speed up the response since
        #   adapters can get by without populating the entire Info object
        # @yield [Info] of each job to block
        # @return [Enumerator] if no block given
        def info_where_owner_each(owner, attrs: nil)
          return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?

          info_where_owner(owner, attrs: attrs).each do |job|
            yield job
          end
        end

        # Whether the adapter supports job arrays
        # @return [Boolean] - false; this adapter explicitly does not
        #   support job arrays
        def supports_job_arrays?
          false
        end

        # Retrieve job info from the resource manager
        # @param id [#to_s] the id of the job
        # @return [Info] information describing submitted job
        def info(id)
          batch.info(id.to_s)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end

        # Retrieve job status from resource manager
        # @note Optimized slightly over retrieving complete job information from server
        # @param id [#to_s] the id of the job
        # @return [Status] status of job
        def status(id)
          info(id).status
        end

        # Put the submitted job on hold
        # @raise [NotImplementedError] as this adapter does not support holds
        # @param id [#to_s] the id of the job
        # @return [void]
        def hold(id)
          raise NotImplementedError, 'subclass did not define #hold'
        end

        # Release the job that is on hold
        # @raise [NotImplementedError] as this adapter does not support holds
        # @param id [#to_s] the id of the job
        # @return [void]
        def release(id)
          raise NotImplementedError, 'subclass did not define #release'
        end

        # Delete the submitted job.
        #
        # @param id [#to_s] the id of the job
        # @return [void]
        def delete(id)
          batch.delete(id.to_s)
        rescue Batch::Error => e
          raise JobAdapterError, e.message
        end
      end
    end
  end
end
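A sketch of wiring this factory hook up by hand. The option keys mirror those read by Kubernetes::Batch#initialize and #make_kubectl_config in batch.rb below; the values shown are illustrative placeholders, not a recommended configuration:

```ruby
require "ood_core"

config = {
  config_file:     "#{Dir.home}/.kube/config",   # kubeconfig to read/write
  bin:             "/usr/bin/kubectl",
  cluster_name:    "open-ondemand",
  all_namespaces:  false,
  username_prefix: nil,
  mounts:          [],
  server:          { endpoint: "https://localhost:8080", cert_authority_file: nil },
  auth:            { type: "managed" }
}

adapter = OodCore::Job::Factory.build_kubernetes(config)

# kubectl failures surface as OodCore::JobAdapterError.
adapter.info_all.each { |info| puts "#{info.id} (#{info.job_owner}): #{info.status}" }
```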
data/lib/ood_core/job/adapters/kubernetes/batch.rb
ADDED

require "ood_core/refinements/hash_extensions"
require "json"
require "etc"   # Etc.getlogin / Etc.getpwnam below
require "open3" # Open3.capture3 in #call
require "erb"   # template rendering in #generate_id_yml

class OodCore::Job::Adapters::Kubernetes::Batch

  require "ood_core/job/adapters/kubernetes/helper"

  Helper = OodCore::Job::Adapters::Kubernetes::Helper
  Resources = OodCore::Job::Adapters::Kubernetes::Resources

  using OodCore::Refinements::HashExtensions

  class Error < StandardError; end

  attr_reader :config_file, :bin, :cluster_name, :mounts
  attr_reader :all_namespaces, :using_context, :helper
  attr_reader :username_prefix

  def initialize(options = {}, helper = Helper.new)
    options = options.to_h.symbolize_keys

    @config_file = options.fetch(:config_file, default_config_file)
    @bin = options.fetch(:bin, '/usr/bin/kubectl')
    @cluster_name = options.fetch(:cluster_name, 'open-ondemand')
    @mounts = options.fetch(:mounts, []).map { |m| m.to_h.symbolize_keys }
    @all_namespaces = options.fetch(:all_namespaces, false)
    @username_prefix = options.fetch(:username_prefix, nil)

    @using_context = false
    @helper = helper

    begin
      make_kubectl_config(options)
    rescue
      # FIXME: could use a log here
      # means you couldn't 'kubectl set config'
    end
  end

  def resource_file(resource_type = 'pod')
    File.dirname(__FILE__) + "/templates/#{resource_type}.yml.erb"
  end

  def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
    raise ArgumentError, 'Must specify the script' if script.nil?

    resource_yml, id = generate_id_yml(script.native)
    call("#{formatted_ns_cmd} create -f -", stdin: resource_yml)

    id
  end

  def generate_id(name)
    # 2_821_109_907_456 = 36**8
    name.downcase.tr(' ', '-') + '-' + rand(2_821_109_907_456).to_s(36)
  end

  def info_all(attrs: nil)
    cmd = if all_namespaces
            "#{base_cmd} get pods -o json --all-namespaces"
          else
            "#{namespaced_cmd} get pods -o json"
          end

    output = call(cmd)
    all_pods_to_info(output)
  end

  def info_where_owner(owner, attrs: nil)
    owner = Array.wrap(owner).map(&:to_s)

    # must at least have job_owner to filter by job_owner
    attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?

    info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
  end

  def info_all_each(attrs: nil)
    return to_enum(:info_all_each, attrs: attrs) unless block_given?

    info_all(attrs: attrs).each do |job|
      yield job
    end
  end

  def info_where_owner_each(owner, attrs: nil)
    return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?

    info_where_owner(owner, attrs: attrs).each do |job|
      yield job
    end
  end

  def info(id)
    pod_json = call_json_output('get', 'pod', id)

    begin
      service_json = call_json_output('get', 'service', service_name(id))
      secret_json = call_json_output('get', 'secret', secret_name(id))
    rescue
      # it's ok if these don't exist
      service_json ||= nil
      secret_json ||= nil
    end

    helper.info_from_json(pod_json: pod_json, service_json: service_json, secret_json: secret_json)
  end

  def status(id)
    info(id).status
  end

  def delete(id)
    call("#{namespaced_cmd} delete pod #{id}")

    begin
      call("#{namespaced_cmd} delete service #{service_name(id)}")
      call("#{namespaced_cmd} delete secret #{secret_name(id)}")
      call("#{namespaced_cmd} delete configmap #{configmap_name(id)}")
    rescue
      # FIXME: retries? delete if exists?
      # just eat the results of deleting services and secrets
    end
  end

  def configmap_mount_path
    '/ood'
  end

  private

  # helper to format multi-line yaml data from the submit.yml into
  # multi-line yaml in the pod.yml.erb
  def config_data_lines(data)
    output = []
    first = true

    data.to_s.each_line do |line|
      output.append(first ? line : line.prepend("    "))
      first = false
    end

    output
  end

  def username
    @username ||= Etc.getlogin
  end

  def k8s_username
    username_prefix.nil? ? username : "#{username_prefix}-#{username}"
  end

  def run_as_user
    Etc.getpwnam(username).uid
  end

  def run_as_group
    Etc.getpwnam(username).gid
  end

  def fs_group
    run_as_group
  end

  # helper to template the resource yml you're going to submit and
  # create an id.
  def generate_id_yml(native_data)
    container = helper.container_from_native(native_data[:container])
    id = generate_id(container.name)
    configmap = helper.configmap_from_native(native_data, id)
    init_containers = helper.init_ctrs_from_native(native_data[:init_containers])
    spec = Resources::PodSpec.new(container, init_containers: init_containers)
    all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]

    template = ERB.new(File.read(resource_file))

    [template.result(binding), id]
  end

  # helper to call kubectl and get json data back.
  # verb, resource and id are the kubernetes parlance terms.
  # example: 'kubectl get pod my-pod-id' is verb=get, resource=pod
  # and id=my-pod-id
  def call_json_output(verb, resource, id, stdin: nil)
    cmd = "#{formatted_ns_cmd} #{verb} #{resource} #{id}"
    data = call(cmd, stdin: stdin)
    data = data.empty? ? '{}' : data
    json_data = JSON.parse(data, symbolize_names: true)

    json_data
  end

  def service_name(id)
    helper.service_name(id)
  end

  def secret_name(id)
    helper.secret_name(id)
  end

  def configmap_name(id)
    helper.configmap_name(id)
  end

  def namespace
    default_namespace
  end

  def default_namespace
    username
  end

  def context
    cluster_name
  end

  def default_config_file
    (ENV['KUBECONFIG'] || "#{Dir.home}/.kube/config")
  end

  def default_auth
    {
      type: 'managed'
    }.symbolize_keys
  end

  def default_server
    {
      endpoint: 'https://localhost:8080',
      cert_authority_file: nil
    }.symbolize_keys
  end

  def formatted_ns_cmd
    "#{namespaced_cmd} -o json"
  end

  def namespaced_cmd
    "#{base_cmd} --namespace=#{namespace}"
  end

  def base_cmd
    base = "#{bin} --kubeconfig=#{config_file}"
    base << " --context=#{context}" if using_context
    base
  end

  def all_pods_to_info(data)
    json_data = JSON.parse(data, symbolize_names: true)
    pods = json_data.dig(:items)

    info_array = []
    pods.each do |pod|
      info = pod_info_from_json(pod)
      info_array.push(info) unless info.nil?
    end

    info_array
  rescue JSON::ParserError
    # 'no resources in <namespace>' throws parse error
    []
  end

  def pod_info_from_json(pod)
    hash = helper.pod_info_from_json(pod)
    OodCore::Job::Info.new(hash)
  rescue Helper::K8sDataError
    # FIXME: silently eating error, could probably use a logger
    nil
  end

  def make_kubectl_config(config)
    set_cluster(config.fetch(:server, default_server).to_h.symbolize_keys)
    configure_auth(config.fetch(:auth, default_auth).to_h.symbolize_keys)
  end

  def configure_auth(auth)
    type = auth.fetch(:type)
    return if managed?(type)

    case type
    when 'gke'
      set_gke_config(auth)
    when 'oidc'
      set_context
    end
  end

  def use_context
    @using_context = true
  end

  def managed?(type)
    if type.nil?
      true # maybe should be false?
    else
      type.to_s == 'managed'
    end
  end

  def set_gke_config(auth)
    cred_file = auth.fetch(:svc_acct_file)

    cmd = "gcloud auth activate-service-account --key-file=#{cred_file}"
    call(cmd)

    set_gke_credentials(auth)
  end

  def set_gke_credentials(auth)
    zone = auth.fetch(:zone, nil)
    region = auth.fetch(:region, nil)

    locale = ''
    locale = "--zone=#{zone}" unless zone.nil?
    locale = "--region=#{region}" unless region.nil?

    # gke cluster name can probably differ from what ood calls the cluster
    cmd = "gcloud container clusters get-credentials #{locale} #{cluster_name}"
    env = { 'KUBECONFIG' => config_file }
    call(cmd, env: env)
  end

  def set_context
    cmd = "#{base_cmd} config set-context #{cluster_name}"
    cmd << " --cluster=#{cluster_name} --namespace=#{namespace}"
    cmd << " --user=#{k8s_username}"

    call(cmd)
    use_context
  end

  def set_cluster(config)
    server = config.fetch(:endpoint)
    cert = config.fetch(:cert_authority_file, nil)

    cmd = "#{base_cmd} config set-cluster #{cluster_name}"
    cmd << " --server=#{server}"
    cmd << " --certificate-authority=#{cert}" unless cert.nil?

    call(cmd)
  end

  def call(cmd = '', env: {}, stdin: nil)
    o, error, s = Open3.capture3(env, cmd, stdin_data: stdin.to_s)
    s.success? ? o : raise(Error, error)
  end
end
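Batch#submit derives everything from Script#native rather than the usual shell-script fields: generate_id_yml reads :container, :init_containers, :configmap and :mounts out of the native hash and templates them into pod.yml.erb. A sketch of a submission, reusing the adapter built above; the image, mount, and env values are hypothetical:

```ruby
script = OodCore::Job::Script.new(
  content: "",   # unused here; the pod spec comes entirely from native
  native: {
    container: {
      name:    "rstudio",                 # hypothetical app name
      image:   "rocker/rstudio:latest",   # hypothetical image
      command: "start.sh --port 8080",    # strings are Shellwords-split by Helper#parse_command
      port:    8080,
      env:     [{ name: "HOME", value: "/home/ood" }],
      memory:  "4Gi",
      cpu:     "1"
    },
    configmap: { filename: "config.yml", data: "key: value\n" },
    mounts: [
      { type: "nfs", name: "home", host: "nfs.example.com",
        path: "/users", destination_path: "/users" }
    ]
  }
)

id = adapter.submit(script)  # e.g. "rstudio-1b2c3d4e"; the suffix is 8 base-36 digits
```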
data/lib/ood_core/job/adapters/kubernetes/helper.rb
ADDED

class OodCore::Job::Adapters::Kubernetes::Helper

  require 'ood_core/job/adapters/kubernetes/resources'
  require 'resolv'
  require 'base64'
  require 'shellwords' # Shellwords.split in #parse_command
  require 'date'       # DateTime.parse in the time helpers

  class K8sDataError < StandardError; end

  Resources = OodCore::Job::Adapters::Kubernetes::Resources

  # Extract info from json data. The data is expected to be from the kubectl
  # command and conform to kubernetes' datatype structures.
  #
  # Returns { native: { host: localhost, port: 80, password: sshhh } } in the
  # Info object's native field in lieu of writing a connection.yml
  #
  # @param pod_json [#to_h]
  #   the pod data returned from 'kubectl get pod abc-123'
  # @param service_json [#to_h]
  #   the service data returned from 'kubectl get service abc-123-service'
  # @param secret_json [#to_h]
  #   the secret data returned from 'kubectl get secret abc-123-secret'
  # @return [OodCore::Job::Info]
  def info_from_json(pod_json: nil, service_json: nil, secret_json: nil)
    pod_hash = pod_info_from_json(pod_json)
    service_hash = service_info_from_json(service_json)
    secret_hash = secret_info_from_json(secret_json)

    # can't just use deep_merge bc we don't depend *directly* on rails
    pod_hash[:native] = pod_hash[:native].merge(service_hash[:native])
    pod_hash[:native] = pod_hash[:native].merge(secret_hash[:native])
    OodCore::Job::Info.new(pod_hash)
  rescue NoMethodError
    raise K8sDataError, "unable to read data correctly from json"
  end

  # Turn a container hash into a Kubernetes::Resources::Container
  #
  # @param container [#to_h]
  #   the input container hash
  # @return [OodCore::Job::Adapters::Kubernetes::Resources::Container]
  def container_from_native(container)
    Resources::Container.new(
      container[:name],
      container[:image],
      command: parse_command(container[:command]),
      port: container[:port],
      env: container.fetch(:env, []),
      memory: container[:memory],
      cpu: container[:cpu],
      working_dir: container[:working_dir],
      restart_policy: container[:restart_policy]
    )
  end

  # Parse a command string given from a user and return an array.
  # If given an array, the input is simply returned back.
  #
  # @param cmd [#to_s]
  #   the command to parse
  # @return [Array<#to_s>]
  #   the command parsed into an array of arguments
  def parse_command(cmd)
    if cmd.is_a?(Array)
      cmd
    else
      Shellwords.split(cmd.to_s)
    end
  end

  # Turn a configmap hash into a Kubernetes::Resources::ConfigMap
  # that can be used in templates. Needs an id so that the resulting
  # configmap has a known name.
  #
  # @param native [#to_h]
  #   the input configmap hash
  # @param id [#to_s]
  #   the id to use for giving the configmap a name
  # @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
  def configmap_from_native(native, id)
    configmap = native.fetch(:configmap, nil)
    return nil if configmap.nil?

    Resources::ConfigMap.new(
      configmap_name(id),
      configmap[:filename],
      configmap[:data]
    )
  end

  # parse initialization containers from native data
  #
  # @param ctrs [Array<#to_h>]
  #   the array of init container hashes from the native data
  # @return [Array<OodCore::Job::Adapters::Kubernetes::Resources::Container>]
  #   the array of init containers
  def init_ctrs_from_native(ctrs)
    init_ctrs = []

    ctrs&.each do |ctr_raw|
      ctr = container_from_native(ctr_raw)
      init_ctrs.push(ctr)
    end

    init_ctrs
  end

  def service_name(id)
    id + '-service'
  end

  def secret_name(id)
    id + '-secret'
  end

  def configmap_name(id)
    id + '-configmap'
  end

  # Extract pod info from json data. The data is expected to be from the kubectl
  # command and conform to kubernetes' datatype structures.
  #
  # @param json_data [#to_h]
  #   the pod data returned from 'kubectl get pod abc-123'
  # @return [#to_h]
  #   the hash of info expected from adapters
  def pod_info_from_json(json_data)
    {
      id: json_data.dig(:metadata, :name).to_s,
      job_name: name_from_metadata(json_data.dig(:metadata)),
      status: pod_status_from_json(json_data),
      job_owner: json_data.dig(:metadata, :namespace).to_s,
      submission_time: submission_time(json_data),
      dispatch_time: dispatch_time(json_data),
      wallclock_time: wallclock_time(json_data),
      native: {
        host: get_host(json_data.dig(:status, :hostIP))
      },
      procs: procs_from_json(json_data)
    }
  rescue NoMethodError
    # must raise an error because Info.new will throw an error if id is undefined
    raise K8sDataError, "unable to read data correctly from json"
  end

  private

  def get_host(ip)
    Resolv.getname(ip)
  rescue Resolv::ResolvError
    ip
  end

  def name_from_metadata(metadata)
    name = metadata.dig(:labels, :'app.kubernetes.io/name')
    name = metadata.dig(:labels, :'k8s-app') if name.nil?
    name = metadata.dig(:name) if name.nil? # pod-id but better than nil?
    name
  end

  def service_info_from_json(json_data)
    # all we need is the port - .spec.ports[0].nodePort
    ports = json_data.dig(:spec, :ports)
    {
      native:
        {
          port: ports[0].dig(:nodePort)
        }
    }
  rescue
    empty_native
  end

  def secret_info_from_json(json_data)
    raw = json_data.dig(:data, :password)
    {
      native:
        {
          password: Base64.decode64(raw)
        }
    }
  rescue
    empty_native
  end

  def empty_native
    {
      native: {}
    }
  end

  def dispatch_time(json_data)
    status = pod_status_from_json(json_data)
    return nil if status == 'undetermined'

    state_data = json_data.dig(:status, :containerStatuses)[0].dig(:state)
    date_string = nil

    if status == 'completed'
      date_string = state_data.dig(:terminated, :startedAt)
    elsif status == 'running'
      date_string = state_data.dig(:running, :startedAt)
    end

    date_string.nil? ? nil : DateTime.parse(date_string).to_time.to_i
  end

  def wallclock_time(json_data)
    status = pod_status_from_json(json_data)
    return nil if status == 'undetermined'

    state_data = json_data.dig(:status, :containerStatuses)[0].dig(:state)
    start_time = dispatch_time(json_data)
    return nil if start_time.nil?

    et = end_time(status, state_data)

    et.nil? ? nil : et - start_time
  end

  def end_time(status, state_data)
    if status == 'completed'
      end_time_string = state_data.dig(:terminated, :finishedAt)
      et = DateTime.parse(end_time_string).to_time.to_i
    elsif status == 'running'
      et = DateTime.now.to_time.to_i
    else
      et = nil
    end

    et
  end

  def submission_time(json_data)
    status = json_data.dig(:status)
    start = status.dig(:startTime)

    if start.nil?
      # the pod is in some pending state limbo
      conditions = status.dig(:conditions)
      # best guess to start time is just the first condition's
      # transition time
      str = conditions[0].dig(:lastTransitionTime)
    else
      str = start
    end

    DateTime.parse(str).to_time.to_i
  end

  def pod_status_from_json(json_data)
    state = 'undetermined'
    status = json_data.dig(:status)
    container_statuses = status.dig(:containerStatuses)

    if container_statuses.nil?
      # if you're here, it means you're pending, probably unschedulable
      return OodCore::Job::Status.new(state: state)
    end

    # only support 1 container/pod
    json_state = container_statuses[0].dig(:state)
    state = 'running' unless json_state.dig(:running).nil?
    state = terminated_state(json_state) unless json_state.dig(:terminated).nil?
    state = 'queued' unless json_state.dig(:waiting).nil?

    OodCore::Job::Status.new(state: state)
  end

  def terminated_state(status)
    reason = status.dig(:terminated, :reason)
    if reason == 'Error'
      'suspended'
    else
      'completed'
    end
  end

  def procs_from_json(json_data)
    containers = json_data.dig(:spec, :containers)
    resources = containers[0].dig(:resources)

    cpu = resources.dig(:limits, :cpu)
    millicores_rex = /(\d+)m/

    # ok to return string bc nil.to_i == 0 and we'd rather return
    # nil (undefined) than 0 which is confusing.
    if millicores_rex.match?(cpu)
      millicores = millicores_rex.match(cpu)[1].to_i

      # have to return at least 1 bc 200m could be 0
      ((millicores + 1000) / 1000).to_s
    else
      cpu
    end
  end
end
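A few of these helpers in isolation, to make the conversions concrete:

```ruby
helper = OodCore::Job::Adapters::Kubernetes::Helper.new

# Strings are split with Shellwords; arrays pass through untouched.
helper.parse_command("bundle exec rails s -p 8080")
#=> ["bundle", "exec", "rails", "s", "-p", "8080"]
helper.parse_command(["ls", "-l"])
#=> ["ls", "-l"]

# Companion resource names are derived from the pod id.
helper.service_name("rstudio-1b2c3d4e")    #=> "rstudio-1b2c3d4e-service"
helper.secret_name("rstudio-1b2c3d4e")     #=> "rstudio-1b2c3d4e-secret"
helper.configmap_name("rstudio-1b2c3d4e")  #=> "rstudio-1b2c3d4e-configmap"
```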
data/lib/ood_core/job/adapters/kubernetes/resources.rb
ADDED

module OodCore::Job::Adapters::Kubernetes::Resources

  class ConfigMap
    attr_accessor :name, :filename, :data

    def initialize(name, filename, data)
      @name = name
      @filename = filename
      @data = data
    end
  end

  class Container
    attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
                  :restart_policy

    def initialize(
      name, image, command: [], port: nil, env: [], memory: "4Gi", cpu: "1",
      working_dir: "", restart_policy: "Never"
    )
      raise ArgumentError, "containers need valid names and images" unless name && image

      @name = name
      @image = image
      @command = command.nil? ? [] : command
      @port = port&.to_i
      @env = env.nil? ? [] : env
      @memory = memory.nil? ? "4Gi" : memory
      @cpu = cpu.nil? ? "1" : cpu
      @working_dir = working_dir.nil? ? "" : working_dir
      @restart_policy = restart_policy.nil? ? "Never" : restart_policy
    end

    def ==(other)
      name == other.name &&
        image == other.image &&
        command == other.command &&
        port == other.port &&
        env == other.env &&
        memory == other.memory &&
        cpu == other.cpu &&
        working_dir == other.working_dir &&
        restart_policy == other.restart_policy
    end
  end

  class PodSpec
    attr_accessor :container, :init_containers
    def initialize(container, init_containers: nil)
      @container = container
      @init_containers = init_containers
    end
  end

end
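The Container defaults and value-style equality in use, as a small illustrative sketch:

```ruby
Resources = OodCore::Job::Adapters::Kubernetes::Resources

a = Resources::Container.new("app", "ruby:2.7")
a.memory  #=> "4Gi"  (default)
a.cpu     #=> "1"    (default)

# == compares field by field, so two separately built containers are equal.
a == Resources::Container.new("app", "ruby:2.7")  #=> true

# Name and image are mandatory.
Resources::Container.new(nil, "ruby:2.7")
#=> raises ArgumentError: containers need valid names and images
```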
data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
ADDED

apiVersion: v1
kind: Pod
metadata:
  namespace: <%= namespace %>
  name: <%= id %>
  labels:
    job: <%= id %>
    app.kubernetes.io/name: <%= container.name %>
    app.kubernetes.io/managed-by: open-ondemand
spec:
  restartPolicy: <%= spec.container.restart_policy %>
  securityContext:
    runAsUser: <%= run_as_user %>
    runAsGroup: <%= run_as_group %>
    fsGroup: <%= fs_group %>
  containers:
  - name: "<%= spec.container.name %>"
    image: <%= spec.container.image %>
    imagePullPolicy: IfNotPresent
    <% unless spec.container.working_dir.empty? %>
    workingDir: "<%= spec.container.working_dir %>"
    <% end %>
    <% unless spec.container.env.empty? %>
    env:
    <% spec.container.env.each do |env| %>
    - name: <%= env[:name] %>
      value: "<%= env[:value] %>"
    <% end %> <%# for each env %>
    <% end %> <%# unless env is nil %>
    <% unless spec.container.command.empty? %>
    command:
    <% spec.container.command.each do |cmd| %>
    - "<%= cmd %>"
    <% end %> <%# for each command %>
    <% end %> <%# unless command is nil %>
    <% unless spec.container.port.nil? %>
    ports:
    - containerPort: <%= spec.container.port %>
    <% end %>
    volumeMounts:
    <% unless configmap.nil? %>
    - name: configmap-volume
      mountPath: <%= configmap_mount_path %>
    <% end %>
    <% all_mounts.each do |mount| %>
    - name: <%= mount[:name] %>
      mountPath: <%= mount[:destination_path] %>
    <% end %> <%# for each mount %>
    resources:
      limits:
        memory: "<%= spec.container.memory %>"
        cpu: "<%= spec.container.cpu %>"
      requests:
        memory: "<%= spec.container.memory %>"
        cpu: "<%= spec.container.cpu %>"
  <% unless spec.init_containers.nil? %>
  initContainers:
  <% spec.init_containers.each do |ctr| %>
  - name: "<%= ctr.name %>"
    image: "<%= ctr.image %>"
    command:
    <% ctr.command.each do |cmd| %>
    - "<%= cmd %>"
    <% end %> <%# command loop %>
    volumeMounts:
    <% unless configmap.nil? %>
    - name: configmap-volume
      mountPath: <%= configmap_mount_path %>
    <% end %>
    <% all_mounts.each do |mount| %>
    - name: <%= mount[:name] %>
      mountPath: <%= mount[:destination_path] %>
    <% end %> <%# for each mount %>
  <% end %> <%# init container loop %>
  <% end %> <%# if init containers %>
  <% unless configmap.nil? && all_mounts.empty? %>
  volumes:
  <% end %> <%# configmap.nil? && all_mounts.empty? %>
  <% unless configmap.nil? %>
  - name: configmap-volume
    configMap:
      name: <%= configmap_name(id) %>
  <% end %>
  <% all_mounts.each do |mount| %>
  <% if mount[:type] == 'nfs' %>
  - name: <%= mount[:name] %>
    nfs:
      server: <%= mount[:host] %>
      path: <%= mount[:path] %>
  <% elsif mount[:type] == 'host' %>
  - name: <%= mount[:name] %>
    hostPath:
      path: <%= mount[:path] %>
      type: <%= mount[:host_type] %>
  <% end %> <%# if mount is [host,nfs] %>
  <% end %> <%# for each mount %>
---
<% unless spec.container.port.nil? %>
apiVersion: v1
kind: Service
metadata:
  name: <%= service_name(id) %>
  namespace: <%= namespace %>
spec:
  selector:
    job: <%= id %>
  ports:
  - protocol: TCP
    port: 80
    targetPort: <%= spec.container.port %>
  type: NodePort
<% end %> <%# end for service %>
---
<% unless configmap.nil? %>
apiVersion: v1
kind: ConfigMap
metadata:
  name: <%= configmap_name(id) %>
  namespace: <%= namespace %>
data:
  <%= configmap.filename %>: |
    <% config_data_lines(configmap.data).each do |line| %><%= line %><% end %>
<% end %> <%# end for configmap %>
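The bare identifiers in this template (namespace, id, spec, configmap, all_mounts, run_as_user, and so on) are not passed in explicitly; Batch#generate_id_yml renders the template against its own binding, so they resolve to the locals and private methods of the Batch instance. A condensed, self-contained sketch of that mechanism with stand-in names:

```ruby
require "erb"

# ERB#result evaluates the template text against the caller's binding, so
# bare names resolve to whatever methods/locals are in scope -- this is how
# pod.yml.erb sees Batch's `id`, `namespace`, `spec`, etc.
class RenderExample
  def id;        "example-1b2c3d4e" end
  def namespace; "someuser"         end

  def render
    template = "metadata:\n  namespace: <%= namespace %>\n  name: <%= id %>\n"
    ERB.new(template).result(binding)
  end
end

puts RenderExample.new.render
# metadata:
#   namespace: someuser
#   name: example-1b2c3d4e
```

Batch then pipes the rendered multi-document YAML to `kubectl create -f -` via call(..., stdin: resource_yml).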
data/lib/ood_core/job/adapters/linux_host/launcher.rb
CHANGED

@@ -166,7 +166,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
       'email_on_terminated' => script_email_on_event(script, 'terminated'),
       'email_on_start' => script_email_on_event(script, 'started'),
       'environment' => export_env(script),
-      'error_path' => (script.error_path) ? script.error_path.to_s : '/dev/null',
+      'error_path' => error_path(script),
       'job_name' => script.job_name.to_s,
       'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
       'script_content' => content,
@@ -176,6 +176,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
       'singularity_image' => singularity_image(script.native),
       'ssh_hosts' => ssh_hosts,
       'tmux_bin' => tmux_bin,
+      'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
     }.each{
       |key, value| bnd.local_variable_set(key, value)
     }
@@ -272,4 +273,11 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
     return false if script.content.empty?
     script.content.split("\n").first.start_with?('#!/')
   end
+
+  def error_path(script)
+    return script.error_path.to_s if script.error_path
+    return script.output_path.to_s if script.output_path
+
+    '/dev/null'
+  end
 end
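The new error_path helper keeps the old /dev/null default but first falls back to the job's output path, mirroring schedulers that merge the streams when no separate error file is given. Illustrative, assuming a LinuxHost::Launcher instance named launcher:

```ruby
script = OodCore::Job::Script.new(content: "echo hi",
                                  output_path: "/tmp/job.log")  # no error_path
launcher.send(:error_path, script)  #=> "/tmp/job.log"

script = OodCore::Job::Script.new(content: "echo hi")           # neither path set
launcher.send(:error_path, script)  #=> "/dev/null"
```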
data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh
CHANGED

@@ -16,13 +16,9 @@ fi
 echo $hostname
 
 # Put the script into a temp file on localhost
-
-
-
-<% else %>
-singularity_tmp_file=$(mktemp)
-tmux_tmp_file=$(mktemp)
-<% end %>
+singularity_tmp_file=$(mktemp -p "<%= workdir %>" --suffix '_sing')
+tmux_tmp_file=$(mktemp -p "<%= workdir %>" --suffix "_tmux")
+
 
 # Create an executable to run in a tmux session
 # The escaped HEREDOC means that we need to substitute in $singularity_tmp_file ourselves
@@ -69,10 +65,3 @@ SINGULARITY_LAUNCHER
 chmod +x "$singularity_tmp_file"
 chmod +x "$tmux_tmp_file"
 <%= tmux_bin %> new-session -d -s "<%= session_name %>" "$tmux_tmp_file"
-
-# Remove the file
-<% if ! debug %>
-# Wait 1 second to ensure that tmux session has started before the file is removed
-sleep 1
-rm -f "$tmux_tmp_file"; rm -f "$singularity_tmp_file"
-<% end %>
data/lib/ood_core/job/adapters/slurm.rb
CHANGED

@@ -80,6 +80,9 @@ module OodCore
       # from
       class Error < StandardError; end
 
+      # An error indicating the slurm command timed out
+      class SlurmTimeoutError < Error; end
+
       # @param cluster [#to_s, nil] the cluster name
       # @param conf [#to_s, nil] path to the slurm conf
       # @param bin [#to_s] path to slurm installation binaries
@@ -147,6 +150,9 @@ module OodCore
             end
             jobs
           end
+        rescue SlurmTimeoutError
+          # TODO: could use a log entry here
+          return [{ id: id, state: 'undetermined' }]
         end
 
         def squeue_fields(attrs)
@@ -303,7 +309,18 @@ module OodCore
 
         cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
         o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
-        s.success? ? o : raise(Error, e)
+        s.success? ? interpret_and_raise(o, e) : raise(Error, e)
+      end
+
+      # Helper function to raise an error based on the contents of stderr.
+      # Slurm exits 0 even when the command fails, so we need to interpret stderr
+      # to see if the command was actually successful.
+      def interpret_and_raise(stdout, stderr)
+        return stdout if stderr.empty?
+
+        raise SlurmTimeoutError, stderr if /^slurm_load_jobs error: Socket timed out/.match(stderr)
+
+        stdout
       end
 
       def squeue_attrs_for_info_attrs(attrs)
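The key observation behind interpret_and_raise is that squeue can fail (for example, when slurmctld stops responding) while still exiting 0, so success has to be judged from stderr. Checking the pattern in isolation, using the timeout text the regex above matches:

```ruby
stderr = "slurm_load_jobs error: Socket timed out on send/recv operation\n"

# The ^ anchor matches at the start of a line of stderr output.
/^slurm_load_jobs error: Socket timed out/.match?(stderr)  #=> true

# get_jobs rescues the resulting SlurmTimeoutError and returns a placeholder
# record, [{ id: id, state: 'undetermined' }], instead of raising.
```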
data/lib/ood_core/version.rb
CHANGED

-  VERSION = "0.13.0"
+  VERSION = "0.14.0"
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ood_core
 version: !ruby/object:Gem::Version
-  version: 0.13.0
+  version: 0.14.0
 platform: ruby
 authors:
 - Eric Franz
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-
+date: 2020-10-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ood_support
@@ -166,6 +166,11 @@ files:
 - lib/ood_core/job/adapters/ccq.rb
 - lib/ood_core/job/adapters/drmaa.rb
 - lib/ood_core/job/adapters/helper.rb
+- lib/ood_core/job/adapters/kubernetes.rb
+- lib/ood_core/job/adapters/kubernetes/batch.rb
+- lib/ood_core/job/adapters/kubernetes/helper.rb
+- lib/ood_core/job/adapters/kubernetes/resources.rb
+- lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
 - lib/ood_core/job/adapters/linux_host.rb
 - lib/ood_core/job/adapters/linux_host/launcher.rb
 - lib/ood_core/job/adapters/linux_host/templates/email.erb.sh
@@ -216,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.0.
+rubygems_version: 3.0.8
 signing_key:
 specification_version: 4
 summary: Open OnDemand core library