ood_core 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -1
- data/README.md +1 -1
- data/lib/ood_core/job/adapters/kubernetes.rb +193 -0
- data/lib/ood_core/job/adapters/kubernetes/batch.rb +350 -0
- data/lib/ood_core/job/adapters/kubernetes/helper.rb +298 -0
- data/lib/ood_core/job/adapters/kubernetes/resources.rb +56 -0
- data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb +123 -0
- data/lib/ood_core/job/adapters/linux_host/launcher.rb +9 -1
- data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh +3 -14
- data/lib/ood_core/job/adapters/slurm.rb +18 -1
- data/lib/ood_core/version.rb +1 -1
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 52ba764b085dedb7eaeb06d95751f1804a50488e1859f980a7836d2d9032b95d
+  data.tar.gz: c2dc5edf395fe158960f33b80c554f3dc745f15e7ec1337b738683a0e1bbdc7f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 59915bae23a008a923c249d222e50548a7bee3438144068a29ae1cafdd489ca1229ee1a14f4f81e3fd065381f46f920bef24344fe633c7c578cb1f6a4f9a2a77
+  data.tar.gz: 8d2ca42c7f49158c8d321c21b79aff1c636df3c77bb7e71107db70371a34058d79c8a5ec32ca93883e7d3bcc7dc2202375144d23613f167ab089318d6270248c
data/CHANGELOG.md
CHANGED
@@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+## [0.14.0] - 2020-10-01
+### Added
+- Kubernetes adapter in PR [156](https://github.com/OSC/ood_core/pull/156)
+
+### Fixed
+- Catch Slurm times. [209](https://github.com/OSC/ood_core/pull/209)
+- LHA race condition in deleteing tmp files. [212](https://github.com/OSC/ood_core/pull/212)
+
 ## [0.13.0] - 2020-08-10
 ### Added
 - CloudyCluster CCQ Adapter
@@ -247,7 +255,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 - Initial release!
 
-[Unreleased]: https://github.com/OSC/ood_core/compare/v0.
+[Unreleased]: https://github.com/OSC/ood_core/compare/v0.14.0...HEAD
+[0.14.0]: https://github.com/OSC/ood_core/compare/v0.13.0...v0.14.0
 [0.13.0]: https://github.com/OSC/ood_core/compare/v0.12.0...v0.13.0
 [0.12.0]: https://github.com/OSC/ood_core/compare/v0.11.4...v0.12.0
 [0.11.4]: https://github.com/OSC/ood_core/compare/v0.11.3...v0.11.4
data/README.md
CHANGED
@@ -6,7 +6,7 @@
 
 - Website: http://openondemand.org/
 - Website repo with JOSS publication: https://github.com/OSC/Open-OnDemand
-- Documentation: https://osc.github.io/ood-documentation/
+- Documentation: https://osc.github.io/ood-documentation/latest/
 - Main code repo: https://github.com/OSC/ondemand
 - Core library repo: https://github.com/OSC/ood_core
 
data/lib/ood_core/job/adapters/kubernetes.rb
ADDED
@@ -0,0 +1,193 @@
+require "ood_core/refinements/hash_extensions"
+require "ood_core/refinements/array_extensions"
+
+module OodCore
+  module Job
+    class Factory
+      using Refinements::HashExtensions
+
+      def self.build_kubernetes(config)
+        batch = Adapters::Kubernetes::Batch.new(config.to_h.symbolize_keys, Adapters::Kubernetes::Helper.new)
+        Adapters::Kubernetes.new(batch)
+      end
+    end
+
+    module Adapters
+      class Kubernetes < Adapter
+
+        using Refinements::ArrayExtensions
+        using Refinements::HashExtensions
+
+        require "ood_core/job/adapters/kubernetes/batch"
+
+        attr_reader :batch
+
+        def initialize(batch)
+          @batch = batch
+        end
+
+        # Submit a job with the attributes defined in the job template instance
+        # @abstract Subclass is expected to implement {#submit}
+        # @raise [NotImplementedError] if subclass did not define {#submit}
+        # @example Submit job template to cluster
+        #   solver_id = job_adapter.submit(solver_script)
+        #   #=> "1234.server"
+        # @example Submit job that depends on previous job
+        #   post_id = job_adapter.submit(
+        #     post_script,
+        #     afterok: solver_id
+        #   )
+        #   #=> "1235.server"
+        # @param script [Script] script object that describes the
+        #   script and attributes for the submitted job
+        # @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
+        #   at any point after dependent jobs have started execution
+        # @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution only after dependent jobs have terminated with no errors
+        # @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution only after dependent jobs have terminated with errors
+        # @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
+        #   execution after dependent jobs have terminated
+        # @return [String] the job id returned after successfully submitting a job
+        def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+          raise ArgumentError, 'Must specify the script' if script.nil?
+
+          batch.submit(script)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+
+        # Retrieve info for all jobs from the resource manager
+        # @abstract Subclass is expected to implement {#info_all}
+        # @raise [NotImplementedError] if subclass did not define {#info_all}
+        # @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
+        #   This array specifies only attrs you want, in addition to id and status.
+        #   If an array, the Info object that is returned to you is not guarenteed
+        #   to have a value for any attr besides the ones specified and id and status.
+        #
+        #   For certain adapters this may speed up the response since
+        #   adapters can get by without populating the entire Info object
+        # @return [Array<Info>] information describing submitted jobs
+        def info_all(attrs: nil)
+          batch.info_all(attrs: attrs)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve info for all jobs for a given owner or owners from the
+        # resource manager
+        # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+        # @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
+        #   This array specifies only attrs you want, in addition to id and status.
+        #   If an array, the Info object that is returned to you is not guarenteed
+        #   to have a value for any attr besides the ones specified and id and status.
+        #
+        #   For certain adapters this may speed up the response since
+        #   adapters can get by without populating the entire Info object
+        # @return [Array<Info>] information describing submitted jobs
+        def info_where_owner(owner, attrs: nil)
+          owner = Array.wrap(owner).map(&:to_s)
+
+          # must at least have job_owner to filter by job_owner
+          attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?
+
+          info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
+        end
+
+        # Iterate over each job Info object
+        # @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
+        #   This array specifies only attrs you want, in addition to id and status.
+        #   If an array, the Info object that is returned to you is not guarenteed
+        #   to have a value for any attr besides the ones specified and id and status.
+        #
+        #   For certain adapters this may speed up the response since
+        #   adapters can get by without populating the entire Info object
+        # @yield [Info] of each job to block
+        # @return [Enumerator] if no block given
+        def info_all_each(attrs: nil)
+          return to_enum(:info_all_each, attrs: attrs) unless block_given?
+
+          info_all(attrs: attrs).each do |job|
+            yield job
+          end
+        end
+
+        # Iterate over each job Info object
+        # @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
+        # @param attrs [Array<symbol>] defaults to nil (and all attrs are provided)
+        #   This array specifies only attrs you want, in addition to id and status.
+        #   If an array, the Info object that is returned to you is not guarenteed
+        #   to have a value for any attr besides the ones specified and id and status.
+        #
+        #   For certain adapters this may speed up the response since
+        #   adapters can get by without populating the entire Info object
+        # @yield [Info] of each job to block
+        # @return [Enumerator] if no block given
+        def info_where_owner_each(owner, attrs: nil)
+          return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
+
+          info_where_owner(owner, attrs: attrs).each do |job|
+            yield job
+          end
+        end
+
+        # Whether the adapter supports job arrays
+        # @return [Boolean] - assumes true; but can be overridden by adapters that
+        #   explicitly do not
+        def supports_job_arrays?
+          false
+        end
+
+        # Retrieve job info from the resource manager
+        # @abstract Subclass is expected to implement {#info}
+        # @raise [NotImplementedError] if subclass did not define {#info}
+        # @param id [#to_s] the id of the job
+        # @return [Info] information describing submitted job
+        def info(id)
+          batch.info(id.to_s)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+
+        # Retrieve job status from resource manager
+        # @note Optimized slightly over retrieving complete job information from server
+        # @abstract Subclass is expected to implement {#status}
+        # @raise [NotImplementedError] if subclass did not define {#status}
+        # @param id [#to_s] the id of the job
+        # @return [Status] status of job
+        def status(id)
+          info(id).status
+        end
+
+        # Put the submitted job on hold
+        # @abstract Subclass is expected to implement {#hold}
+        # @raise [NotImplementedError] if subclass did not define {#hold}
+        # @param id [#to_s] the id of the job
+        # @return [void]
+        def hold(id)
+          raise NotImplementedError, 'subclass did not define #hold'
+        end
+
+        # Release the job that is on hold
+        # @abstract Subclass is expected to implement {#release}
+        # @raise [NotImplementedError] if subclass did not define {#release}
+        # @param id [#to_s] the id of the job
+        # @return [void]
+        def release(id)
+          raise NotImplementedError, 'subclass did not define #release'
+        end
+
+        # Delete the submitted job.
+        #
+        # @param id [#to_s] the id of the job
+        # @return [void]
+        def delete(id)
+          batch.delete(id.to_s)
+        rescue Batch::Error => e
+          raise JobAdapterError, e.message
+        end
+      end
+    end
+  end
+end
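The adapter above is reached through `Factory.build_kubernetes`, the same way the other adapters are built from a cluster configuration. A rough usage sketch follows; the configuration values are made-up examples rather than defaults shipped by the gem (see `Batch#initialize` in the next file for the real defaults):

    require "ood_core"

    # Hypothetical configuration, for illustration only.
    config = {
      config_file: "#{Dir.home}/.kube/config",
      cluster_name: "open-ondemand",
      bin: "/usr/bin/kubectl"
    }

    adapter = OodCore::Job::Factory.build_kubernetes(config)

    # The adapter then exposes the usual OodCore::Job::Adapter surface:
    #   id = adapter.submit(script)   # renders pod.yml.erb and pipes it to `kubectl create -f -`
    #   adapter.info(id)              # pod + service + secret data merged into an Info object
    #   adapter.delete(id)            # removes the pod plus its service, secret and configmap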
data/lib/ood_core/job/adapters/kubernetes/batch.rb
ADDED
@@ -0,0 +1,350 @@
+require "ood_core/refinements/hash_extensions"
+require "json"
+
+class OodCore::Job::Adapters::Kubernetes::Batch
+
+  require "ood_core/job/adapters/kubernetes/helper"
+
+  Helper = OodCore::Job::Adapters::Kubernetes::Helper
+  Resources = OodCore::Job::Adapters::Kubernetes::Resources
+
+  using OodCore::Refinements::HashExtensions
+
+  class Error < StandardError; end
+
+  attr_reader :config_file, :bin, :cluster_name, :mounts
+  attr_reader :all_namespaces, :using_context, :helper
+  attr_reader :username_prefix
+
+  def initialize(options = {}, helper = Helper.new)
+    options = options.to_h.symbolize_keys
+
+    @config_file = options.fetch(:config_file, default_config_file)
+    @bin = options.fetch(:bin, '/usr/bin/kubectl')
+    @cluster_name = options.fetch(:cluster_name, 'open-ondemand')
+    @mounts = options.fetch(:mounts, []).map { |m| m.to_h.symbolize_keys }
+    @all_namespaces = options.fetch(:all_namespaces, false)
+    @username_prefix = options.fetch(:username_prefix, nil)
+
+    @using_context = false
+    @helper = helper
+
+    begin
+      make_kubectl_config(options)
+    rescue
+      # FIXME could use a log here
+      # means you couldn't 'kubectl set config'
+    end
+  end
+
+  def resource_file(resource_type = 'pod')
+    File.dirname(__FILE__) + "/templates/#{resource_type}.yml.erb"
+  end
+
+  def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+    raise ArgumentError, 'Must specify the script' if script.nil?
+
+    resource_yml, id = generate_id_yml(script.native)
+    call("#{formatted_ns_cmd} create -f -", stdin: resource_yml)
+
+    id
+  end
+
+  def generate_id(name)
+    # 2_821_109_907_456 = 36**8
+    name.downcase.tr(' ', '-') + '-' + rand(2_821_109_907_456).to_s(36)
+  end
+
+  def info_all(attrs: nil)
+    cmd = if all_namespaces
+            "#{base_cmd} get pods -o json --all-namespaces"
+          else
+            "#{namespaced_cmd} get pods -o json"
+          end
+
+    output = call(cmd)
+    all_pods_to_info(output)
+  end
+
+  def info_where_owner(owner, attrs: nil)
+    owner = Array.wrap(owner).map(&:to_s)
+
+    # must at least have job_owner to filter by job_owner
+    attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?
+
+    info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
+  end
+
+  def info_all_each(attrs: nil)
+    return to_enum(:info_all_each, attrs: attrs) unless block_given?
+
+    info_all(attrs: attrs).each do |job|
+      yield job
+    end
+  end
+
+  def info_where_owner_each(owner, attrs: nil)
+    return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
+
+    info_where_owner(owner, attrs: attrs).each do |job|
+      yield job
+    end
+  end
+
+  def info(id)
+    pod_json = call_json_output('get', 'pod', id)
+
+    begin
+      service_json = call_json_output('get', 'service', service_name(id))
+      secret_json = call_json_output('get', 'secret', secret_name(id))
+    rescue
+      # it's ok if these don't exist
+      service_json ||= nil
+      secret_json ||= nil
+    end
+
+    helper.info_from_json(pod_json: pod_json, service_json: service_json, secret_json: secret_json)
+  end
+
+  def status(id)
+    info(id).status
+  end
+
+  def delete(id)
+    call("#{namespaced_cmd} delete pod #{id}")
+
+    begin
+      call("#{namespaced_cmd} delete service #{service_name(id)}")
+      call("#{namespaced_cmd} delete secret #{secret_name(id)}")
+      call("#{namespaced_cmd} delete configmap #{configmap_name(id)}")
+    rescue
+      # FIXME: retries? delete if exists?
+      # just eat the results of deleting services and secrets
+    end
+  end
+
+  def configmap_mount_path
+    '/ood'
+  end
+
+  private
+
+  # helper to help format multi-line yaml data from the submit.yml into
+  # mutli-line yaml in the pod.yml.erb
+  def config_data_lines(data)
+    output = []
+    first = true
+
+    data.to_s.each_line do |line|
+      output.append(first ? line : line.prepend(" "))
+      first = false
+    end
+
+    output
+  end
+
+  def username
+    @username ||= Etc.getlogin
+  end
+
+  def k8s_username
+    username_prefix.nil? ? username : "#{username_prefix}-#{username}"
+  end
+
+  def run_as_user
+    Etc.getpwnam(username).uid
+  end
+
+  def run_as_group
+    Etc.getpwnam(username).gid
+  end
+
+  def fs_group
+    run_as_group
+  end
+
+  # helper to template resource yml you're going to submit and
+  # create an id.
+  def generate_id_yml(native_data)
+    container = helper.container_from_native(native_data[:container])
+    id = generate_id(container.name)
+    configmap = helper.configmap_from_native(native_data, id)
+    init_containers = helper.init_ctrs_from_native(native_data[:init_containers])
+    spec = Resources::PodSpec.new(container, init_containers: init_containers)
+    all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]
+
+    template = ERB.new(File.read(resource_file))
+
+    [template.result(binding), id]
+  end
+
+  # helper to call kubectl and get json data back.
+  # verb, resrouce and id are the kubernetes parlance terms.
+  # example: 'kubectl get pod my-pod-id' is verb=get, resource=pod
+  # and id=my-pod-id
+  def call_json_output(verb, resource, id, stdin: nil)
+    cmd = "#{formatted_ns_cmd} #{verb} #{resource} #{id}"
+    data = call(cmd, stdin: stdin)
+    data = data.empty? ? '{}' : data
+    json_data = JSON.parse(data, symbolize_names: true)
+
+    json_data
+  end
+
+  def service_name(id)
+    helper.service_name(id)
+  end
+
+  def secret_name(id)
+    helper.secret_name(id)
+  end
+
+  def configmap_name(id)
+    helper.configmap_name(id)
+  end
+
+  def namespace
+    default_namespace
+  end
+
+  def default_namespace
+    username
+  end
+
+  def context
+    cluster_name
+  end
+
+  def default_config_file
+    (ENV['KUBECONFIG'] || "#{Dir.home}/.kube/config")
+  end
+
+  def default_auth
+    {
+      type: 'managaged'
+    }.symbolize_keys
+  end
+
+  def default_server
+    {
+      endpoint: 'https://localhost:8080',
+      cert_authority_file: nil
+    }.symbolize_keys
+  end
+
+  def formatted_ns_cmd
+    "#{namespaced_cmd} -o json"
+  end
+
+  def namespaced_cmd
+    "#{base_cmd} --namespace=#{namespace}"
+  end
+
+  def base_cmd
+    base = "#{bin} --kubeconfig=#{config_file}"
+    base << " --context=#{context}" if using_context
+    base
+  end
+
+  def all_pods_to_info(data)
+    json_data = JSON.parse(data, symbolize_names: true)
+    pods = json_data.dig(:items)
+
+    info_array = []
+    pods.each do |pod|
+      info = pod_info_from_json(pod)
+      info_array.push(info) unless info.nil?
+    end
+
+    info_array
+  rescue JSON::ParserError
+    # 'no resources in <namespace>' throws parse error
+    []
+  end
+
+  def pod_info_from_json(pod)
+    hash = helper.pod_info_from_json(pod)
+    OodCore::Job::Info.new(hash)
+  rescue Helper::K8sDataError
+    # FIXME: silently eating error, could probably use a logger
+    nil
+  end
+
+  def make_kubectl_config(config)
+    set_cluster(config.fetch(:server, default_server).to_h.symbolize_keys)
+    configure_auth(config.fetch(:auth, default_auth).to_h.symbolize_keys)
+  end
+
+  def configure_auth(auth)
+    type = auth.fetch(:type)
+    return if managed?(type)
+
+    case type
+    when 'gke'
+      set_gke_config(auth)
+    when 'oidc'
+      set_context
+    end
+  end
+
+  def use_context
+    @using_context = true
+  end
+
+  def managed?(type)
+    if type.nil?
+      true # maybe should be false?
+    else
+      type.to_s == 'managed'
+    end
+  end
+
+  def set_gke_config(auth)
+    cred_file = auth.fetch(:svc_acct_file)
+
+    cmd = "gcloud auth activate-service-account --key-file=#{cred_file}"
+    call(cmd)
+
+    set_gke_credentials(auth)
+  end
+
+  def set_gke_credentials(auth)
+
+    zone = auth.fetch(:zone, nil)
+    region = auth.fetch(:region, nil)
+
+    locale = ''
+    locale = "--zone=#{zone}" unless zone.nil?
+    locale = "--region=#{region}" unless region.nil?
+
+    # gke cluster name can probably can differ from what ood calls the cluster
+    cmd = "gcloud container clusters get-credentials #{locale} #{cluster_name}"
+    env = { 'KUBECONFIG' => config_file }
+    call(cmd, env)
+  end
+
+  def set_context
+    cmd = "#{base_cmd} config set-context #{cluster_name}"
+    cmd << " --cluster=#{cluster_name} --namespace=#{namespace}"
+    cmd << " --user=#{k8s_username}"
+
+    call(cmd)
+    use_context
+  end
+
+  def set_cluster(config)
+    server = config.fetch(:endpoint)
+    cert = config.fetch(:cert_authority_file, nil)
+
+    cmd = "#{base_cmd} config set-cluster #{cluster_name}"
+    cmd << " --server=#{server}"
+    cmd << " --certificate-authority=#{cert}" unless cert.nil?
+
+    call(cmd)
+  end
+
+  def call(cmd = '', env: {}, stdin: nil)
+    o, error, s = Open3.capture3(env, cmd, stdin_data: stdin.to_s)
+    s.success? ? o : raise(Error, error)
+  end
+end
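For reference, `Batch#initialize` and `make_kubectl_config` together read the options shown in a sketch below. Every value is a placeholder chosen for illustration (the hosts, certificate and paths do not exist), and when keys are omitted the defaults defined above apply; only an `auth.type` of 'gke' or 'oidc' triggers extra kubectl/gcloud configuration.

    # Illustrative only - see the defaults in Batch#initialize.
    options = {
      config_file: "/etc/ood/kubernetes.config",   # kubeconfig handed to kubectl via --kubeconfig
      bin: "/usr/local/bin/kubectl",
      cluster_name: "ood-k8s",
      all_namespaces: false,                       # true switches info_all to --all-namespaces
      username_prefix: "dev",                      # kubectl user becomes "dev-<login>"
      mounts: [
        { type: "nfs", name: "home", host: "nfs.example.com", path: "/users", destination_path: "/home" }
      ],
      server: { endpoint: "https://k8s.example.com:6443", cert_authority_file: "/etc/ood/k8s-ca.crt" },
      auth: { type: "oidc" }
    }

    batch = OodCore::Job::Adapters::Kubernetes::Batch.new(options)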
data/lib/ood_core/job/adapters/kubernetes/helper.rb
ADDED
@@ -0,0 +1,298 @@
+class OodCore::Job::Adapters::Kubernetes::Helper
+
+  require 'ood_core/job/adapters/kubernetes/resources'
+  require 'resolv'
+  require 'base64'
+
+  class K8sDataError < StandardError; end
+
+  Resources = OodCore::Job::Adapters::Kubernetes::Resources
+
+  # Extract info from json data. The data is expected to be from the kubectl
+  # command and conform to kubernetes' datatype structures.
+  #
+  # Returns { native: {host: localhost, port:80, password: sshhh }} in the info
+  # object field in lieu of writing a connection.yml
+  #
+  # @param pod_json [#to_h]
+  #   the pod data returned from 'kubectl get pod abc-123'
+  # @param service_json [#to_h]
+  #   the service data returned from 'kubectl get service abc-123-service'
+  # @param secret_json [#to_h]
+  #   the secret data returned from 'kubectl get secret abc-123-secret'
+  # @return [OodCore::Job::Info]
+  def info_from_json(pod_json: nil, service_json: nil, secret_json: nil)
+    pod_hash = pod_info_from_json(pod_json)
+    service_hash = service_info_from_json(service_json)
+    secret_hash = secret_info_from_json(secret_json)
+
+    # can't just use deep_merge bc we don't depend *directly* on rails
+    pod_hash[:native] = pod_hash[:native].merge(service_hash[:native])
+    pod_hash[:native] = pod_hash[:native].merge(secret_hash[:native])
+    OodCore::Job::Info.new(pod_hash)
+  rescue NoMethodError
+    raise K8sDataError, "unable to read data correctly from json"
+  end
+
+  # Turn a container hash into a Kubernetes::Resources::Container
+  #
+  # @param container [#to_h]
+  #   the input container hash
+  # @return [OodCore::Job::Adapters::Kubernetes::Resources::Container]
+  def container_from_native(container)
+    Resources::Container.new(
+      container[:name],
+      container[:image],
+      command: parse_command(container[:command]),
+      port: container[:port],
+      env: container.fetch(:env, []),
+      memory: container[:memory],
+      cpu: container[:cpu],
+      working_dir: container[:working_dir],
+      restart_policy: container[:restart_policy]
+    )
+  end
+
+  # Parse a command string given from a user and return an array.
+  # If given an array, the input is simply returned back.
+  #
+  # @param cmd [#to_s]
+  #   the command to parse
+  # @return [Array<#to_s>]
+  #   the command parsed into an array of arguements
+  def parse_command(cmd)
+    if cmd&.is_a?(Array)
+      cmd
+    else
+      Shellwords.split(cmd.to_s)
+    end
+  end
+
+  # Turn a configmap hash into a Kubernetes::Resources::ConfigMap
+  # that can be used in templates. Needs an id so that the resulting
+  # configmap has a known name.
+  #
+  # @param native [#to_h]
+  #   the input configmap hash
+  # @param id [#to_s]
+  #   the id to use for giving the configmap a name
+  # @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
+  def configmap_from_native(native, id)
+    configmap = native.fetch(:configmap, nil)
+    return nil if configmap.nil?
+
+    Resources::ConfigMap.new(
+      configmap_name(id),
+      configmap[:filename],
+      configmap[:data]
+    )
+  end
+
+  # parse initialization containers from native data
+  #
+  # @param native_data [#to_h]
+  #   the native data to parse. Expected key init_ctrs and for that
+  #   key to be an array of hashes.
+  # @return [Array<OodCore::Job::Adapters::Kubernetes::Resources::Container>]
+  #   the array of init containers
+  def init_ctrs_from_native(ctrs)
+    init_ctrs = []
+
+    ctrs&.each do |ctr_raw|
+      ctr = container_from_native(ctr_raw)
+      init_ctrs.push(ctr)
+    end
+
+    init_ctrs
+  end
+
+  def service_name(id)
+    id + '-service'
+  end
+
+  def secret_name(id)
+    id + '-secret'
+  end
+
+  def configmap_name(id)
+    id + '-configmap'
+  end
+
+  # Extract pod info from json data. The data is expected to be from the kubectl
+  # command and conform to kubernetes' datatype structures.
+  #
+  # @param json_data [#to_h]
+  #   the pod data returned from 'kubectl get pod abc-123'
+  # @return [#to_h]
+  #   the hash of info expected from adapters
+  def pod_info_from_json(json_data)
+    {
+      id: json_data.dig(:metadata, :name).to_s,
+      job_name: name_from_metadata(json_data.dig(:metadata)),
+      status: pod_status_from_json(json_data),
+      job_owner: json_data.dig(:metadata, :namespace).to_s,
+      submission_time: submission_time(json_data),
+      dispatch_time: dispatch_time(json_data),
+      wallclock_time: wallclock_time(json_data),
+      native: {
+        host: get_host(json_data.dig(:status, :hostIP))
+      },
+      procs: procs_from_json(json_data)
+    }
+  rescue NoMethodError
+    # gotta raise an error because Info.new will throw an error if id is undefined
+    raise K8sDataError, "unable to read data correctly from json"
+  end
+
+  private
+
+  def get_host(ip)
+    Resolv.getname(ip)
+  rescue Resolv::ResolvError
+    ip
+  end
+
+  def name_from_metadata(metadata)
+    name = metadata.dig(:labels, :'app.kubernetes.io/name')
+    name = metadata.dig(:labels, :'k8s-app') if name.nil?
+    name = metadata.dig(:name) if name.nil? # pod-id but better than nil?
+    name
+  end
+
+  def service_info_from_json(json_data)
+    # all we need is the port - .spec.ports[0].nodePort
+    ports = json_data.dig(:spec, :ports)
+    {
+      native:
+        {
+          port: ports[0].dig(:nodePort)
+        }
+    }
+  rescue
+    empty_native
+  end
+
+  def secret_info_from_json(json_data)
+    raw = json_data.dig(:data, :password)
+    {
+      native:
+        {
+          password: Base64.decode64(raw)
+        }
+    }
+  rescue
+    empty_native
+  end
+
+  def empty_native
+    {
+      native: {}
+    }
+  end
+
+  def dispatch_time(json_data)
+    status = pod_status_from_json(json_data)
+    return nil if status == 'undetermined'
+
+    state_data = json_data.dig(:status, :containerStatuses)[0].dig(:state)
+    date_string = nil
+
+    if status == 'completed'
+      date_string = state_data.dig(:terminated, :startedAt)
+    elsif status == 'running'
+      date_string = state_data.dig(:running, :startedAt)
+    end
+
+    date_string.nil? ? nil : DateTime.parse(date_string).to_time.to_i
+  end
+
+  def wallclock_time(json_data)
+    status = pod_status_from_json(json_data)
+    return nil if status == 'undetermined'
+
+    state_data = json_data.dig(:status, :containerStatuses)[0].dig(:state)
+    start_time = dispatch_time(json_data)
+    return nil if start_time.nil?
+
+    et = end_time(status, state_data)
+
+    et.nil? ? nil : et - start_time
+  end
+
+  def end_time(status, state_data)
+    if status == 'completed'
+      end_time_string = state_data.dig(:terminated, :finishedAt)
+      et = DateTime.parse(end_time_string).to_time.to_i
+    elsif status == 'running'
+      et = DateTime.now.to_time.to_i
+    else
+      et = nil
+    end
+
+    et
+  end
+
+  def submission_time(json_data)
+    status = json_data.dig(:status)
+    start = status.dig(:startTime)
+
+    if start.nil?
+      # the pod is in some pending state limbo
+      conditions = status.dig(:conditions)
+      # best guess to start time is just the first condition's
+      # transition time
+      str = conditions[0].dig(:lastTransitionTime)
+    else
+      str = start
+    end
+
+    DateTime.parse(str).to_time.to_i
+  end
+
+  def pod_status_from_json(json_data)
+    state = 'undetermined'
+    status = json_data.dig(:status)
+    container_statuses = status.dig(:containerStatuses)
+
+    if container_statuses.nil?
+      # if you're here, it means you're pending, probably unschedulable
+      return OodCore::Job::Status.new(state: state)
+    end
+
+    # only support 1 container/pod
+    json_state = container_statuses[0].dig(:state)
+    state = 'running' unless json_state.dig(:running).nil?
+    state = terminated_state(json_state) unless json_state.dig(:terminated).nil?
+    state = 'queued' unless json_state.dig(:waiting).nil?
+
+    OodCore::Job::Status.new(state: state)
+  end
+
+  def terminated_state(status)
+    reason = status.dig(:terminated, :reason)
+    if reason == 'Error'
+      'suspended'
+    else
+      'completed'
+    end
+  end
+
+  def procs_from_json(json_data)
+    containers = json_data.dig(:spec, :containers)
+    resources = containers[0].dig(:resources)
+
+    cpu = resources.dig(:limits, :cpu)
+    millicores_rex = /(\d+)m/
+
+    # ok to return string bc nil.to_i == 0 and we'd rather return
+    # nil (undefined) than 0 which is confusing.
+    if millicores_rex.match?(cpu)
+      millicores = millicores_rex.match(cpu)[1].to_i
+
+      # have to return at least 1 bc 200m could be 0
+      ((millicores + 1000) / 1000).to_s
+    else
+      cpu
+    end
+  end
+end
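A short illustration of the Helper's command parsing and naming conventions (the pod id below is invented for the example):

    helper = OodCore::Job::Adapters::Kubernetes::Helper.new

    helper.parse_command("rails s -p 3000")   #=> ["rails", "s", "-p", "3000"] via Shellwords
    helper.parse_command(["ruby", "app.rb"])  #=> ["ruby", "app.rb"] (arrays pass through)

    helper.service_name("my-app-3sdf0h2a")    #=> "my-app-3sdf0h2a-service"
    helper.secret_name("my-app-3sdf0h2a")     #=> "my-app-3sdf0h2a-secret"
    helper.configmap_name("my-app-3sdf0h2a")  #=> "my-app-3sdf0h2a-configmap"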
data/lib/ood_core/job/adapters/kubernetes/resources.rb
ADDED
@@ -0,0 +1,56 @@
+module OodCore::Job::Adapters::Kubernetes::Resources
+
+  class ConfigMap
+    attr_accessor :name, :filename, :data
+
+    def initialize(name, filename, data)
+      @name = name
+      @filename = filename
+      @data = data
+    end
+  end
+
+  class Container
+    attr_accessor :name, :image, :command, :port, :env, :memory, :cpu, :working_dir,
+                  :restart_policy
+
+    def initialize(
+      name, image, command: [], port: nil, env: [], memory: "4Gi", cpu: "1",
+      working_dir: "", restart_policy: "Never"
+    )
+      raise ArgumentError, "containers need valid names and images" unless name && image
+
+      @name = name
+      @image = image
+      @command = command.nil? ? [] : command
+      @port = port&.to_i
+      @env = env.nil? ? [] : env
+      @memory = memory.nil? ? "4Gi" : memory
+      @cpu = cpu.nil? ? "1" : cpu
+      @working_dir = working_dir.nil? ? "" : working_dir
+      @restart_policy = restart_policy.nil? ? "Never" : restart_policy
+    end
+
+    def ==(other)
+      name == other.name &&
+        image == other.image &&
+        command == other.command &&
+        port == other.port &&
+        env == other.env &&
+        memory == other.memory &&
+        cpu == other.cpu &&
+        working_dir == other.working_dir &&
+        restart_policy == other.restart_policy
+    end
+
+  end
+
+  class PodSpec
+    attr_accessor :container, :init_containers
+    def initialize(container, init_containers: nil)
+      @container = container
+      @init_containers = init_containers
+    end
+  end
+
+end
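As a quick reference for the defaults above, a Container built with only a name and an image (both invented here) falls back to the documented values:

    ctr = OodCore::Job::Adapters::Kubernetes::Resources::Container.new("rails-app", "ruby:2.7")
    ctr.memory          #=> "4Gi"
    ctr.cpu             #=> "1"
    ctr.restart_policy  #=> "Never"
    ctr.command         #=> []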
data/lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
ADDED
@@ -0,0 +1,123 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  namespace: <%= namespace %>
+  name: <%= id %>
+  labels:
+    job: <%= id %>
+    app.kubernetes.io/name: <%= container.name %>
+    app.kubernetes.io/managed-by: open-ondemand
+spec:
+  restartPolicy: <%= spec.container.restart_policy %>
+  securityContext:
+    runAsUser: <%= run_as_user %>
+    runAsGroup: <%= run_as_group %>
+    fsGroup: <%= fs_group %>
+  containers:
+  - name: "<%= spec.container.name %>"
+    image: <%= spec.container.image %>
+    imagePullPolicy: IfNotPresent
+    <% unless spec.container.working_dir.empty? %>
+    workingDir: "<%= spec.container.working_dir %>"
+    <% end %>
+    <% unless spec.container.env.empty? %>
+    env:
+    <% spec.container.env.each do |env| %>
+    - name: <%= env[:name] %>
+      value: "<%= env[:value] %>"
+    <% end %> <%# for each env %>
+    <% end %> <%# unless env is nil %>
+    <% unless spec.container.command.empty? %>
+    command:
+    <% spec.container.command.each do |cmd| %>
+    - "<%= cmd %>"
+    <% end %> <%# for each command %>
+    <% end %> <%# unless command is nil %>
+    <% unless spec.container.port.nil? %>
+    ports:
+    - containerPort: <%= spec.container.port %>
+    <% end %>
+    volumeMounts:
+    <% unless configmap.nil? %>
+    - name: configmap-volume
+      mountPath: <%= configmap_mount_path %>
+    <% end %>
+    <% all_mounts.each do |mount| %>
+    - name: <%= mount[:name] %>
+      mountPath: <%= mount[:destination_path] %>
+    <% end %> <%# for each mount %>
+    resources:
+      limits:
+        memory: "<%= spec.container.memory %>"
+        cpu: "<%= spec.container.cpu %>"
+      requests:
+        memory: "<%= spec.container.memory %>"
+        cpu: "<%= spec.container.cpu %>"
+  <% unless spec.init_containers.nil? %>
+  initContainers:
+  <% spec.init_containers.each do |ctr| %>
+  - name: "<%= ctr.name %>"
+    image: "<%= ctr.image %>"
+    command:
+    <% ctr.command.each do |cmd| %>
+    - "<%= cmd %>"
+    <% end %> <%# command loop %>
+    volumeMounts:
+    <% unless configmap.nil? %>
+    - name: configmap-volume
+      mountPath: <%= configmap_mount_path %>
+    <% end %>
+    <% all_mounts.each do |mount| %>
+    - name: <%= mount[:name] %>
+      mountPath: <%= mount[:destination_path] %>
+    <% end %> <%# for each mount %>
+  <% end %> <%# init container loop %>
+  <% end %> <%# if init containers %>
+  <% unless configmap.nil? || all_mounts.empty? %>
+  volumes:
+  <% end %> <%# configmap.nil? || all_mounts.empty? %>
+  <% unless configmap.nil? %>
+  - name: configmap-volume
+    configMap:
+      name: <%= configmap_name(id) %>
+  <% end %>
+  <% all_mounts.each do |mount| %>
+  <% if mount[:type] == 'nfs' %>
+  - name: <%= mount[:name] %>
+    nfs:
+      server: <%= mount[:host] %>
+      path: <%= mount[:path] %>
+  <% elsif mount[:type] == 'host' %>
+  - name: <%= mount[:name] %>
+    hostPath:
+      path: <%= mount[:path] %>
+      type: <%= mount[:host_type] %>
+  <% end %> <%# if mount is [host,nfs] %>
+  <% end %> <%# for each mount %>
+---
+<% unless spec.container.port.nil? %>
+apiVersion: v1
+kind: Service
+metadata:
+  name: <%= service_name(id) %>
+  namespace: <%= namespace %>
+spec:
+  selector:
+    job: <%= id %>
+  ports:
+  - protocol: TCP
+    port: 80
+    targetPort: <%= spec.container.port %>
+  type: NodePort
+<% end %> <%# end for service %>
+---
+<% unless configmap.nil? %>
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: <%= configmap_name(id) %>
+  namespace: <%= namespace %>
+data:
+  <%= configmap.filename %>: |
+    <% config_data_lines(configmap.data).each do |line| %><%= line %><% end %>
+<% end %> <%# end for configmap %>
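The template above is rendered by `Batch#generate_id_yml` from the job's `script.native` hash via the Helper methods. A hedged example of the shape that hash can take; the image, command, configmap and mount values are invented for illustration:

    script = OodCore::Job::Script.new(
      content: "",
      native: {
        container: {
          name: "jupyter",
          image: "jupyter/minimal-notebook:latest",   # hypothetical image
          command: "start-notebook.sh",
          port: 8888,
          env: [{ name: "NB_UID", value: "1000" }],
          memory: "8Gi",
          cpu: "2"
        },
        init_containers: [
          { name: "init-data", image: "busybox:latest", command: "cp -r /seed /ood" }
        ],
        configmap: { filename: "config.py", data: "c.NotebookApp.base_url = '/node'" },
        mounts: [
          { type: "host", name: "scratch", path: "/fs/scratch", host_type: "Directory", destination_path: "/scratch" }
        ]
      }
    )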
data/lib/ood_core/job/adapters/linux_host/launcher.rb
CHANGED
@@ -166,7 +166,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
         'email_on_terminated' => script_email_on_event(script, 'terminated'),
         'email_on_start' => script_email_on_event(script, 'started'),
         'environment' => export_env(script),
-        'error_path' => (script
+        'error_path' => error_path(script),
         'job_name' => script.job_name.to_s,
         'output_path' => (script.output_path) ? script.output_path.to_s : '/dev/null',
         'script_content' => content,
@@ -176,6 +176,7 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
         'singularity_image' => singularity_image(script.native),
         'ssh_hosts' => ssh_hosts,
         'tmux_bin' => tmux_bin,
+        'workdir' => (script.workdir) ? script.workdir.to_s : '/tmp',
       }.each{
         |key, value| bnd.local_variable_set(key, value)
       }
@@ -272,4 +273,11 @@ class OodCore::Job::Adapters::LinuxHost::Launcher
    return false if script.content.empty?
    script.content.split("\n").first.start_with?('#!/')
  end
+
+  def error_path(script)
+    return script.error_path.to_s if script.error_path
+    return script.output_path.to_s if script.output_path
+
+    '/dev/null'
+  end
 end
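The `workdir` and `error_path` additions change what the wrapper template receives: stderr now falls back from `script.error_path` to `script.output_path` to `/dev/null`, and `workdir` falls back to `/tmp`, which pairs with the script_wrapper change below that keeps the temp files in that directory. A tiny sketch of the fallback (paths invented):

    script = OodCore::Job::Script.new(content: "echo hi", output_path: "/home/user/job.log")
    # With no error_path set, the rendered template receives
    #   error_path => "/home/user/job.log"
    # and with neither path set it receives "/dev/null".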
data/lib/ood_core/job/adapters/linux_host/templates/script_wrapper.erb.sh
CHANGED
@@ -16,13 +16,9 @@ fi
 echo $hostname
 
 # Put the script into a temp file on localhost
-
-
-
-<% else %>
-singularity_tmp_file=$(mktemp)
-tmux_tmp_file=$(mktemp)
-<% end %>
+singularity_tmp_file=$(mktemp -p "<%= workdir %>" --suffix '_sing')
+tmux_tmp_file=$(mktemp -p "<%= workdir %>" --suffix "_tmux")
+
 
 # Create an executable to run in a tmux session
 # The escaped HEREDOC means that we need to substitute in $singularity_tmp_file ourselves
@@ -69,10 +65,3 @@ SINGULARITY_LAUNCHER
 chmod +x "$singularity_tmp_file"
 chmod +x "$tmux_tmp_file"
 <%= tmux_bin %> new-session -d -s "<%= session_name %>" "$tmux_tmp_file"
-
-# Remove the file
-<% if ! debug %>
-# Wait 1 second to ensure that tmux session has started before the file is removed
-sleep 1
-rm -f "$tmux_tmp_file"; rm -f "$singularity_tmp_file"
-<% end %>
data/lib/ood_core/job/adapters/slurm.rb
CHANGED
@@ -80,6 +80,9 @@ module OodCore
       # from
       class Error < StandardError; end
 
+      # An error indicating the slurm command timed out
+      class SlurmTimeoutError < Error; end
+
       # @param cluster [#to_s, nil] the cluster name
       # @param conf [#to_s, nil] path to the slurm conf
       # @param bin [#to_s] path to slurm installation binaries
@@ -147,6 +150,9 @@ module OodCore
          end
          jobs
        end
+      rescue SlurmTimeoutError
+        # TODO: could use a log entry here
+        return [{ id: id, state: 'undetermined' }]
      end
 
      def squeue_fields(attrs)
@@ -303,7 +309,18 @@ module OodCore
 
        cmd, args = OodCore::Job::Adapters::Helper.ssh_wrap(submit_host, cmd, args, strict_host_checking)
        o, e, s = Open3.capture3(env, cmd, *(args.map(&:to_s)), stdin_data: stdin.to_s)
-       s.success? ? o : raise(Error, e)
+       s.success? ? interpret_and_raise(o, e) : raise(Error, e)
+     end
+
+     # Helper function to raise an error based on the contents of stderr.
+     # Slurm exits 0 even when the command fails, so we need to interpret stderr
+     # to see if the command was actually successful.
+     def interpret_and_raise(stdout, stderr)
+       return stdout if stderr.empty?
+
+       raise SlurmTimeoutError, stderr if /^slurm_load_jobs error: Socket timed out/.match(stderr)
+
+       stdout
      end
 
      def squeue_attrs_for_info_attrs(attrs)
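The effect of the new guard, sketched for the timeout case (the stderr text is the kind of message the regex is meant to catch, not captured output):

    # Inside Batch#call a zero exit status no longer implies success:
    #   interpret_and_raise(stdout, "slurm_load_jobs error: Socket timed out ...")
    #   #=> raises SlurmTimeoutError
    # get_jobs rescues SlurmTimeoutError and reports the job as
    #   [{ id: id, state: 'undetermined' }]
    # instead of bubbling the error up to the caller.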
data/lib/ood_core/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ood_core
 version: !ruby/object:Gem::Version
-  version: 0.13.0
+  version: 0.14.0
 platform: ruby
 authors:
 - Eric Franz
@@ -10,7 +10,7 @@ authors:
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-
+date: 2020-10-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ood_support
@@ -166,6 +166,11 @@ files:
 - lib/ood_core/job/adapters/ccq.rb
 - lib/ood_core/job/adapters/drmaa.rb
 - lib/ood_core/job/adapters/helper.rb
+- lib/ood_core/job/adapters/kubernetes.rb
+- lib/ood_core/job/adapters/kubernetes/batch.rb
+- lib/ood_core/job/adapters/kubernetes/helper.rb
+- lib/ood_core/job/adapters/kubernetes/resources.rb
+- lib/ood_core/job/adapters/kubernetes/templates/pod.yml.erb
 - lib/ood_core/job/adapters/linux_host.rb
 - lib/ood_core/job/adapters/linux_host/launcher.rb
 - lib/ood_core/job/adapters/linux_host/templates/email.erb.sh
@@ -216,7 +221,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.0.
+rubygems_version: 3.0.8
 signing_key:
 specification_version: 4
 summary: Open OnDemand core library