ood_core 0.11.3 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,354 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+ require "json"
3
+
4
+ class OodCore::Job::Adapters::Kubernetes::Batch
5
+
6
+ require_relative "helper"
7
+ require_relative "k8s_job_info"
8
+
9
+ using OodCore::Refinements::HashExtensions
10
+
11
# Raised by interpret_and_raise for any failed command whose stderr is not a
# kubectl "NotFound" message.
class Error < StandardError
end

# Raised by interpret_and_raise when stderr contains a line starting with
# "Error from server (NotFound):".
# Note: this is a sibling of Error, not a subclass, so `rescue Error` does
# not catch it.
class NotFoundError < StandardError
end
13
+
14
+ attr_reader :config_file, :bin, :cluster, :mounts
15
+ attr_reader :all_namespaces, :using_context, :helper
16
+ attr_reader :username_prefix, :namespace_prefix
17
+
18
# Build the kubectl wrapper from adapter options and make a best-effort
# attempt to write the cluster/auth settings into the kubectl config.
#
# @param options [#to_h] configuration; the keys read here are :config_file,
#   :bin, :cluster, :mounts, :all_namespaces, :username_prefix and
#   :namespace_prefix. The whole hash is also handed to make_kubectl_config,
#   which reads :server and :auth.
def initialize(options = {})
  settings = options.to_h.symbolize_keys

  @config_file      = settings.fetch(:config_file, default_config_file)
  @bin              = settings.fetch(:bin, '/usr/bin/kubectl')
  @cluster          = settings.fetch(:cluster, 'open-ondemand')
  @mounts           = settings.fetch(:mounts, []).map { |mount| mount.to_h.symbolize_keys }
  @all_namespaces   = settings.fetch(:all_namespaces, false)
  @username_prefix  = settings.fetch(:username_prefix, nil)
  @namespace_prefix = settings.fetch(:namespace_prefix, '')

  # flipped to true by use_context once a context has been written
  @using_context = false
  @helper        = Helper.new

  begin
    make_kubectl_config(settings)
  rescue StandardError
    # FIXME could use a log here
    # Reaching this point means configuring kubectl (e.g. 'kubectl config
    # set-cluster') failed. The failure is intentionally ignored so that the
    # object can still be constructed.
  end
end
39
+
40
# Path of the ERB template for a Kubernetes resource.
#
# @param resource_type [#to_s] template basename; defaults to 'pod'
# @return [String] "<directory of this file>/templates/<resource_type>.yml.erb"
def resource_file(resource_type = 'pod')
  here = File.dirname(__FILE__)
  "#{here}/templates/#{resource_type}.yml.erb"
end
43
+
44
# Render the resource yml for +script+ and pipe it to 'kubectl create -f -'.
#
# The after/afterok/afternotok/afterany keywords are accepted but not used by
# this method.
#
# @param script [#native] the script whose native data describes the pod
# @return [String] the generated id
# @raise [ArgumentError] if script is nil
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  raise ArgumentError, 'Must specify the script' if script.nil?

  manifest, pod_id = generate_id_yml(script)
  create_cmd = "#{formatted_ns_cmd} create -f -"
  call(create_cmd, stdin: manifest)

  pod_id
end
52
+
53
# Derive an id from a name: the name is downcased, spaces become dashes, and
# a random base-36 suffix is appended after a dash.
#
# @param name [String] the name to base the id on
# @return [String] e.g. "my-app-" followed by the random suffix
def generate_id(name)
  slug = name.downcase.tr(' ', '-')
  # 2_821_109_907_456 = 36**8, so the suffix has at most 8 base-36 digits
  suffix = rand(2_821_109_907_456).to_s(36)
  "#{slug}-#{suffix}"
end
57
+
58
# List pods as info objects. When all_namespaces is set the listing spans
# every namespace; otherwise only the namespace from namespaced_cmd.
#
# @param attrs accepted but not used by this method
# @return [Array] whatever all_pods_to_info builds from the kubectl output
def info_all(attrs: nil)
  if all_namespaces
    return all_pods_to_info(call("#{base_cmd} get pods -o json --all-namespaces"))
  end

  all_pods_to_info(call("#{namespaced_cmd} get pods -o json"))
end
68
+
69
# Info objects for the pods owned by the given owner(s).
#
# Uses Kernel#Array rather than ActiveSupport's Array.wrap so the method does
# not depend on ActiveSupport having been loaded elsewhere. For the inputs
# handled here (nil, a single string/symbol, or an array of them) the two
# behave the same.
#
# @param owner [#to_s, Array<#to_s>] owner name(s) to keep
# @param attrs [Symbol, Array<Symbol>, nil] requested attributes; when given,
#   :job_owner is always added because the filter needs it
# @return [Array] the entries of info_all whose job_owner is in +owner+
def info_where_owner(owner, attrs: nil)
  owners = Array(owner).map(&:to_s)

  # must at least have job_owner to filter by job_owner
  attrs = Array(attrs) | [:job_owner] unless attrs.nil?

  info_all(attrs: attrs).select { |info| owners.include?(info.job_owner) }
end
77
+
78
# Yield every entry of info_all to the block, or return an Enumerator over
# this method when no block is given.
#
# @param attrs forwarded to info_all
# @yield [job] each info object
def info_all_each(attrs: nil)
  unless block_given?
    return to_enum(:info_all_each, attrs: attrs)
  end

  info_all(attrs: attrs).each { |job| yield job }
end
85
+
86
# Yield every entry of info_where_owner to the block, or return an
# Enumerator over this method when no block is given.
#
# @param owner forwarded to info_where_owner
# @param attrs forwarded to info_where_owner
# @yield [job] each matching info object
def info_where_owner_each(owner, attrs: nil)
  unless block_given?
    return to_enum(:info_where_owner_each, owner, attrs: attrs)
  end

  info_where_owner(owner, attrs: attrs).each { |job| yield job }
end
93
+
94
# Look up a single job by pod id.
#
# The pod, its service and its secret are each fetched with safe_call, so a
# resource kubectl reports as NotFound comes back as an empty hash. An empty
# pod is reported as a completed job.
#
# namespace_prefix is passed on as ns_prefix so the helper can strip it from
# the pod's namespace when deriving job_owner. Without it the owner would
# keep the prefix.
#
# @param id [#to_s] the pod id
# @return [OodCore::Job::Info] a bare 'completed' info when the pod is gone,
#   otherwise whatever helper.info_from_json builds
def info(id)
  pod_json = safe_call('get', 'pod', id)
  return OodCore::Job::Info.new({ id: id, status: 'completed' }) if pod_json.empty?

  service_json = safe_call('get', 'service', service_name(id))
  secret_json = safe_call('get', 'secret', secret_name(id))

  helper.info_from_json(
    pod_json: pod_json,
    service_json: service_json,
    secret_json: secret_json,
    ns_prefix: namespace_prefix
  )
end
103
+
104
# Status of a single job; shorthand for info(id).status.
#
# @param id [#to_s] the pod id
# @return the status reported by the info object
def status(id)
  job_info = info(id)
  job_info.status
end
107
+
108
# Delete the pod and then its companion service, secret and configmap, in
# that order. Each deletion goes through safe_call, so a resource that
# kubectl reports as NotFound is skipped silently.
#
# @param id [String] the pod id
# @return the result of the last (configmap) deletion
def delete(id)
  safe_call('delete', 'pod', id)

  dependents = {
    'service' => :service_name,
    'secret' => :secret_name,
    'configmap' => :configmap_name
  }
  dependents.map { |resource, namer| safe_call('delete', resource, send(namer, id)) }.last
end
114
+
115
# The fixed path '/ood'.
# NOTE(review): by its name this is where the configmap is mounted in the
# container; presumably read by the pod template - confirm there.
#
# @return [String]
def configmap_mount_path
  '/ood'
end
118
+
119
+ private
120
+
121
# Run a 'get' or 'delete' against one resource, treating "not found" as an
# empty result instead of an error.
#
# @param verb [#to_s] 'get' or 'delete'; any other verb does nothing
# @param resource [#to_s] the resource kind, e.g. 'pod'
# @param id [#to_s] the resource name
# @return [Hash, String, nil] parsed json for 'get', the command output for
#   'delete', {} when NotFoundError was raised, nil for any other verb
def safe_call(verb, resource, id)
  action = verb.to_s
  return call_json_output('get', resource, id) if action == 'get'
  return call("#{namespaced_cmd} delete #{resource} #{id}") if action == 'delete'

  nil
rescue NotFoundError
  {}
end
133
+
134
+ # helper to help format multi-line yaml data from the submit.yml into
135
+ # multi-line yaml in the pod.yml.erb
136
def config_data_lines(data)
  # The first line is returned untouched; every following line gets the
  # indent string prepended. Line terminators are kept as each_line yields
  # them.
  # NOTE(review): the indent is reproduced exactly as it appears in this
  # listing (one space); confirm it matches what pod.yml.erb expects.
  data.to_s.each_line.with_index.map do |line, index|
    index.zero? ? line : " #{line}"
  end
end
147
+
148
# The login name reported by Etc.getlogin, memoized in @username once a
# truthy value has been obtained.
#
# @return [String, nil]
def username
  @username = Etc.getlogin unless @username
  @username
end
151
+
152
# The user name passed to kubectl: the plain username, or
# "<username_prefix>-<username>" when a prefix is configured.
#
# @return [String]
def k8s_username
  return username if username_prefix.nil?

  "#{username_prefix}-#{username}"
end
155
+
156
# Numeric uid of +username+ according to the passwd database.
#
# @return [Integer]
def run_as_user
  passwd_entry = Etc.getpwnam(username)
  passwd_entry.uid
end

# Numeric primary gid of +username+ according to the passwd database.
#
# @return [Integer]
def run_as_group
  passwd_entry = Etc.getpwnam(username)
  passwd_entry.gid
end

# Same value as run_as_group.
#
# @return [Integer]
def fs_group
  run_as_group
end
167
+
168
+ # helper to template resource yml you're going to submit and
169
+ # create an id.
170
def generate_id_yml(script)
  # The locals below are exposed to the template through `binding`. The ones
  # not otherwise used in this method (configmap, spec, all_mounts, ...) are
  # presumably read there, so do not rename them without checking the
  # template.
  native_data = script.native
  container = helper.container_from_native(native_data[:container])
  id = generate_id(container.name)
  configmap = helper.configmap_from_native(native_data, id)
  init_containers = helper.init_ctrs_from_native(native_data[:init_containers])
  spec = Kubernetes::Resources::PodSpec.new(container, init_containers: init_containers)
  # adapter-level mounts first, then any mounts given in the native data
  all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]

  source = File.read(resource_file)
  # Passing safe_level/trim_mode positionally to ERB.new is deprecated in
  # favour of keyword arguments. Use the keyword when the installed ERB
  # supports it and fall back to the positional form otherwise.
  template =
    if ERB.instance_method(:initialize).parameters.include?(%i[key trim_mode])
      ERB.new(source, trim_mode: '-')
    else
      ERB.new(source, nil, '-')
    end

  [template.result(binding), id]
end
183
+
184
+ # helper to call kubectl and get json data back.
185
+ # verb, resource and id are the kubernetes parlance terms.
186
+ # example: 'kubectl get pod my-pod-id' is verb=get, resource=pod
187
+ # and id=my-pod-id
188
def call_json_output(verb, resource, id, stdin: nil)
  raw = call("#{formatted_ns_cmd} #{verb} #{resource} #{id}", stdin: stdin)
  # an empty response is parsed as an empty json object
  raw = '{}' if raw.empty?

  JSON.parse(raw, symbolize_names: true)
end
196
+
197
# Name of the service that belongs to pod +id+ (delegates to the helper).
def service_name(id)
  helper.service_name(id)
end

# Name of the secret that belongs to pod +id+ (delegates to the helper).
def secret_name(id)
  helper.secret_name(id)
end

# Name of the configmap that belongs to pod +id+ (delegates to the helper).
def configmap_name(id)
  helper.configmap_name(id)
end
208
+
209
# The namespace commands run in: namespace_prefix immediately followed by
# username.
#
# @return [String]
def namespace
  [namespace_prefix, username].join
end

# The kubectl context name; this is simply the cluster name.
def context
  cluster
end
216
+
217
# The kubeconfig path used when none is configured: $KUBECONFIG when that
# variable is set, otherwise ~/.kube/config.
#
# @return [String]
def default_config_file
  ENV.fetch('KUBECONFIG') { "#{Dir.home}/.kube/config" }
end
220
+
221
# Auth settings used when the configuration has no :auth entry.
#
# The type is 'managed', the value managed? tests for. (It was previously
# misspelled 'managaged', which managed? does not recognise.)
#
# @return [Hash] { type: 'managed' }
def default_auth
  {
    type: 'managed'
  }.symbolize_keys
end
226
+
227
# Server settings used when the configuration has no :server entry.
#
# @return [Hash] endpoint 'https://localhost:8080' and no certificate
#   authority file
def default_server
  server = { endpoint: 'https://localhost:8080', cert_authority_file: nil }
  server.symbolize_keys
end
233
+
234
# namespaced_cmd plus the flag that asks kubectl for json output.
def formatted_ns_cmd
  [namespaced_cmd, '-o json'].join(' ')
end

# base_cmd restricted to this user's namespace.
def namespaced_cmd
  [base_cmd, "--namespace=#{namespace}"].join(' ')
end

# The kubectl binary with its kubeconfig, plus the context once
# using_context has been switched on.
#
# @return [String]
def base_cmd
  parts = ["#{bin} --kubeconfig=#{config_file}"]
  parts << "--context=#{context}" if using_context
  parts.join(' ')
end
247
+
248
# Convert the output of 'kubectl get pods -o json' into info objects.
#
# Pods for which pod_info_from_json returns nil are left out.
#
# @param data [String] raw kubectl output
# @return [Array] one info per readable pod; [] when the output is not json
def all_pods_to_info(data)
  parsed = JSON.parse(data, symbolize_names: true)

  parsed.dig(:items).map { |pod| pod_info_from_json(pod) }.compact
rescue JSON::ParserError
  # 'no resources in <namespace>' throws parse error
  []
end
263
+
264
# Build a K8sJobInfo from one pod's json.
#
# namespace_prefix is passed on as ns_prefix so the helper can strip it from
# the pod's namespace when deriving job_owner. Without it the owner would
# keep the prefix.
#
# @param pod [Hash] one entry of the 'items' array from 'kubectl get pods'
# @return [K8sJobInfo, nil] nil when the helper could not read the pod data
def pod_info_from_json(pod)
  hash = helper.pod_info_from_json(pod, ns_prefix: namespace_prefix)
  K8sJobInfo.new(hash)
rescue Helper::K8sDataError
  # FIXME: silently eating error, could probably use a logger
  nil
end
271
+
272
# Apply the :server and :auth parts of the configuration to kubectl, falling
# back to default_server / default_auth when a part is absent.
#
# @param config [Hash] the symbolized adapter options
# @return whatever configure_auth returns
def make_kubectl_config(config)
  server_settings = config.fetch(:server, default_server).to_h.symbolize_keys
  set_cluster(server_settings)

  auth_settings = config.fetch(:auth, default_auth).to_h.symbolize_keys
  configure_auth(auth_settings)
end
276
+
277
# Configure kubectl authentication according to auth[:type].
#
# 'managed' (see managed?) means nothing has to be done. 'gke' activates the
# service account and fetches credentials; 'oidc' writes and enables a
# context. Any other type is ignored.
#
# The type is compared as a string, consistent with managed?, so a symbol
# such as :oidc is handled the same as 'oidc'.
#
# @param auth [Hash] symbolized auth settings
# @raise [KeyError] if auth has no :type key
def configure_auth(auth)
  type = auth.fetch(:type)
  return if managed?(type)

  case type.to_s
  when 'gke'
    set_gke_config(auth)
  when 'oidc'
    set_context
  end
end
288
+
289
# Switch on @using_context; from then on base_cmd adds the --context flag.
#
# @return [true]
def use_context
  @using_context = true
end
292
+
293
# Whether the auth type means "nothing to configure".
#
# @param type [#to_s, nil] the configured auth type
# @return [Boolean] true for nil and for 'managed' (compared as a string)
def managed?(type)
  # nil counts as managed - maybe should be false?
  type.nil? || type.to_s == 'managed'
end
300
+
301
# Activate the GKE service account with gcloud, then fetch the cluster
# credentials.
#
# @param auth [Hash] must contain :svc_acct_file
# @raise [KeyError] if :svc_acct_file is missing
def set_gke_config(auth)
  key_file = auth.fetch(:svc_acct_file)
  call("gcloud auth activate-service-account --key-file=#{key_file}")

  set_gke_credentials(auth)
end
309
+
310
# Fetch GKE cluster credentials with gcloud, with KUBECONFIG set to
# config_file in the command's environment.
#
# The environment is passed with the `env:` keyword that #call declares.
# Passing the hash positionally raises ArgumentError because #call takes a
# single positional argument.
#
# @param auth [Hash] may contain :zone and/or :region; when both are given
#   the region is used
# @return [String] the command output
def set_gke_credentials(auth)
  zone = auth.fetch(:zone, nil)
  region = auth.fetch(:region, nil)

  # a later assignment overrides an earlier one, so region wins over zone
  locale = ''
  locale = "--zone=#{zone}" unless zone.nil?
  locale = "--region=#{region}" unless region.nil?

  # the gke cluster name can probably differ from what ood calls the cluster
  cmd = "gcloud container clusters get-credentials #{locale} #{cluster}"
  call(cmd, env: { 'KUBECONFIG' => config_file })
end
324
+
325
# Write a kubectl context named after the cluster (binding cluster,
# namespace and user together) and start using it.
#
# @return [true] the result of use_context
def set_context
  args = [
    "#{base_cmd} config set-context #{cluster}",
    "--cluster=#{cluster} --namespace=#{namespace}",
    "--user=#{k8s_username}"
  ]

  call(args.join(' '))
  use_context
end
333
+
334
# Register the cluster's endpoint (and certificate authority, when given) in
# the kubectl config.
#
# @param config [Hash] must contain :endpoint; may contain
#   :cert_authority_file
# @raise [KeyError] if :endpoint is missing
def set_cluster(config)
  server = config.fetch(:endpoint)
  cert = config.fetch(:cert_authority_file, nil)

  args = ["#{base_cmd} config set-cluster #{cluster}", "--server=#{server}"]
  args << "--certificate-authority=#{cert}" unless cert.nil?

  call(args.join(' '))
end
344
+
345
# Run a command line and return its stdout.
#
# @param cmd [String] the command line
# @param env [Hash] extra environment variables for the command
# @param stdin [#to_s] data written to the command's stdin
# @return [String] stdout when the command succeeds
# @raise [NotFoundError, Error] via interpret_and_raise when it fails
def call(cmd = '', env: {}, stdin: nil)
  stdout, stderr, status = Open3.capture3(env, cmd, stdin_data: stdin.to_s)
  return stdout if status.success?

  interpret_and_raise(stderr)
end
349
+
350
# Turn a failed command's stderr into an exception.
#
# @param stderr [String] the captured stderr
# @raise [NotFoundError] when a line of stderr starts with
#   "Error from server (NotFound):"
# @raise [Error] for everything else
def interpret_and_raise(stderr)
  not_found = /^Error from server \(NotFound\):/.match?(stderr)
  error_class = not_found ? NotFoundError : Error

  raise error_class, stderr
end
354
+ end
@@ -0,0 +1,294 @@
1
+ class OodCore::Job::Adapters::Kubernetes::Helper
2
+
3
+ require_relative 'resources'
4
+ require_relative 'k8s_job_info'
5
+ require 'resolv'
6
+ require 'base64'
7
+ require 'active_support/core_ext/hash'
8
+
9
+ class K8sDataError < StandardError; end
10
+
11
+ # Extract info from json data. The data is expected to be from the kubectl
12
+ # command and conform to kubernetes' datatype structures.
13
+ #
14
+ # Returns a K8sJobInfo in lieu of writing a connection.yml
15
+ #
16
+ # @param pod_json [#to_h]
17
+ # the pod data returned from 'kubectl get pod abc-123'
18
+ # @param service_json [#to_h]
19
+ # the service data returned from 'kubectl get service abc-123-service'
20
+ # @param secret_json [#to_h]
21
+ # the secret data returned from 'kubectl get secret abc-123-secret'
22
+ # @param ns_prefix [#to_s]
23
+ # the namespace prefix so that namespaces can be converted back to usernames
24
+ # @return [OodCore::Job::Adapters::Kubernetes::K8sJobInfo]
25
def info_from_json(pod_json: nil, service_json: nil, secret_json: nil, ns_prefix: nil)
  merged = pod_info_from_json(pod_json, ns_prefix: ns_prefix)
  extras = [service_info_from_json(service_json), secret_info_from_json(secret_json)]

  # service first, then secret: each is deep-merged into the pod hash
  extras.each { |extra| merged.deep_merge!(extra) }

  K8sJobInfo.new(merged)
rescue NoMethodError
  raise K8sDataError, "unable to read data correctly from json"
end
36
+
37
+ # Turn a container hash into a Kubernetes::Resources::Container
38
+ #
39
+ # @param container [#to_h]
40
+ # the input container hash
41
+ # @return [OodCore::Job::Adapters::Kubernetes::Resources::Container]
42
def container_from_native(container)
  # name and image are positional; everything else is a keyword option
  options = {
    command: parse_command(container[:command]),
    port: container[:port],
    env: container.fetch(:env, []),
    memory: container[:memory],
    cpu: container[:cpu],
    working_dir: container[:working_dir],
    restart_policy: container[:restart_policy]
  }

  Kubernetes::Resources::Container.new(container[:name], container[:image], **options)
end
55
+
56
+ # Parse a command string given from a user and return an array.
57
+ # If given an array, the input is simply returned back.
58
+ #
59
+ # @param cmd [#to_s]
60
+ # the command to parse
61
+ # @return [Array<#to_s>]
62
+ # the command parsed into an array of arguments
63
def parse_command(cmd)
  # arrays are already split; anything else (nil included) goes through
  # shell-style word splitting of its string form
  return cmd if cmd.is_a?(Array)

  Shellwords.split(cmd.to_s)
end
70
+
71
+ # Turn a configmap hash into a Kubernetes::Resources::ConfigMap
72
+ # that can be used in templates. Needs an id so that the resulting
73
+ # configmap has a known name.
74
+ #
75
+ # @param native [#to_h]
76
+ # the input configmap hash
77
+ # @param id [#to_s]
78
+ # the id to use for giving the configmap a name
79
+ # @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
80
def configmap_from_native(native, id)
  raw = native.fetch(:configmap, nil)
  # no (or nil) :configmap entry means there is nothing to build
  return nil if raw.nil?

  Kubernetes::Resources::ConfigMap.new(configmap_name(id), raw[:filename], raw[:data])
end
90
+
91
+ # parse initialization containers from native data
92
+ #
93
+ # @param ctrs [Array<#to_h>, nil]
94
+ # the raw init container hashes to parse; each one is converted with
95
+ # container_from_native. nil is treated as no init containers.
96
+ # @return [Array<OodCore::Job::Adapters::Kubernetes::Resources::Container>]
97
+ # the array of init containers
98
def init_ctrs_from_native(ctrs)
  # nil means no init containers were configured
  return [] if ctrs.nil?

  ctrs.map { |raw| container_from_native(raw) }
end
108
+
109
# Name of the service that accompanies pod +id+: the id plus '-service'.
#
# @param id [String]
# @return [String]
def service_name(id)
  suffix = '-service'
  id + suffix
end

# Name of the secret that accompanies pod +id+: the id plus '-secret'.
#
# @param id [String]
# @return [String]
def secret_name(id)
  suffix = '-secret'
  id + suffix
end

# Name of the configmap that accompanies pod +id+: the id plus '-configmap'.
#
# @param id [String]
# @return [String]
def configmap_name(id)
  suffix = '-configmap'
  id + suffix
end
120
+
121
# Format a number of seconds as "HHhMMmSSs", each field zero-padded to at
# least two digits.
#
# @param s [Integer] seconds
# @return [String] e.g. 3661 => "01h01m01s"
def seconds_to_duration(s)
  hours, remainder = s.divmod(3600)
  minutes, seconds = remainder.divmod(60)

  format('%02dh%02dm%02ds', hours, minutes, seconds)
end
124
+
125
+ # Extract pod info from json data. The data is expected to be from the kubectl
126
+ # command and conform to kubernetes' datatype structures.
127
+ #
128
+ # @param json_data [#to_h]
129
+ # the pod data returned from 'kubectl get pod abc-123'
130
+ # @param ns_prefix [#to_s]
131
+ # the namespace prefix so that namespaces can be converted back to usernames
132
+ # @return [#to_h]
133
+ # the hash of info expected from adapters
134
def pod_info_from_json(json_data, ns_prefix: nil)
  info = {}
  info[:id] = json_data.dig(:metadata, :name).to_s
  info[:job_name] = name_from_metadata(json_data.dig(:metadata))
  info[:status] = pod_status_from_json(json_data)
  info[:job_owner] = job_owner_from_json(json_data, ns_prefix)
  info[:submission_time] = submission_time(json_data)
  info[:dispatch_time] = dispatch_time(json_data)
  info[:wallclock_time] = wallclock_time(json_data)
  info[:ood_connection_info] = { host: get_host(json_data.dig(:status, :hostIP)) }
  info[:procs] = procs_from_json(json_data)
  info
rescue NoMethodError
  # gotta raise an error because Info.new will throw an error if id is undefined
  raise K8sDataError, "unable to read data correctly from json"
end
150
+
151
+ private
152
+
153
# Reverse-resolve an ip to a hostname, falling back to the ip itself when
# the lookup raises Resolv::ResolvError.
#
# @param ip [String, nil]
# @return [String, nil] the hostname, or +ip+ unchanged
def get_host(ip)
  hostname = begin
    Resolv.getname(ip)
  rescue Resolv::ResolvError
    ip
  end

  hostname
end
158
+
159
# Pick a job name from pod metadata: the 'app.kubernetes.io/name' label,
# else the 'k8s-app' label, else the pod's own name.
#
# @param metadata [Hash] the pod's metadata
# @return [String, nil] the first of those that is not nil
def name_from_metadata(metadata)
  lookup_paths = [
    %i[labels app.kubernetes.io/name],
    %i[labels k8s-app],
    %i[name] # pod-id but better than nil?
  ]

  lookup_paths.each do |path|
    candidate = metadata.dig(*path)
    return candidate unless candidate.nil?
  end

  nil
end
165
+
166
# Extract the connection port from service json.
#
# @param json_data [Hash] output of 'kubectl get service'
# @return [Hash] { ood_connection_info: { port: ... } }, or {} when the data
#   cannot be read
def service_info_from_json(json_data)
  # all we need is the port - .spec.ports[0].nodePort
  first_port = json_data.dig(:spec, :ports)[0]

  { ood_connection_info: { port: first_port.dig(:nodePort) } }
rescue StandardError
  {}
end
173
+
174
# Extract the base64-decoded connection password from secret json.
#
# @param json_data [Hash] output of 'kubectl get secret'
# @return [Hash] { ood_connection_info: { password: ... } }, or {} when the
#   data cannot be read
def secret_info_from_json(json_data)
  encoded = json_data.dig(:data, :password)
  password = Base64.decode64(encoded)

  { ood_connection_info: { password: password } }
rescue StandardError
  {}
end
180
+
181
# Epoch seconds at which the pod's first container started.
#
# @param json_data [Hash] pod json
# @return [Integer, nil] nil when there are no container statuses, the pod
#   is neither running nor completed, or no start time is recorded
def dispatch_time(json_data)
  status = pod_status_from_json(json_data)
  container_statuses = json_data.dig(:status, :containerStatuses)
  return nil if container_statuses.nil?

  state_data = container_statuses[0].dig(:state)

  # Compared with == on the status itself (not case/when), so the status
  # object's own equality decides the match.
  state_key =
    if status == 'completed'
      :terminated
    elsif status == 'running'
      :running
    end
  return nil if state_key.nil?

  started_at = state_data.dig(state_key, :startedAt)
  return nil if started_at.nil?

  DateTime.parse(started_at).to_time.to_i
end
197
+
198
# Seconds between the pod's dispatch time and its end time.
#
# @param json_data [Hash] pod json
# @return [Integer, nil] nil when there are no container statuses, or when
#   either the dispatch time or the end time is unknown
def wallclock_time(json_data)
  status = pod_status_from_json(json_data)
  container_statuses = json_data.dig(:status, :containerStatuses)
  return nil if container_statuses.nil?

  state_data = container_statuses[0].dig(:state)
  started = dispatch_time(json_data)
  return nil if started.nil?

  finished = end_time(status, state_data)
  return nil if finished.nil?

  finished - started
end
211
+
212
# Epoch seconds at which the job ended, for wallclock purposes.
#
# @param status the pod status; compared with == against 'completed' and
#   'running'
# @param state_data [Hash] the first container's state
# @return [Integer, nil] the recorded finish time when completed, the
#   current time when running, nil otherwise
def end_time(status, state_data)
  if status == 'completed'
    return DateTime.parse(state_data.dig(:terminated, :finishedAt)).to_time.to_i
  end
  return DateTime.now.to_time.to_i if status == 'running'

  nil
end
224
+
225
# Epoch seconds at which the pod was submitted.
#
# @param json_data [Hash] pod json
# @return [Integer] the pod's startTime, or the first condition's
#   lastTransitionTime when no startTime is recorded
def submission_time(json_data)
  pod_status = json_data.dig(:status)
  timestamp = pod_status.dig(:startTime)

  if timestamp.nil?
    # the pod is in some pending state limbo; the best guess is just the
    # first condition's transition time
    timestamp = pod_status.dig(:conditions)[0].dig(:lastTransitionTime)
  end

  DateTime.parse(timestamp).to_time.to_i
end
241
+
242
# Translate the pod's phase into an OodCore::Job::Status.
#
# @param json_data [Hash] pod json
# @return [OodCore::Job::Status] 'undetermined' for the "Unknown" phase and
#   for any phase not listed below
def pod_status_from_json(json_data)
  phase_to_state = {
    'Running' => 'running',
    'Pending' => 'queued',
    'Failed' => 'suspended',
    'Succeeded' => 'completed',
    'Unknown' => 'undetermined'
  }

  phase = json_data.dig(:status, :phase)
  OodCore::Job::Status.new(state: phase_to_state.fetch(phase, 'undetermined'))
end
261
+
262
# State name for a terminated container: 'suspended' when the termination
# reason is 'Error', 'completed' for anything else.
#
# @param status [Hash] a container state hash
# @return [String]
def terminated_state(status)
  status.dig(:terminated, :reason) == 'Error' ? 'suspended' : 'completed'
end
270
+
271
# Number of processors requested by the pod's first container, taken from
# its cpu limit.
#
# A millicore value ("<digits>m") is rounded up to whole cores, with a
# minimum of 1, so "200m" => "1", "1000m" => "1" and "1500m" => "2". The
# earlier formula, (millicores + 1000) / 1000, over-counted exact multiples
# of 1000 ("1000m" gave "2").
# Any other value is returned unchanged.
#
# @param json_data [Hash] pod json
# @return [String, nil] whole cores as a string for millicore limits; the
#   cpu limit itself otherwise; nil when no cpu limit is set
def procs_from_json(json_data)
  containers = json_data.dig(:spec, :containers)
  resources = containers[0].dig(:resources)
  cpu = resources.dig(:limits, :cpu)

  # ok to return a string/nil here bc nil.to_i == 0 and we'd rather return
  # nil (undefined) than 0 which is confusing.
  millicores = cpu.to_s[/\A(\d+)m\z/, 1]
  return cpu if millicores.nil?

  # ceiling division; have to return at least 1 bc 200m could be 0
  [(millicores.to_i + 999) / 1000, 1].max.to_s
end
289
+
290
# Derive the job owner from the pod's namespace by removing the namespace
# prefix from its start, if present.
#
# @param json_data [Hash] pod json
# @param ns_prefix [#to_s] the namespace prefix; nil strips nothing
# @return [String] the namespace without the prefix ("" when the pod has no
#   namespace)
def job_owner_from_json(json_data = {}, ns_prefix = nil)
  prefix = ns_prefix.to_s
  json_data.dig(:metadata, :namespace).to_s.delete_prefix(prefix)
end
294
+ end