ood_core 0.12.0 → 0.16.0

@@ -0,0 +1,375 @@
+ require "ood_core/refinements/hash_extensions"
+ require "json"
+ require "open3"
+ require "etc"
+ require "erb"
+
+ class OodCore::Job::Adapters::Kubernetes::Batch
+
+   require_relative "helper"
+   require_relative "k8s_job_info"
+
+   using OodCore::Refinements::HashExtensions
+
+   class Error < StandardError; end
+   class NotFoundError < StandardError; end
+
+   attr_reader :config_file, :bin, :cluster, :mounts
+   attr_reader :all_namespaces, :using_context, :helper
+   attr_reader :username_prefix, :namespace_prefix
+
+   def initialize(options = {})
+     options = options.to_h.symbolize_keys
+
+     @config_file = options.fetch(:config_file, default_config_file)
+     @bin = options.fetch(:bin, '/usr/bin/kubectl')
+     @cluster = options.fetch(:cluster, 'open-ondemand')
+     @mounts = options.fetch(:mounts, []).map { |m| m.to_h.symbolize_keys }
+     @all_namespaces = options.fetch(:all_namespaces, false)
+     @username_prefix = options.fetch(:username_prefix, nil)
+     @namespace_prefix = options.fetch(:namespace_prefix, '')
+
+     @using_context = false
+     @helper = OodCore::Job::Adapters::Kubernetes::Helper.new
+
+     begin
+       make_kubectl_config(options)
+     rescue
+       # FIXME: could use a log here
+       # means 'kubectl config set-cluster/set-context' failed
+     end
+   end
+
+   def resource_file(resource_type = 'pod')
+     File.dirname(__FILE__) + "/templates/#{resource_type}.yml.erb"
+   end
+
+   def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
+     raise ArgumentError, 'Must specify the script' if script.nil?
+
+     resource_yml, id = generate_id_yml(script)
+     call("#{formatted_ns_cmd} create -f -", stdin: resource_yml)
+
+     id
+   end
+
+   def generate_id(name)
+     # 2_821_109_907_456 = 36**8
+     name.downcase.tr(' ', '-') + '-' + rand(2_821_109_907_456).to_s(36)
+   end
+
+   def info_all(attrs: nil)
+     cmd = if all_namespaces
+             "#{base_cmd} get pods -o json --all-namespaces"
+           else
+             "#{namespaced_cmd} get pods -o json"
+           end
+
+     output = call(cmd)
+     all_pods_to_info(output)
+   end
+
+   def info_where_owner(owner, attrs: nil)
+     owner = Array.wrap(owner).map(&:to_s)
+
+     # must at least have job_owner to filter by job_owner
+     attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?
+
+     info_all(attrs: attrs).select { |info| owner.include? info.job_owner }
+   end
+
+   def info_all_each(attrs: nil)
+     return to_enum(:info_all_each, attrs: attrs) unless block_given?
+
+     info_all(attrs: attrs).each do |job|
+       yield job
+     end
+   end
+
+   def info_where_owner_each(owner, attrs: nil)
+     return to_enum(:info_where_owner_each, owner, attrs: attrs) unless block_given?
+
+     info_where_owner(owner, attrs: attrs).each do |job|
+       yield job
+     end
+   end
+
+   def info(id)
+     pod_json = safe_call('get', 'pod', id)
+     return OodCore::Job::Info.new({ id: id, status: 'completed' }) if pod_json.empty?
+
+     service_json = safe_call('get', 'service', service_name(id))
+     secret_json = safe_call('get', 'secret', secret_name(id))
+
+     helper.info_from_json(pod_json: pod_json, service_json: service_json, secret_json: secret_json)
+   end
+
+   def status(id)
+     info(id).status
+   end
+
+   def delete(id)
+     safe_call("delete", "pod", id)
+     safe_call("delete", "service", service_name(id))
+     safe_call("delete", "secret", secret_name(id))
+     safe_call("delete", "configmap", configmap_name(id))
+   end
+
+   private
+
+   def safe_call(verb, resource, id)
+     begin
+       case verb.to_s
+       when "get"
+         call_json_output('get', resource, id)
+       when "delete"
+         call("#{namespaced_cmd} delete #{resource} #{id}")
+       end
+     rescue NotFoundError
+       {}
+     end
+   end
+
+   # helper to format multi-line yaml data from the submit.yml into
+   # multi-line yaml in the pod.yml.erb
+   def config_data_lines(data)
+     output = []
+     first = true
+
+     data.to_s.each_line do |line|
+       output.append(first ? line : line.prepend(" "))
+       first = false
+     end
+
+     output
+   end
+
+   def username
+     @username ||= Etc.getlogin
+   end
+
+   def k8s_username
+     username_prefix.nil? ? username : "#{username_prefix}-#{username}"
+   end
+
+   def user
+     @user ||= Etc.getpwnam(username)
+   end
+
+   def home_dir
+     user.dir
+   end
+
+   def run_as_user
+     user.uid
+   end
+
+   def run_as_group
+     user.gid
+   end
+
+   def fs_group
+     run_as_group
+   end
+
+   def group
+     Etc.getgrgid(run_as_group).name
+   end
+
+   def default_env
+     {
+       USER: username,
+       UID: run_as_user,
+       HOME: home_dir,
+       GROUP: group,
+       GID: run_as_group,
+     }
+   end
+
+   # helper to template resource yml you're going to submit and
+   # create an id.
+   def generate_id_yml(script)
+     native_data = script.native
+     container = helper.container_from_native(native_data[:container], default_env)
+     id = generate_id(container.name)
+     configmap = helper.configmap_from_native(native_data, id)
+     init_containers = helper.init_ctrs_from_native(native_data[:init_containers], container.env)
+     spec = OodCore::Job::Adapters::Kubernetes::Resources::PodSpec.new(container, init_containers: init_containers)
+     all_mounts = native_data[:mounts].nil? ? mounts : mounts + native_data[:mounts]
+
+     template = ERB.new(File.read(resource_file), nil, '-')
+
+     [template.result(binding), id]
+   end
+
+   # helper to call kubectl and get json data back.
+   # verb, resource and id are the kubernetes parlance terms.
+   # example: 'kubectl get pod my-pod-id' is verb=get, resource=pod
+   # and id=my-pod-id
+   def call_json_output(verb, resource, id, stdin: nil)
+     cmd = "#{formatted_ns_cmd} #{verb} #{resource} #{id}"
+     data = call(cmd, stdin: stdin)
+     data = data.empty? ? '{}' : data
+     json_data = JSON.parse(data, symbolize_names: true)
+
+     json_data
+   end
+
+   def service_name(id)
+     helper.service_name(id)
+   end
+
+   def secret_name(id)
+     helper.secret_name(id)
+   end
+
+   def configmap_name(id)
+     helper.configmap_name(id)
+   end
+
+   def namespace
+     "#{namespace_prefix}#{username}"
+   end
+
+   def context
+     cluster
+   end
+
+   def default_config_file
+     (ENV['KUBECONFIG'] || "#{Dir.home}/.kube/config")
+   end
+
+   def default_auth
+     {
+       type: 'managed'
+     }.symbolize_keys
+   end
+
+   def default_server
+     {
+       endpoint: 'https://localhost:8080',
+       cert_authority_file: nil
+     }.symbolize_keys
+   end
+
+   def formatted_ns_cmd
+     "#{namespaced_cmd} -o json"
+   end
+
+   def namespaced_cmd
+     "#{base_cmd} --namespace=#{namespace}"
+   end
+
+   def base_cmd
+     base = "#{bin} --kubeconfig=#{config_file}"
+     base << " --context=#{context}" if using_context
+     base
+   end
+
+   def all_pods_to_info(data)
+     json_data = JSON.parse(data, symbolize_names: true)
+     pods = json_data.dig(:items)
+
+     info_array = []
+     pods.each do |pod|
+       info = pod_info_from_json(pod)
+       info_array.push(info) unless info.nil?
+     end
+
+     info_array
+   rescue JSON::ParserError
+     # 'no resources in <namespace>' throws parse error
+     []
+   end
+
+   def pod_info_from_json(pod)
+     hash = helper.pod_info_from_json(pod)
+     K8sJobInfo.new(hash)
+   rescue Helper::K8sDataError
+     # FIXME: silently eating error, could probably use a logger
+     nil
+   end
+
+   def make_kubectl_config(config)
+     set_cluster(config.fetch(:server, default_server).to_h.symbolize_keys)
+     configure_auth(config.fetch(:auth, default_auth).to_h.symbolize_keys)
+   end
+
+   def configure_auth(auth)
+     type = auth.fetch(:type)
+     return if managed?(type)
+
+     case type
+     when 'gke'
+       set_gke_config(auth)
+     when 'oidc'
+       set_context
+     end
+   end
+
+   def use_context
+     @using_context = true
+   end
+
+   def managed?(type)
+     if type.nil?
+       true # maybe should be false?
+     else
+       type.to_s == 'managed'
+     end
+   end
+
+   def set_gke_config(auth)
+     cred_file = auth.fetch(:svc_acct_file)
+
+     cmd = "gcloud auth activate-service-account --key-file=#{cred_file}"
+     call(cmd)
+
+     set_gke_credentials(auth)
+   end
+
+   def set_gke_credentials(auth)
+
+     zone = auth.fetch(:zone, nil)
+     region = auth.fetch(:region, nil)
+
+     locale = ''
+     locale = "--zone=#{zone}" unless zone.nil?
+     locale = "--region=#{region}" unless region.nil?
+
+     # gke cluster name can differ from what ood calls the cluster
+     cmd = "gcloud container clusters get-credentials #{locale} #{cluster}"
+     env = { 'KUBECONFIG' => config_file }
+     call(cmd, env: env)
+   end
+
+   def set_context
+     cmd = "#{base_cmd} config set-context #{cluster}"
+     cmd << " --cluster=#{cluster} --namespace=#{namespace}"
+     cmd << " --user=#{k8s_username}"
+
+     call(cmd)
+     use_context
+   end
+
+   def set_cluster(config)
+     server = config.fetch(:endpoint)
+     cert = config.fetch(:cert_authority_file, nil)
+
+     cmd = "#{base_cmd} config set-cluster #{cluster}"
+     cmd << " --server=#{server}"
+     cmd << " --certificate-authority=#{cert}" unless cert.nil?
+
+     call(cmd)
+   end
+
+   def call(cmd = '', env: {}, stdin: nil)
+     o, e, s = Open3.capture3(env, cmd, stdin_data: stdin.to_s)
+     s.success? ? o : interpret_and_raise(e)
+   end
+
+   def interpret_and_raise(stderr)
+     raise NotFoundError, stderr if /^Error from server \(NotFound\):/.match(stderr)
+     raise(Error, stderr)
+   end
+ end
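
The hunk above is the whole client side of the adapter, so a rough usage sketch may help review it. The snippet below is not part of the diff: it only exercises the public methods defined above (submit, status, delete) and the option keys read in Batch#initialize; the option values, the native container hash, and the Struct stand-in for a script object are illustrative assumptions.

require "ood_core"

# Assumed adapter options; the keys mirror what Batch#initialize reads above.
batch = OodCore::Job::Adapters::Kubernetes::Batch.new(
  config_file: "#{Dir.home}/.kube/config",  # same value default_config_file falls back to
  bin: "/usr/bin/kubectl",
  cluster: "open-ondemand",
  namespace_prefix: "user-",                # namespace becomes "user-#{username}"
  all_namespaces: false
)

# Batch#submit only calls script.native, so any object exposing #native works here;
# the container keys mirror what Helper#container_from_native reads.
script = Struct.new(:native).new({
  container: {
    name: "jupyter",
    image: "jupyter/minimal-notebook",
    command: "start-notebook.sh",
    port: 8888
  }
})

id = batch.submit(script)   # renders templates/pod.yml.erb and pipes it to `kubectl create -f -`
batch.status(id)            # OodCore::Job::Status mapped from the pod phase
batch.delete(id)            # deletes the pod, service, secret and configmap for that id
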
@@ -0,0 +1,301 @@
+ class OodCore::Job::Adapters::Kubernetes::Helper
+
+   require_relative 'resources'
+   require_relative 'k8s_job_info'
+   require 'resolv'
+   require 'base64'
+   require 'active_support/core_ext/hash'
+   require 'shellwords'
+   require 'date'
+
+   class K8sDataError < StandardError; end
+
+   # Extract info from json data. The data is expected to be from the kubectl
+   # command and conform to kubernetes' datatype structures.
+   #
+   # Returns a K8sJobInfo in lieu of writing a connection.yml
+   #
+   # @param pod_json [#to_h]
+   #   the pod data returned from 'kubectl get pod abc-123'
+   # @param service_json [#to_h]
+   #   the service data returned from 'kubectl get service abc-123-service'
+   # @param secret_json [#to_h]
+   #   the secret data returned from 'kubectl get secret abc-123-secret'
+   # @param ns_prefix [#to_s]
+   #   the namespace prefix so that namespaces can be converted back to usernames
+   # @return [OodCore::Job::Adapters::Kubernetes::K8sJobInfo]
+   def info_from_json(pod_json: nil, service_json: nil, secret_json: nil, ns_prefix: nil)
+     pod_hash = pod_info_from_json(pod_json, ns_prefix: ns_prefix)
+     service_hash = service_info_from_json(service_json)
+     secret_hash = secret_info_from_json(secret_json)
+
+     pod_hash.deep_merge!(service_hash)
+     pod_hash.deep_merge!(secret_hash)
+     OodCore::Job::Adapters::Kubernetes::K8sJobInfo.new(pod_hash)
+   rescue NoMethodError
+     raise K8sDataError, "unable to read data correctly from json"
+   end
+
+   # Turn a container hash into a Kubernetes::Resources::Container
+   #
+   # @param container [#to_h]
+   #   the input container hash
+   # @param default_env [#to_h]
+   #   Default env to merge with defined env
+   # @return [OodCore::Job::Adapters::Kubernetes::Resources::Container]
+   def container_from_native(container, default_env)
+     env = container.fetch(:env, {}).to_h.symbolize_keys
+     OodCore::Job::Adapters::Kubernetes::Resources::Container.new(
+       container[:name],
+       container[:image],
+       command: parse_command(container[:command]),
+       port: container[:port],
+       env: default_env.merge(env),
+       memory: container[:memory],
+       cpu: container[:cpu],
+       working_dir: container[:working_dir],
+       restart_policy: container[:restart_policy],
+       image_pull_secret: container[:image_pull_secret]
+     )
+   end
+
+   # Parse a command string given from a user and return an array.
+   # If given an array, the input is simply returned back.
+   #
+   # @param cmd [#to_s]
+   #   the command to parse
+   # @return [Array<#to_s>]
+   #   the command parsed into an array of arguments
+   def parse_command(cmd)
+     if cmd.is_a?(Array)
+       cmd
+     else
+       Shellwords.split(cmd.to_s)
+     end
+   end
+
+   # Turn a configmap hash into a Kubernetes::Resources::ConfigMap
+   # that can be used in templates. Needs an id so that the resulting
+   # configmap has a known name.
+   #
+   # @param native [#to_h]
+   #   the input configmap hash
+   # @param id [#to_s]
+   #   the id to use for giving the configmap a name
+   # @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap]
+   def configmap_from_native(native, id)
+     configmap = native.fetch(:configmap, nil)
+     return nil if configmap.nil?
+
+     OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap.new(
+       configmap_name(id),
+       (configmap[:files] || [])
+     )
+   end
+
+   # Parse initialization containers from native data
+   #
+   # @param ctrs [Array<#to_h>]
+   #   the array of init container hashes to parse; each hash is the
+   #   native data for a single init container
+   # @param default_env [#to_h]
+   #   Default env to merge with defined env
+   # @return [Array<OodCore::Job::Adapters::Kubernetes::Resources::Container>]
+   #   the array of init containers
+   def init_ctrs_from_native(ctrs, default_env)
+     init_ctrs = []
+
+     ctrs&.each do |ctr_raw|
+       ctr = container_from_native(ctr_raw, default_env)
+       init_ctrs.push(ctr)
+     end
+
+     init_ctrs
+   end
+
+   def service_name(id)
+     id + '-service'
+   end
+
+   def secret_name(id)
+     id + '-secret'
+   end
+
+   def configmap_name(id)
+     id + '-configmap'
+   end
+
+   def seconds_to_duration(s)
+     "%02dh%02dm%02ds" % [s / 3600, s / 60 % 60, s % 60]
+   end
+
+   # Extract pod info from json data. The data is expected to be from the kubectl
+   # command and conform to kubernetes' datatype structures.
+   #
+   # @param json_data [#to_h]
+   #   the pod data returned from 'kubectl get pod abc-123'
+   # @param ns_prefix [#to_s]
+   #   the namespace prefix so that namespaces can be converted back to usernames
+   # @return [#to_h]
+   #   the hash of info expected from adapters
+   def pod_info_from_json(json_data, ns_prefix: nil)
+     {
+       id: json_data.dig(:metadata, :name).to_s,
+       job_name: name_from_metadata(json_data.dig(:metadata)),
+       status: pod_status_from_json(json_data),
+       job_owner: job_owner_from_json(json_data, ns_prefix),
+       submission_time: submission_time(json_data),
+       dispatch_time: dispatch_time(json_data),
+       wallclock_time: wallclock_time(json_data),
+       ood_connection_info: { host: get_host(json_data.dig(:status, :hostIP)) },
+       procs: procs_from_json(json_data)
+     }
+   rescue NoMethodError
+     # gotta raise an error because Info.new will throw an error if id is undefined
+     raise K8sDataError, "unable to read data correctly from json"
+   end
+
+   private
+
+   def get_host(ip)
+     Resolv.getname(ip)
+   rescue Resolv::ResolvError
+     ip
+   end
+
+   def name_from_metadata(metadata)
+     name = metadata.dig(:labels, :'app.kubernetes.io/name')
+     name = metadata.dig(:labels, :'k8s-app') if name.nil?
+     name = metadata.dig(:name) if name.nil? # pod-id but better than nil?
+     name
+   end
+
+   def service_info_from_json(json_data)
+     # all we need is the port - .spec.ports[0].nodePort
+     ports = json_data.dig(:spec, :ports)
+     { ood_connection_info: { port: ports[0].dig(:nodePort) } }
+   rescue
+     {}
+   end
+
+   def secret_info_from_json(json_data)
+     raw = json_data.dig(:data, :password)
+     { ood_connection_info: { password: Base64.decode64(raw) } }
+   rescue
+     {}
+   end
+
+   def dispatch_time(json_data)
+     status = pod_status_from_json(json_data)
+     container_statuses = json_data.dig(:status, :containerStatuses)
+     return nil if container_statuses.nil?
+
+     state_data = container_statuses[0].dig(:state)
+     date_string = nil
+
+     if status == 'completed'
+       date_string = state_data.dig(:terminated, :startedAt)
+     elsif status == 'running'
+       date_string = state_data.dig(:running, :startedAt)
+     end
+
+     date_string.nil? ? nil : DateTime.parse(date_string).to_time.to_i
+   end
+
+   def wallclock_time(json_data)
+     status = pod_status_from_json(json_data)
+     container_statuses = json_data.dig(:status, :containerStatuses)
+     return nil if container_statuses.nil?
+
+     state_data = container_statuses[0].dig(:state)
+     start_time = dispatch_time(json_data)
+     return nil if start_time.nil?
+
+     et = end_time(status, state_data)
+
+     et.nil? ? nil : et - start_time
+   end
+
+   def end_time(status, state_data)
+     if status == 'completed'
+       end_time_string = state_data.dig(:terminated, :finishedAt)
+       et = DateTime.parse(end_time_string).to_time.to_i
+     elsif status == 'running'
+       et = DateTime.now.to_time.to_i
+     else
+       et = nil
+     end
+
+     et
+   end
+
+   def submission_time(json_data)
+     status = json_data.dig(:status)
+     start = status.dig(:startTime)
+
+     if start.nil?
+       # the pod is in some pending state limbo
+       conditions = status.dig(:conditions)
+       # best guess to start time is just the first condition's
+       # transition time
+       str = conditions[0].dig(:lastTransitionTime)
+     else
+       str = start
+     end
+
+     DateTime.parse(str).to_time.to_i
+   end
+
+   def pod_status_from_json(json_data)
+     phase = json_data.dig(:status, :phase)
+     state = case phase
+             when "Running"
+               "running"
+             when "Pending"
+               "queued"
+             when "Failed"
+               "suspended"
+             when "Succeeded"
+               "completed"
+             when "Unknown"
+               "undetermined"
+             else
+               "undetermined"
+             end
+
+     OodCore::Job::Status.new(state: state)
+   end
+
+   def terminated_state(status)
+     reason = status.dig(:terminated, :reason)
+     if reason == 'Error'
+       'suspended'
+     else
+       'completed'
+     end
+   end
+
+   def procs_from_json(json_data)
+     containers = json_data.dig(:spec, :containers)
+     resources = containers[0].dig(:resources)
+
+     cpu = resources.dig(:limits, :cpu)
+     millicores_rex = /(\d+)m/
+
+     # ok to return a string because nil.to_i == 0 and we'd rather return
+     # nil (undefined) than 0 which is confusing.
+     if millicores_rex.match?(cpu)
+       millicores = millicores_rex.match(cpu)[1].to_i
+
+       # add 1000 before dividing so a fractional request like 200m reports at least 1
+       ((millicores + 1000) / 1000).to_s
+     else
+       cpu
+     end
+   end
+
+   def job_owner_from_json(json_data = {}, ns_prefix = nil)
+     namespace = json_data.dig(:metadata, :namespace).to_s
+     namespace.delete_prefix(ns_prefix.to_s)
+   end
+ end
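
As with the Batch class, a short sketch (not part of the diff) shows how the Helper composes kubectl output into one job info object. The shell-outs, the pod id and the "user-" prefix are illustrative assumptions; the method names, keyword arguments and the id-to-resource-name mapping come from the code above.

require "json"
require "ood_core"

helper = OodCore::Job::Adapters::Kubernetes::Helper.new
id = "jupyter-abc123"   # hypothetical id in the form produced by Batch#generate_id

# kubectl output must be JSON parsed with symbolized keys, as in Batch#call_json_output
# (namespace and kubeconfig flags omitted here for brevity).
get = ->(resource, name) { JSON.parse(`kubectl get #{resource} #{name} -o json`, symbolize_names: true) }

info = helper.info_from_json(
  pod_json: get.call("pod", id),
  service_json: get.call("service", helper.service_name(id)),  # "#{id}-service"
  secret_json: get.call("secret", helper.secret_name(id)),     # "#{id}-secret"
  ns_prefix: "user-"  # stripped from the namespace so job_owner is the plain username
)

info.id         # the pod name
info.status     # OodCore::Job::Status derived from the pod phase
info.job_owner  # namespace with the "user-" prefix removed

# parse_command accepts either a shell string or an array:
helper.parse_command("python3 -m http.server 8080")    # => ["python3", "-m", "http.server", "8080"]
helper.parse_command(["python3", "-m", "http.server"]) # returned unchanged
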