ood_core 0.11.3 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,354 @@
1
+ require "ood_core/refinements/hash_extensions"
2
+ require "json"
3
+
4
+ class OodCore::Job::Adapters::Kubernetes::Batch
5
+
6
+ require_relative "helper"
7
+ require_relative "k8s_job_info"
8
+
9
+ using OodCore::Refinements::HashExtensions
10
+
11
# Raised by #interpret_and_raise for any failed command that is not a
# "NotFound" response.
class Error < StandardError
end

# Raised by #interpret_and_raise when stderr starts a line with
# "Error from server (NotFound):".
class NotFoundError < StandardError
end
13
+
14
+ attr_reader :config_file, :bin, :cluster, :mounts
15
+ attr_reader :all_namespaces, :using_context, :helper
16
+ attr_reader :username_prefix, :namespace_prefix
17
+
18
# Build the batch object from the adapter configuration and make a
# best-effort attempt to write cluster/auth settings via kubectl.
#
# @param options [#to_h] adapter options. Keys read here: :config_file,
#   :bin, :cluster, :mounts, :all_namespaces, :username_prefix and
#   :namespace_prefix. The whole hash is also handed to
#   #make_kubectl_config, which reads :server and :auth.
def initialize(options = {})
  opts = options.to_h.symbolize_keys

  @config_file = opts.fetch(:config_file, default_config_file)
  @bin = opts.fetch(:bin, '/usr/bin/kubectl')
  @cluster = opts.fetch(:cluster, 'open-ondemand')
  @mounts = opts.fetch(:mounts, []).map { |mount| mount.to_h.symbolize_keys }
  @all_namespaces = opts.fetch(:all_namespaces, false)
  @username_prefix = opts.fetch(:username_prefix, nil)
  @namespace_prefix = opts.fetch(:namespace_prefix, '')

  @using_context = false
  @helper = Helper.new

  # Deliberately swallowed: a failure here means the kubectl config could
  # not be written, and construction still succeeds.
  # FIXME: could use a log here
  begin
    make_kubectl_config(opts)
  rescue StandardError
  end
end
39
+
40
# Path of the ERB template for a resource type, located in the
# "templates" directory next to this file.
#
# @param resource_type [#to_s] template base name (default 'pod')
# @return [String] "<dir of this file>/templates/<resource_type>.yml.erb"
def resource_file(resource_type = 'pod')
  here = File.dirname(__FILE__)
  "#{here}/templates/#{resource_type}.yml.erb"
end
43
+
44
# Render the resource yml for the script and pipe it to
# 'kubectl ... create -f -'.
#
# The dependency keywords (after, afterok, afternotok, afterany) are
# accepted but not used by this method.
#
# @param script [#native] the script to submit
# @raise [ArgumentError] when script is nil
# @return [String] the generated id
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  raise ArgumentError, 'Must specify the script' if script.nil?

  yml, new_id = generate_id_yml(script)
  create_cmd = "#{formatted_ns_cmd} create -f -"
  call(create_cmd, stdin: yml)

  new_id
end
52
+
53
# Build an id from a name: lowercase it, turn spaces into dashes and
# append a dash plus a random base-36 suffix.
#
# @param name [String] the human readable name
# @return [String] e.g. "my-app-k3j9x0ab"
def generate_id(name)
  slug = name.downcase.tr(' ', '-')
  # 2_821_109_907_456 = 36**8, i.e. the suffix has at most 8 base-36 digits
  suffix = rand(2_821_109_907_456).to_s(36)
  "#{slug}-#{suffix}"
end
57
+
58
# List pods, either across all namespaces or only in the user's
# namespace, and convert them with #all_pods_to_info.
#
# @param attrs [Object] accepted but not used by this method
# @return [Array] whatever #all_pods_to_info returns for the kubectl output
def info_all(attrs: nil)
  list_cmd = all_namespaces ? "#{base_cmd} get pods -o json --all-namespaces" : "#{namespaced_cmd} get pods -o json"
  all_pods_to_info(call(list_cmd))
end
68
+
69
# Select the jobs from #info_all whose job_owner is one of the given owners.
#
# @param owner [#to_s, Array<#to_s>] a single owner or a list of owners
# @param attrs [nil, Object] when not nil, :job_owner is added to it before
#   it is passed on to #info_all
# @return [Array] the matching info objects
def info_where_owner(owner, attrs: nil)
  wanted = Array.wrap(owner).map(&:to_s)

  # must at least have job_owner to filter by job_owner
  attrs = Array.wrap(attrs) | [:job_owner] unless attrs.nil?

  info_all(attrs: attrs).select do |job|
    wanted.include?(job.job_owner)
  end
end
77
+
78
# Yield every job returned by #info_all.
#
# @param attrs [Object] forwarded to #info_all
# @yield [job] each info object
# @return [Enumerator] when no block is given
def info_all_each(attrs: nil)
  if block_given?
    info_all(attrs: attrs).each { |job| yield job }
  else
    to_enum(:info_all_each, attrs: attrs)
  end
end
85
+
86
# Yield every job returned by #info_where_owner.
#
# @param owner [Object] forwarded to #info_where_owner
# @param attrs [Object] forwarded to #info_where_owner
# @yield [job] each matching info object
# @return [Enumerator] when no block is given
def info_where_owner_each(owner, attrs: nil)
  if block_given?
    info_where_owner(owner, attrs: attrs).each { |job| yield job }
  else
    to_enum(:info_where_owner_each, owner, attrs: attrs)
  end
end
93
+
94
# Look up a single job by pod id.
#
# #safe_call returns an empty hash when the server reports NotFound, so a
# missing pod is reported as a 'completed' job. Otherwise the pod, its
# service and its secret are combined by the helper.
#
# The configured namespace prefix is forwarded as ns_prefix so the helper
# can strip it from the pod's namespace when deriving job_owner; without
# it the owner would keep the prefix.
#
# @param id [#to_s] the pod id
# @return [OodCore::Job::Info, Object] a 'completed' Info when the pod is
#   not found, otherwise whatever helper.info_from_json returns
def info(id)
  pod_json = safe_call('get', 'pod', id)
  return OodCore::Job::Info.new({ id: id, status: 'completed' }) if pod_json.empty?

  service_json = safe_call('get', 'service', service_name(id))
  secret_json = safe_call('get', 'secret', secret_name(id))

  helper.info_from_json(
    pod_json: pod_json,
    service_json: service_json,
    secret_json: secret_json,
    ns_prefix: namespace_prefix
  )
end
103
+
104
# Status of a single job, taken from #info.
#
# @param id [#to_s] the pod id
# @return [Object] the status attribute of the info object
def status(id)
  job = info(id)
  job.status
end
107
+
108
# Delete the pod and the service, secret and configmap named after it,
# in that order. NotFound responses are ignored by #safe_call.
#
# @param id [#to_s] the pod id
# @return [Object] the result of the last (configmap) delete
def delete(id)
  last_result = nil

  %w[pod service secret configmap].each do |kind|
    # the pod is named by the id itself; the others derive their name from it
    target = kind == 'pod' ? id : send("#{kind}_name", id)
    last_result = safe_call('delete', kind, target)
  end

  last_result
end
114
+
115
# Fixed path string used for configmaps.
# NOTE(review): presumably the in-container mount point referenced by the
# pod template — confirm against templates/pod.yml.erb.
#
# @return [String] '/ood'
def configmap_mount_path
  '/ood'
end
118
+
119
+ private
120
+
121
# Run a 'get' or 'delete' against a resource, treating "not found" as an
# empty result instead of an error.
#
# @param verb [#to_s] 'get' or 'delete'; any other verb does nothing
# @param resource [#to_s] kubernetes resource type (pod, service, ...)
# @param id [#to_s] name of the resource
# @return [Hash, String, nil] parsed json for 'get', kubectl output for
#   'delete', {} when NotFoundError was raised, nil for unknown verbs
def safe_call(verb, resource, id)
  case verb.to_s
  when "get"    then call_json_output('get', resource, id)
  when "delete" then call("#{namespaced_cmd} delete #{resource} #{id}")
  end
rescue NotFoundError
  {}
end
133
+
134
# Helper for formatting multi-line yaml data from the submit.yml as
# multi-line yaml in the pod.yml.erb: the first line is returned as is,
# every following line is prefixed with padding.
#
# @param data [#to_s] the raw data
# @return [Array<String>] the lines (line endings kept)
def config_data_lines(data)
  data.to_s.each_line.with_index.map do |line, index|
    index.zero? ? line : line.prepend(" ")
  end
end
147
+
148
# Login name as reported by Etc.getlogin, cached after the first
# truthy lookup.
#
# @return [String, nil]
def username
  return @username if @username

  @username = Etc.getlogin
end
151
+
152
# User name used in the kubectl context: the plain username, or
# "<username_prefix>-<username>" when a prefix is configured.
#
# @return [String]
def k8s_username
  return username if username_prefix.nil?

  "#{username_prefix}-#{username}"
end
155
+
156
# Numeric uid of #username according to the passwd database.
#
# @return [Integer]
def run_as_user
  passwd_entry = Etc.getpwnam(username)
  passwd_entry.uid
end
159
+
160
# Numeric primary gid of #username according to the passwd database.
#
# @return [Integer]
def run_as_group
  passwd_entry = Etc.getpwnam(username)
  passwd_entry.gid
end
163
+
164
# Same value as #run_as_group.
#
# @return [Integer]
def fs_group
  run_as_group
end
167
+
168
# Helper to template the resource yml you're going to submit and to
# create an id for it.
#
# The template is rendered with this method's binding, so the local
# variable names below are left exactly as they were — the template may
# read any of them (NOTE(review): confirm against templates/pod.yml.erb
# before renaming anything here).
#
# @param script [#native] the script; its native hash is read for
#   :container, :init_containers and :mounts (the helper reads more)
# @return [Array(String, String)] the rendered yml and the generated id
def generate_id_yml(script)
  native_data = script.native
  container = helper.container_from_native(native_data[:container])
  id = generate_id(container.name)
  configmap = helper.configmap_from_native(native_data, id)
  init_containers = helper.init_ctrs_from_native(native_data[:init_containers])
  spec = Kubernetes::Resources::PodSpec.new(container, init_containers: init_containers)
  # configured mounts first, then any mounts given in the native data
  all_mounts = if native_data[:mounts].nil?
                 mounts
               else
                 mounts + native_data[:mounts]
               end

  template = ERB.new(File.read(resource_file), nil, '-')

  [template.result(binding), id]
end
183
+
184
# Helper to call kubectl and get json data back.
# verb, resource and id are the kubernetes parlance terms.
# Example: 'kubectl get pod my-pod-id' is verb=get, resource=pod
# and id=my-pod-id.
#
# @param verb [#to_s] the kubectl verb
# @param resource [#to_s] the resource type
# @param id [#to_s] the resource name
# @param stdin [#to_s, nil] data passed to the command's stdin
# @return [Hash] parsed output with symbolized names; {} for empty output
def call_json_output(verb, resource, id, stdin: nil)
  raw = call("#{formatted_ns_cmd} #{verb} #{resource} #{id}", stdin: stdin)
  raw = '{}' if raw.empty?

  JSON.parse(raw, symbolize_names: true)
end
196
+
197
# Name of the service that belongs to a pod id (delegates to the helper).
#
# @param id [String] the pod id
# @return [String]
def service_name(id)
  helper.service_name(id)
end
200
+
201
# Name of the secret that belongs to a pod id (delegates to the helper).
#
# @param id [String] the pod id
# @return [String]
def secret_name(id)
  helper.secret_name(id)
end
204
+
205
# Name of the configmap that belongs to a pod id (delegates to the helper).
#
# @param id [String] the pod id
# @return [String]
def configmap_name(id)
  helper.configmap_name(id)
end
208
+
209
# The namespace commands are run in: the configured namespace prefix
# directly followed by the username.
#
# @return [String]
def namespace
  prefix = namespace_prefix
  "#{prefix}#{username}"
end
212
+
213
# The kubectl context name; it is the cluster name.
#
# @return [String]
def context
  cluster
end
216
+
217
# Default kubeconfig location: $KUBECONFIG when set, otherwise
# ~/.kube/config.
#
# @return [String]
def default_config_file
  from_env = ENV['KUBECONFIG']
  return from_env if from_env

  "#{Dir.home}/.kube/config"
end
220
+
221
# Auth settings used when the configuration has no :auth entry.
#
# The type is 'managed', which #managed? recognizes so that
# #configure_auth returns without doing anything. (The previous value was
# misspelled and only behaved the same because no auth type matched it.)
#
# @return [Hash{Symbol => String}]
def default_auth
  { type: 'managed' }
end
226
+
227
# Server settings used when the configuration has no :server entry.
#
# The literal already has symbol keys, so no key conversion is needed.
#
# @return [Hash{Symbol => String, nil}] :endpoint and :cert_authority_file
def default_server
  {
    endpoint: 'https://localhost:8080',
    cert_authority_file: nil
  }
end
233
+
234
# The namespaced kubectl command with json output requested.
#
# @return [String]
def formatted_ns_cmd
  [namespaced_cmd, '-o json'].join(' ')
end
237
+
238
# The base kubectl command scoped to #namespace.
#
# @return [String]
def namespaced_cmd
  ns_flag = "--namespace=#{namespace}"
  "#{base_cmd} #{ns_flag}"
end
241
+
242
# The kubectl invocation shared by every command: binary plus kubeconfig,
# and the context flag once a context has been set up.
#
# @return [String]
def base_cmd
  parts = ["#{bin} --kubeconfig=#{config_file}"]
  parts << "--context=#{context}" if using_context
  parts.join(' ')
end
247
+
248
# Convert the output of 'kubectl get pods -o json' into info objects.
# Pods for which #pod_info_from_json returns nil are dropped.
#
# @param data [String] raw kubectl output
# @return [Array] info objects; [] when the output is not json
def all_pods_to_info(data)
  pods = JSON.parse(data, symbolize_names: true).dig(:items)

  pods.map { |pod| pod_info_from_json(pod) }.compact
rescue JSON::ParserError
  # 'no resources in <namespace>' throws parse error
  []
end
263
+
264
# Build a K8sJobInfo for one pod hash.
#
# The configured namespace prefix is forwarded as ns_prefix so the helper
# can strip it from the pod's namespace when deriving job_owner; without
# it, filtering by owner fails whenever a namespace prefix is configured.
#
# @param pod [Hash] one pod as parsed from kubectl json output
# @return [K8sJobInfo, nil] nil when the helper raises K8sDataError
def pod_info_from_json(pod)
  hash = helper.pod_info_from_json(pod, ns_prefix: namespace_prefix)
  K8sJobInfo.new(hash)
rescue Helper::K8sDataError
  # FIXME: silently eating error, could probably use a logger
  nil
end
271
+
272
# Write the cluster and then the auth settings into the kubectl config.
#
# @param config [Hash] symbol-keyed options; :server and :auth are read,
#   falling back to #default_server and #default_auth
def make_kubectl_config(config)
  server_settings = config.fetch(:server, default_server).to_h.symbolize_keys
  set_cluster(server_settings)

  auth_settings = config.fetch(:auth, default_auth).to_h.symbolize_keys
  configure_auth(auth_settings)
end
276
+
277
# Configure authentication according to auth[:type]. Managed auth needs
# nothing; 'gke' and 'oidc' have dedicated setup; other types are ignored.
#
# @param auth [Hash] symbol-keyed auth settings; :type is required
# @raise [KeyError] when :type is missing
def configure_auth(auth)
  type = auth.fetch(:type)
  return if managed?(type)

  setup = {
    'gke' => -> { set_gke_config(auth) },
    'oidc' => -> { set_context }
  }[type]

  setup.call if setup
end
288
+
289
# Mark that a context has been configured, so #base_cmd adds the
# --context flag from now on.
#
# @return [true]
def use_context
  @using_context = true
end
292
+
293
# Whether the auth type means "managed". A nil type counts as managed
# (maybe should be false?).
#
# @param type [#to_s, nil] the configured auth type
# @return [Boolean]
def managed?(type)
  type.nil? || type.to_s == 'managed'
end
300
+
301
# Activate the gcloud service account and then fetch the cluster
# credentials.
#
# @param auth [Hash] auth settings; :svc_acct_file is required
# @raise [KeyError] when :svc_acct_file is missing
def set_gke_config(auth)
  key_file = auth.fetch(:svc_acct_file)
  call("gcloud auth activate-service-account --key-file=#{key_file}")

  set_gke_credentials(auth)
end
309
+
310
# Fetch the GKE cluster credentials into the configured kubeconfig file.
#
# The environment is passed to #call with the env: keyword. It used to be
# passed positionally, which #call's signature does not accept, so this
# method raised ArgumentError before gcloud was run.
#
# @param auth [Hash] auth settings; :zone and :region are optional, and
#   :region takes precedence when both are present
def set_gke_credentials(auth)
  zone = auth.fetch(:zone, nil)
  region = auth.fetch(:region, nil)

  locale = if !region.nil?
             "--region=#{region}"
           elsif !zone.nil?
             "--zone=#{zone}"
           else
             ''
           end

  # gke cluster name can probably differ from what ood calls the cluster
  cmd = "gcloud container clusters get-credentials #{locale} #{cluster}"
  call(cmd, env: { 'KUBECONFIG' => config_file })
end
324
+
325
# Create/update a kubectl context named after the cluster (with the
# user's namespace and k8s username) and start using it.
#
# @return [true]
def set_context
  pieces = [
    "#{base_cmd} config set-context #{cluster}",
    "--cluster=#{cluster} --namespace=#{namespace}",
    "--user=#{k8s_username}"
  ]

  call(pieces.join(' '))
  use_context
end
333
+
334
# Register the cluster's endpoint (and optional certificate authority)
# in the kubectl config.
#
# @param config [Hash] server settings; :endpoint is required,
#   :cert_authority_file is optional
# @raise [KeyError] when :endpoint is missing
def set_cluster(config)
  server = config.fetch(:endpoint)
  cert = config.fetch(:cert_authority_file, nil)

  pieces = ["#{base_cmd} config set-cluster #{cluster}", "--server=#{server}"]
  pieces << "--certificate-authority=#{cert}" unless cert.nil?

  call(pieces.join(' '))
end
344
+
345
# Run a command and return its stdout; on a non-zero exit the stderr is
# handed to #interpret_and_raise.
#
# @param cmd [String] the command line
# @param env [Hash{String => String}] extra environment for the command
# @param stdin [#to_s, nil] data written to the command's stdin
# @return [String] the command's stdout
def call(cmd = '', env: {}, stdin: nil)
  stdout, stderr, status = Open3.capture3(env, cmd, stdin_data: stdin.to_s)
  return stdout if status.success?

  interpret_and_raise(stderr)
end
349
+
350
# Turn a failed command's stderr into an exception.
#
# @param stderr [String] the stderr of the failed command
# @raise [NotFoundError] when a line starts with
#   "Error from server (NotFound):"
# @raise [Error] for every other failure
def interpret_and_raise(stderr)
  not_found = /^Error from server \(NotFound\):/.match(stderr)
  error_class = not_found ? NotFoundError : Error

  raise error_class, stderr
end
354
+ end
@@ -0,0 +1,294 @@
1
+ class OodCore::Job::Adapters::Kubernetes::Helper
2
+
3
+ require_relative 'resources'
4
+ require_relative 'k8s_job_info'
5
+ require 'resolv'
6
+ require 'base64'
7
+ require 'active_support/core_ext/hash'
8
+
9
# Raised when pod json cannot be read (a NoMethodError occurred while
# digging through the data).
class K8sDataError < StandardError
end
10
+
11
# Extract info from json data. The data is expected to be from the kubectl
# command and conform to kubernetes' datatype structures.
#
# Returns a K8sJobInfo in lieu of writing a connection.yml: the service
# and secret info are deep-merged into the pod info.
#
# @param pod_json [#to_h]
#   the pod data returned from 'kubectl get pod abc-123'
# @param service_json [#to_h]
#   the service data returned from 'kubectl get service abc-123-service'
# @param secret_json [#to_h]
#   the secret data returned from 'kubectl get secret abc-123-secret'
# @param ns_prefix [#to_s]
#   the namespace prefix so that namespaces can be converted back to usernames
# @raise [K8sDataError] when a NoMethodError occurs while reading the data
# @return [OodCore::Job::Adapters::Kubernetes::K8sJobInfo]
def info_from_json(pod_json: nil, service_json: nil, secret_json: nil, ns_prefix: nil)
  combined = pod_info_from_json(pod_json, ns_prefix: ns_prefix)
  extras = [service_info_from_json(service_json), secret_info_from_json(secret_json)]

  extras.each { |extra| combined.deep_merge!(extra) }
  K8sJobInfo.new(combined)
rescue NoMethodError
  raise K8sDataError, "unable to read data correctly from json"
end
36
+
37
# Turn a container hash into a Kubernetes::Resources::Container.
#
# @param container [#to_h]
#   the input container hash; :name and :image are passed positionally,
#   the remaining keys as keyword arguments (:env defaults to [])
# @return [OodCore::Job::Adapters::Kubernetes::Resources::Container]
def container_from_native(container)
  settings = {
    command: parse_command(container[:command]),
    port: container[:port],
    env: container.fetch(:env, []),
    memory: container[:memory],
    cpu: container[:cpu],
    working_dir: container[:working_dir],
    restart_policy: container[:restart_policy]
  }

  Kubernetes::Resources::Container.new(container[:name], container[:image], **settings)
end
55
+
56
# Parse a command string given from a user and return an array.
# If given an array, the input is simply returned back.
#
# @param cmd [#to_s, Array]
#   the command to parse
# @return [Array<#to_s>]
#   the command parsed into an array of arguments
def parse_command(cmd)
  return cmd if cmd.is_a?(Array)

  Shellwords.split(cmd.to_s)
end
70
+
71
# Turn the :configmap entry of the native data into a
# Kubernetes::Resources::ConfigMap that can be used in templates. Needs an
# id so that the resulting configmap has a known name.
#
# @param native [#to_h]
#   the native data; its :configmap entry is read
# @param id [#to_s]
#   the id to use for giving the configmap a name
# @return [OodCore::Job::Adapters::Kubernetes::Resources::ConfigMap, nil]
#   nil when the native data has no :configmap
def configmap_from_native(native, id)
  raw = native.fetch(:configmap, nil)

  unless raw.nil?
    Kubernetes::Resources::ConfigMap.new(configmap_name(id), raw[:filename], raw[:data])
  end
end
90
+
91
# Parse initialization containers from native data.
#
# @param ctrs [Array<#to_h>, nil]
#   the init container hashes; nil is treated as "none"
# @return [Array<OodCore::Job::Adapters::Kubernetes::Resources::Container>]
#   the array of init containers
def init_ctrs_from_native(ctrs)
  return [] if ctrs.nil?

  ctrs.map { |raw| container_from_native(raw) }
end
108
+
109
# Name of the service that belongs to a pod id.
#
# @param id [String] the pod id
# @return [String] the id followed by '-service'
def service_name(id)
  suffix = '-service'
  id + suffix
end
112
+
113
# Name of the secret that belongs to a pod id.
#
# @param id [String] the pod id
# @return [String] the id followed by '-secret'
def secret_name(id)
  suffix = '-secret'
  id + suffix
end
116
+
117
# Name of the configmap that belongs to a pod id.
#
# @param id [String] the pod id
# @return [String] the id followed by '-configmap'
def configmap_name(id)
  suffix = '-configmap'
  id + suffix
end
120
+
121
# Format a number of seconds as "HHhMMmSSs".
#
# @param s [Integer] the number of seconds
# @return [String] e.g. 3661 => "01h01m01s"
def seconds_to_duration(s)
  hours = s / 3600
  minutes = s / 60 % 60
  seconds = s % 60

  format("%02dh%02dm%02ds", hours, minutes, seconds)
end
124
+
125
# Extract pod info from json data. The data is expected to be from the kubectl
# command and conform to kubernetes' datatype structures.
#
# @param json_data [#to_h]
#   the pod data returned from 'kubectl get pod abc-123'
# @param ns_prefix [#to_s]
#   the namespace prefix so that namespaces can be converted back to usernames
# @raise [K8sDataError] when a NoMethodError occurs while reading the data
# @return [#to_h]
#   the hash of info expected from adapters
def pod_info_from_json(json_data, ns_prefix: nil)
  info = {}
  info[:id] = json_data.dig(:metadata, :name).to_s
  info[:job_name] = name_from_metadata(json_data.dig(:metadata))
  info[:status] = pod_status_from_json(json_data)
  info[:job_owner] = job_owner_from_json(json_data, ns_prefix)
  info[:submission_time] = submission_time(json_data)
  info[:dispatch_time] = dispatch_time(json_data)
  info[:wallclock_time] = wallclock_time(json_data)
  info[:ood_connection_info] = { host: get_host(json_data.dig(:status, :hostIP)) }
  info[:procs] = procs_from_json(json_data)
  info
rescue NoMethodError
  # gotta raise an error because Info.new will throw an error if id is undefined
  raise K8sDataError, "unable to read data correctly from json"
end
150
+
151
+ private
152
+
153
# Reverse-resolve an ip to a host name, falling back to the ip itself
# when it cannot be resolved.
#
# @param ip [String, nil] the ip address
# @return [String, nil] the host name, or ip on Resolv::ResolvError
def get_host(ip)
  begin
    Resolv.getname(ip)
  rescue Resolv::ResolvError
    ip
  end
end
158
+
159
# Pick a job name from pod metadata: the 'app.kubernetes.io/name' label,
# else the 'k8s-app' label, else the pod's own name.
#
# @param metadata [Hash] the pod's metadata
# @return [String, nil]
def name_from_metadata(metadata)
  app_name = metadata.dig(:labels, :'app.kubernetes.io/name')
  return app_name unless app_name.nil?

  k8s_app = metadata.dig(:labels, :'k8s-app')
  return k8s_app unless k8s_app.nil?

  metadata.dig(:name) # pod-id but better than nil?
end
165
+
166
# Extract the connection port from service json.
#
# @param json_data [Hash] the service data
# @return [Hash] { ood_connection_info: { port: ... } }, or {} when
#   anything goes wrong while reading the data
def service_info_from_json(json_data)
  # all we need is the port - .spec.ports[0].nodePort
  first_port = json_data.dig(:spec, :ports)[0]
  { ood_connection_info: { port: first_port.dig(:nodePort) } }
rescue StandardError
  {}
end
173
+
174
# Extract and base64-decode the password from secret json.
#
# @param json_data [Hash] the secret data
# @return [Hash] { ood_connection_info: { password: ... } }, or {} when
#   anything goes wrong while reading the data
def secret_info_from_json(json_data)
  encoded = json_data.dig(:data, :password)
  decoded = Base64.decode64(encoded)

  { ood_connection_info: { password: decoded } }
rescue StandardError
  {}
end
180
+
181
# Epoch seconds at which the first container started, read from the
# terminated state for completed pods and from the running state for
# running pods.
#
# @param json_data [Hash] the pod data
# @return [Integer, nil] nil when there are no container statuses, the
#   status is neither completed nor running, or no start time is recorded
def dispatch_time(json_data)
  status = pod_status_from_json(json_data)
  container_statuses = json_data.dig(:status, :containerStatuses)
  return nil if container_statuses.nil?

  state_data = container_statuses[0].dig(:state)
  started_at =
    if status == 'completed'
      state_data.dig(:terminated, :startedAt)
    elsif status == 'running'
      state_data.dig(:running, :startedAt)
    end
  return nil if started_at.nil?

  DateTime.parse(started_at).to_time.to_i
end
197
+
198
# Seconds between the dispatch time and the end time of the pod.
#
# @param json_data [Hash] the pod data
# @return [Integer, nil] nil when there are no container statuses, or
#   when either the dispatch time or the end time is unknown
def wallclock_time(json_data)
  status = pod_status_from_json(json_data)
  container_statuses = json_data.dig(:status, :containerStatuses)
  return nil if container_statuses.nil?

  state_data = container_statuses[0].dig(:state)
  started = dispatch_time(json_data)
  return nil if started.nil?

  finished = end_time(status, state_data)
  return nil if finished.nil?

  finished - started
end
211
+
212
# Epoch seconds at which the pod ended: the terminated state's finishedAt
# for completed pods, the current time for running pods.
#
# @param status [Object] the pod status, compared against 'completed'
#   and 'running'
# @param state_data [Hash] the first container's state
# @return [Integer, nil] nil for any other status
def end_time(status, state_data)
  if status == 'completed'
    finished_at = state_data.dig(:terminated, :finishedAt)
    DateTime.parse(finished_at).to_time.to_i
  elsif status == 'running'
    DateTime.now.to_time.to_i
  end
end
224
+
225
# Epoch seconds at which the pod was submitted: status.startTime, or the
# first condition's lastTransitionTime when there is no start time yet.
#
# @param json_data [Hash] the pod data
# @return [Integer]
def submission_time(json_data)
  status = json_data.dig(:status)
  timestamp = status.dig(:startTime)

  # the pod is in some pending state limbo: best guess to start time is
  # just the first condition's transition time
  timestamp = status.dig(:conditions)[0].dig(:lastTransitionTime) if timestamp.nil?

  DateTime.parse(timestamp).to_time.to_i
end
241
+
242
# Map the pod's phase onto an OodCore::Job::Status. Unrecognized or
# missing phases are reported as 'undetermined'.
#
# @param json_data [Hash] the pod data
# @return [OodCore::Job::Status]
def pod_status_from_json(json_data)
  phase_to_state = {
    "Running" => "running",
    "Pending" => "queued",
    "Failed" => "suspended",
    "Succeeded" => "completed",
    "Unknown" => "undetermined"
  }
  state = phase_to_state.fetch(json_data.dig(:status, :phase), "undetermined")

  OodCore::Job::Status.new(state: state)
end
261
+
262
# State name for a terminated container: 'suspended' when it terminated
# with reason 'Error', 'completed' otherwise.
#
# @param status [Hash] a container state hash
# @return [String]
def terminated_state(status)
  status.dig(:terminated, :reason) == 'Error' ? 'suspended' : 'completed'
end
270
+
271
# Number of processors of the first container, from its cpu limit.
#
# A millicore limit ("200m", "1500m") is rounded up to whole cpus with a
# minimum of 1, so 200m => "1", 1000m => "1" and 1500m => "2". (The
# previous arithmetic added a full 1000 before dividing, which reported
# one cpu too many for exact multiples such as 1000m.) Any other value is
# returned unchanged.
#
# @param json_data [Hash] the pod data
# @return [String, nil] the cpu count as a string, or the raw limit
#   (possibly nil) when it is not expressed in millicores
def procs_from_json(json_data)
  containers = json_data.dig(:spec, :containers)
  resources = containers[0].dig(:resources)

  cpu = resources.dig(:limits, :cpu)

  # ok to return string bc nil.to_i == 0 and we'd rather return
  # nil (undefined) than 0 which is confusing.
  millicores_match = /(\d+)m/.match(cpu)
  return cpu if millicores_match.nil?

  millicores = millicores_match[1].to_i

  # round up, and have to return at least 1 bc 0m would otherwise be 0
  [(millicores + 999) / 1000, 1].max.to_s
end
289
+
290
# Derive the job owner from the pod's namespace by removing the
# namespace prefix.
#
# @param json_data [Hash] the pod data
# @param ns_prefix [#to_s, nil] the prefix to strip; nil strips nothing
# @return [String] '' when the pod has no namespace
def job_owner_from_json(json_data = {}, ns_prefix = nil)
  prefix = ns_prefix.to_s
  json_data.dig(:metadata, :namespace).to_s.delete_prefix(prefix)
end
294
+ end