konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. konduktor/__init__.py +49 -0
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/aws.py +221 -0
  4. konduktor/adaptors/common.py +118 -0
  5. konduktor/adaptors/gcp.py +126 -0
  6. konduktor/authentication.py +124 -0
  7. konduktor/backends/__init__.py +6 -0
  8. konduktor/backends/backend.py +86 -0
  9. konduktor/backends/constants.py +21 -0
  10. konduktor/backends/deployment.py +204 -0
  11. konduktor/backends/deployment_utils.py +1351 -0
  12. konduktor/backends/jobset.py +225 -0
  13. konduktor/backends/jobset_utils.py +726 -0
  14. konduktor/backends/pod_utils.py +501 -0
  15. konduktor/check.py +184 -0
  16. konduktor/cli.py +1945 -0
  17. konduktor/config.py +420 -0
  18. konduktor/constants.py +36 -0
  19. konduktor/controller/__init__.py +0 -0
  20. konduktor/controller/constants.py +56 -0
  21. konduktor/controller/launch.py +44 -0
  22. konduktor/controller/node.py +116 -0
  23. konduktor/controller/parse.py +111 -0
  24. konduktor/dashboard/README.md +30 -0
  25. konduktor/dashboard/backend/main.py +169 -0
  26. konduktor/dashboard/backend/sockets.py +154 -0
  27. konduktor/dashboard/frontend/.eslintrc.json +3 -0
  28. konduktor/dashboard/frontend/.gitignore +36 -0
  29. konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
  30. konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
  31. konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
  32. konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
  33. konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
  34. konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
  35. konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
  36. konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
  37. konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
  38. konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
  39. konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
  40. konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
  41. konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
  42. konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
  43. konduktor/dashboard/frontend/app/favicon.ico +0 -0
  44. konduktor/dashboard/frontend/app/globals.css +120 -0
  45. konduktor/dashboard/frontend/app/jobs/page.js +10 -0
  46. konduktor/dashboard/frontend/app/layout.js +22 -0
  47. konduktor/dashboard/frontend/app/logs/page.js +11 -0
  48. konduktor/dashboard/frontend/app/page.js +12 -0
  49. konduktor/dashboard/frontend/jsconfig.json +7 -0
  50. konduktor/dashboard/frontend/next.config.mjs +4 -0
  51. konduktor/dashboard/frontend/package-lock.json +6687 -0
  52. konduktor/dashboard/frontend/package.json +37 -0
  53. konduktor/dashboard/frontend/postcss.config.mjs +8 -0
  54. konduktor/dashboard/frontend/server.js +64 -0
  55. konduktor/dashboard/frontend/tailwind.config.js +17 -0
  56. konduktor/data/__init__.py +9 -0
  57. konduktor/data/aws/__init__.py +15 -0
  58. konduktor/data/aws/s3.py +1138 -0
  59. konduktor/data/constants.py +7 -0
  60. konduktor/data/data_utils.py +268 -0
  61. konduktor/data/gcp/__init__.py +19 -0
  62. konduktor/data/gcp/constants.py +42 -0
  63. konduktor/data/gcp/gcs.py +994 -0
  64. konduktor/data/gcp/utils.py +9 -0
  65. konduktor/data/registry.py +19 -0
  66. konduktor/data/storage.py +812 -0
  67. konduktor/data/storage_utils.py +535 -0
  68. konduktor/execution.py +447 -0
  69. konduktor/kube_client.py +237 -0
  70. konduktor/logging.py +111 -0
  71. konduktor/manifests/aibrix-setup.yaml +430 -0
  72. konduktor/manifests/apoxy-setup.yaml +184 -0
  73. konduktor/manifests/apoxy-setup2.yaml +98 -0
  74. konduktor/manifests/controller_deployment.yaml +69 -0
  75. konduktor/manifests/dashboard_deployment.yaml +131 -0
  76. konduktor/manifests/dmesg_daemonset.yaml +57 -0
  77. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  78. konduktor/resource.py +546 -0
  79. konduktor/serving.py +153 -0
  80. konduktor/task.py +949 -0
  81. konduktor/templates/deployment.yaml.j2 +191 -0
  82. konduktor/templates/jobset.yaml.j2 +43 -0
  83. konduktor/templates/pod.yaml.j2 +563 -0
  84. konduktor/usage/__init__.py +0 -0
  85. konduktor/usage/constants.py +21 -0
  86. konduktor/utils/__init__.py +0 -0
  87. konduktor/utils/accelerator_registry.py +17 -0
  88. konduktor/utils/annotations.py +62 -0
  89. konduktor/utils/base64_utils.py +95 -0
  90. konduktor/utils/common_utils.py +426 -0
  91. konduktor/utils/constants.py +5 -0
  92. konduktor/utils/env_options.py +55 -0
  93. konduktor/utils/exceptions.py +234 -0
  94. konduktor/utils/kubernetes_enums.py +8 -0
  95. konduktor/utils/kubernetes_utils.py +763 -0
  96. konduktor/utils/log_utils.py +467 -0
  97. konduktor/utils/loki_utils.py +102 -0
  98. konduktor/utils/rich_utils.py +123 -0
  99. konduktor/utils/schemas.py +625 -0
  100. konduktor/utils/subprocess_utils.py +273 -0
  101. konduktor/utils/ux_utils.py +247 -0
  102. konduktor/utils/validator.py +461 -0
  103. konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
  104. konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
  105. konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
  106. konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
  107. konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,763 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Kubernetes utilities."""
14
+
15
+ import functools
16
+ import math
17
+ import os
18
+ import re
19
+ import typing
20
+ from typing import Any, Dict, List, Optional, Tuple, Union
21
+
22
+ import filelock
23
+ import kubernetes
24
+ import yaml # type: ignore
25
+
26
+ from konduktor import config, kube_client, logging
27
+ from konduktor.backends import constants as backend_constants
28
+ from konduktor.utils import common_utils, kubernetes_enums
29
+
30
+ if typing.TYPE_CHECKING:
31
+ pass
32
+
33
# Namespace used when neither the kubeconfig context nor the in-cluster
# service-account mount specifies one.
DEFAULT_NAMESPACE = 'default'

DEFAULT_SERVICE_ACCOUNT_NAME = 'konduktor-service-account'

# RFC 1123 label: lowercase alphanumerics and '-', starting and ending
# with an alphanumeric. Used by is_k8s_resource_name_valid().
DNS_SUBDOMAIN_REGEX = r'^[a-z0-9]([-a-z0-9]*[a-z0-9])?$'

# Multipliers for parsing Kubernetes memory quantities, keyed by the
# first letter of the unit suffix (binary multiples: 'K' is 2**10, etc.).
MEMORY_SIZE_UNITS = {
    'B': 1,
    'K': 2**10,
    'M': 2**20,
    'G': 2**30,
    'T': 2**40,
    'P': 2**50,
}

# The resource key used by Kubernetes to track NVIDIA GPUs on nodes.
# This key appears in a node's status.allocatable / status.capacity
# fields to indicate the GPU resources available on the node.
GPU_RESOURCE_KEY = 'nvidia.com/gpu'

# NOTE(review): the message below has an unmatched '(' before "e.g." —
# confirm intended wording before changing the user-facing string.
NO_ACCELERATOR_HELP_MESSAGE = (
    'If your cluster contains GPUs, make sure '
    f'{GPU_RESOURCE_KEY} resource is available '
    'on the nodes and the node labels for identifying GPUs '
    '(e.g. `nvidia.com/gpu` are setup correctly. '
)

# Cross-process file lock serializing Kubernetes client mutations made by
# this module (see set_secret below).
_K8S_CLIENT_LOCK_PATH = os.path.expanduser('~/.konduktor/k8s_client.lock')
_K8s_CLIENT_LOCK = filelock.FileLock(_K8S_CLIENT_LOCK_PATH)

logger = logging.get_logger(__name__)
64
+
65
+
66
class GPULabelFormatter:
    """Abstract interface mapping GPU types to Kubernetes node labels.

    Each concrete formatter describes how a particular cluster flavor
    labels its GPU nodes, so callers can construct the key:value pair to
    use as a node selector for GPU nodes.
    """

    @classmethod
    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
        """Return the node-label key identifying the GPU type."""
        raise NotImplementedError

    @classmethod
    def get_label_keys(cls) -> List[str]:
        """Return every node-label key this formatter recognizes."""
        raise NotImplementedError

    @classmethod
    def get_label_value(cls, accelerator: str) -> str:
        """Return the label value encoding the given GPU type."""
        raise NotImplementedError

    @classmethod
    def match_label_key(cls, label_key: str) -> bool:
        """Return True if `label_key` is one of this formatter's keys."""
        raise NotImplementedError

    @classmethod
    def get_accelerator_from_label_value(cls, value: str) -> str:
        """Return the GPU type encoded by a label value."""
        raise NotImplementedError

    @classmethod
    def validate_label_value(cls, value: str) -> Tuple[bool, str]:
        """Check whether a label value is well-formed for this formatter.

        Used to pre-emptively surface cluster mislabeling; the base
        implementation accepts every value.

        Returns:
            Tuple of (is_valid, error_message); the message is empty when
            the value is valid.
        """
        del value  # Base class accepts everything unconditionally.
        return True, ''
112
+
113
+
114
def get_gke_accelerator_name(accelerator: str) -> str:
    """Map a GPU type to the GKE accelerator label value.

    GKE labels most GPUs as ``nvidia-tesla-<name>``; A100-80GB, L4,
    H100-80GB and H100-MEGA-80GB instead use the ``nvidia-<name>`` form.
    A bare 'H100' is first canonicalized to 'H100-80GB', GKE's name for it.
    """
    # GKE advertises H100 under its 80GB-qualified name.
    canonical = 'H100-80GB' if accelerator == 'H100' else accelerator
    non_tesla_names = ('A100-80GB', 'L4', 'H100-80GB', 'H100-MEGA-80GB')
    prefix = 'nvidia-' if canonical in non_tesla_names else 'nvidia-tesla-'
    return prefix + canonical.lower()
130
+
131
+
132
class GKELabelFormatter(GPULabelFormatter):
    """Label formatter for GKE clusters.

    GKE populates nodes with the `cloud.google.com/gke-accelerator`
    label, whose value identifies the GPU type on the node.
    """

    GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
    ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'

    @classmethod
    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
        return cls.GPU_LABEL_KEY

    @classmethod
    def get_label_keys(cls) -> List[str]:
        return [cls.GPU_LABEL_KEY]

    @classmethod
    def match_label_key(cls, label_key: str) -> bool:
        return label_key in cls.get_label_keys()

    @classmethod
    def get_label_value(cls, accelerator: str) -> str:
        return get_gke_accelerator_name(accelerator)

    @classmethod
    def get_accelerator_from_label_value(cls, value: str) -> str:
        """Invert get_gke_accelerator_name: label value -> GPU type."""
        if value.startswith('nvidia-tesla-'):
            return value.replace('nvidia-tesla-', '').upper()
        if value.startswith('nvidia-'):
            gpu_name = value.replace('nvidia-', '').upper()
            # GKE exposes both H100-80GB (a3-high) and H100-MEGA-80GB
            # (a3-mega); plain 'H100' maps to the former, while the MEGA
            # variant is kept verbatim to distinguish the two.
            return 'H100' if gpu_name == 'H100-80GB' else gpu_name
        raise ValueError(f'Invalid accelerator name in GKE cluster: {value}')
172
+
173
+
174
class GFDLabelFormatter(GPULabelFormatter):
    """Label formatter for NVIDIA GPU Feature Discovery (GFD).

    GFD labels NVIDIA GPU nodes like
    `nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3`
    (https://github.com/NVIDIA/gpu-feature-discovery), and ships as part
    of the NVIDIA GPU Operator:
    https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html

    `get_label_value` is unsupported because a single accelerator may map
    to several GFD label values, so this formatter cannot be used on
    autoscaling clusters.
    """

    LABEL_KEY = 'nvidia.com/gpu.product'

    @classmethod
    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
        return cls.LABEL_KEY

    @classmethod
    def get_label_keys(cls) -> List[str]:
        return [cls.LABEL_KEY]

    @classmethod
    def get_label_value(cls, accelerator: str) -> str:
        """Unsupported: one accelerator maps to many GFD labels
        (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB)."""
        raise NotImplementedError

    @classmethod
    def match_label_key(cls, label_key: str) -> bool:
        return label_key == cls.LABEL_KEY

    @classmethod
    def get_accelerator_from_label_value(cls, value: str) -> str:
        """Map a GFD label value to a canonical GPU name.

        Tries a fixed list of canonical names first, then falls back to
        normalizing the raw label value.
        """
        # Order matters: more specific names ('A100-80GB', 'T4g') must be
        # tried before their prefixes ('A100', 'T4').
        canonical_gpu_names = [
            'A100-80GB',
            'A100',
            'A10G',
            'H100',
            'K80',
            'M60',
            'T4g',
            'T4',
            'V100',
            'A10',
            'P4000',
            'P100',
            'P40',
            'P4',
            'L40',
            'L4',
        ]
        for candidate in canonical_gpu_names:
            # A100-80GB appears as A100-SXM-80GB or A100-PCIE-80GB.
            if candidate == 'A100-80GB' and re.search(r'A100.*-80GB', value):
                return candidate
            # Word-boundary match prevents substring hits (e.g. 'T4'
            # inside 'T4g').
            if re.search(rf'\b{re.escape(candidate)}\b', value):
                return candidate

        # No canonical match: strip vendor/brand prefixes, e.g.
        # 'NVIDIA-RTX-A6000' -> 'RTX-A6000',
        # 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070',
        # 'RTX-6000' -> 'RTX6000'.
        normalized = value.upper()
        for old, new in (('NVIDIA-', ''), ('GEFORCE-', ''), ('RTX-', 'RTX')):
            normalized = normalized.replace(old, new)
        return normalized
251
+
252
+
253
# LABEL_FORMATTER_REGISTRY stores the label formats that will try to
# discover the accelerator type from. The order of the list is important:
# it determines the priority of the label formats when auto-detecting the
# GPU label type (see detect_gpu_label_formatter).
LABEL_FORMATTER_REGISTRY = [GKELabelFormatter, GFDLabelFormatter]

# Mapping of configured autoscaler type to the label formatter its nodes
# are expected to use.
AUTOSCALER_TO_LABEL_FORMATTER = {
    kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter,
}
263
+
264
+
265
@functools.lru_cache()
def get_current_kube_config_context_name() -> Optional[str]:
    """Get the active Kubernetes context name.

    Precedence:
      1) The first entry in `kubernetes.allowed_contexts` (if configured).
      2) kubeconfig's current-context (fallback when not configured).

    Returns:
        str | None: The selected context if it exists, None otherwise.
    """
    # 1) Prefer a user-configured allowed context, falling back silently
    #    if config loading fails unexpectedly.
    try:
        allowed: Optional[List[str]] = config.get_nested(
            ('kubernetes', 'allowed_contexts'), None
        )
        if allowed:
            chosen = allowed[0]
            logger.info(
                'Detected kubernetes.allowed_contexts in config; using context: %s',
                chosen,
            )
            return chosen
    except Exception:
        pass

    # 2) Fall back to kubeconfig's current context.
    try:
        _, current = kubernetes.config.list_kube_config_contexts()
        return current['name']
    except kubernetes.config.config_exception.ConfigException:
        return None
298
+
299
+
300
@functools.lru_cache(maxsize=10)
def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
    """Gets the kubernetes nodes in the context.

    If context is None, gets the nodes in the current context.

    Args:
        context: Kubernetes context name; None selects the current one.

    Returns:
        The list of node objects reported by the cluster.
    """
    if context is None:
        context = get_current_kube_config_context_name()

    # Bug fix: the timeout constant lives on the project's kube_client
    # wrapper, not the `kubernetes` package (which defines no API_TIMEOUT
    # attribute, so the old `kubernetes.API_TIMEOUT` raised
    # AttributeError). This matches check_credentials and
    # check_secret_exists elsewhere in this module.
    nodes = (
        kube_client.core_api(context)
        .list_node(_request_timeout=kube_client.API_TIMEOUT)
        .items
    )
    return nodes
315
+
316
+
317
@functools.lru_cache()
def detect_gpu_label_formatter(
    context: Optional[str],
) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]:
    """Detects the GPU label formatter for the Kubernetes cluster.

    Returns:
        GPULabelFormatter: The GPU label formatter for the cluster, if found.
        Dict[str, List[Tuple[str, str]]]: A mapping of nodes to the list of
            labels on each node, e.g. {'node1': [('label1', 'value1')]}.
    """
    # Collect every (label, value) pair on every node, keyed by node name.
    node_labels: Dict[str, List[Tuple[str, str]]] = {}
    for node in get_kubernetes_nodes(context):
        node_labels[node.metadata.name] = [
            (label, value) for label, value in node.metadata.labels.items()
        ]

    # Registry order encodes priority: the first formatter whose label key
    # appears on any node wins.
    for formatter_cls in LABEL_FORMATTER_REGISTRY:
        for label_list in node_labels.values():
            if any(formatter_cls.match_label_key(key) for key, _ in label_list):
                return formatter_cls(), node_labels

    return None, node_labels
347
+
348
+
349
@functools.lru_cache()
def get_kube_config_context_namespace(context_name: Optional[str] = None) -> str:
    """Get the namespace associated with a kubernetes context.

    Resolution order:
      1) If no context is given, use the first configured
         `kubernetes.allowed_contexts` entry as the context name.
      2) For the in-cluster context (or still-unknown context), read the
         service account's mounted namespace file if it exists.
      3) Otherwise, use the `namespace` field of the matching kubeconfig
         context.
      4) Fall back to DEFAULT_NAMESPACE.

    Returns:
        str: The resolved namespace (always a string; DEFAULT_NAMESPACE
        is the final fallback).
    """
    k8s = kubernetes
    # Path where Kubernetes mounts the pod's service-account namespace.
    ns_path = '/var/run/secrets/kubernetes.io/serviceaccount/namespace'

    # If no explicit context provided, prefer configured allowed context first.
    if context_name is None:
        try:
            allowed_contexts: Optional[List[str]] = config.get_nested(
                ('kubernetes', 'allowed_contexts'), None
            )
            if allowed_contexts:
                context_name = allowed_contexts[0]
        except Exception:
            # Best-effort: fall through to kubeconfig-based resolution.
            pass

    # If using in-cluster context, get the namespace from the SA namespace file.
    if context_name == kube_client.in_cluster_context_name() or context_name is None:
        if os.path.exists(ns_path):
            with open(ns_path, encoding='utf-8') as f:
                return f.read().strip()

    # If not in-cluster, get the namespace from kubeconfig.
    try:
        contexts, current_context = k8s.config.list_kube_config_contexts()
        if context_name is None:
            context = current_context
        else:
            context = next((c for c in contexts if c['name'] == context_name), None)
        if context is None:
            # Named context not present in kubeconfig.
            return DEFAULT_NAMESPACE

        if 'namespace' in context['context']:
            return context['context']['namespace']
        else:
            return DEFAULT_NAMESPACE
    except k8s.config.config_exception.ConfigException:
        # No usable kubeconfig at all.
        return DEFAULT_NAMESPACE
393
+
394
+
395
def check_credentials(
    context: Optional[str], timeout: int = kube_client.API_TIMEOUT
) -> Tuple[bool, Optional[str]]:
    """Check if the credentials in kubeconfig file are valid.

    Probes the cluster by listing pods in the context's namespace.

    Args:
        context (Optional[str]): The Kubernetes context to use. If none, uses
            in-cluster auth to check credentials, if available.
        timeout (int): Timeout in seconds for the test API call.

    Returns:
        bool: True if credentials are valid, False otherwise.
        str: Error message if credentials are invalid, None otherwise.
    """
    try:
        namespace = get_kube_config_context_namespace(context)
        kube_client.core_api(context).list_namespaced_pod(
            namespace, _request_timeout=timeout
        )
        return True, None
    except ImportError:
        # The kubernetes client library is an optional dependency.
        return False, (
            '`kubernetes` package is not installed. '
            'Install it with: pip install kubernetes'
        )
    except kube_client.api_exception() as e:
        # Check if the error is due to invalid credentials
        if e.status == 401:
            return (
                False,
                'Invalid credentials - do you have permission '
                'to access the cluster?',
            )
        else:
            return False, f'Failed to communicate with the cluster: {str(e)}'
    except kube_client.config_exception() as e:
        return False, f'Invalid configuration file: {str(e)}'
    except kube_client.max_retry_error():
        return False, (
            'Failed to communicate with the cluster - timeout. '
            'Check if your cluster is running and your network '
            'is stable.'
        )
    except ValueError as e:
        return False, common_utils.format_exception(e)
    except Exception as e:  # pylint: disable=broad-except
        # Catch-all so a credential probe never crashes the caller.
        return False, (
            'An error occurred: '
            f'{common_utils.format_exception(e, use_bracket=True)}'
        )
445
+
446
+
447
def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
    """Convert a Kubernetes CPU/GPU quantity to a number of whole units.

    Millicore quantities (e.g. '500m') are rounded up to the next whole
    unit; everything else is parsed as a plain float.
    """
    quantity = str(resource_qty_str)
    if quantity[-1] == 'm':
        # e.g. '500m' rounds up to 1.
        return math.ceil(int(quantity[:-1]) / 1000)
    return float(quantity)
454
+
455
+
456
def parse_memory_resource(resource_qty_str: str, unit: str = 'B') -> Union[int, float]:
    """Returns memory size in chosen units given a resource quantity string."""
    if unit not in MEMORY_SIZE_UNITS:
        valid_units = ', '.join(MEMORY_SIZE_UNITS.keys())
        raise ValueError(f'Invalid unit: {unit}. Valid units are: {valid_units}')

    quantity = str(resource_qty_str)
    total_bytes: Union[int, float]
    try:
        # A bare integer is already a byte count.
        total_bytes = int(quantity)
    except ValueError:
        # Separate number from suffix (e.g. '1.5Gi' -> '1.5', 'Gi');
        # only the first suffix letter selects the multiplier.
        spaced = re.sub(r'([KMGTPB]+)', r' \1', quantity)
        number, suffix = (part.strip() for part in spaced.split())
        total_bytes = float(number) * MEMORY_SIZE_UNITS[suffix[0]]
    return total_bytes / MEMORY_SIZE_UNITS[unit]
472
+
473
+
474
def combine_pod_config_fields(
    cluster_yaml_path: str,
    cluster_config_overrides: Dict[str, Any],
) -> None:
    """Adds or updates fields in the YAML with fields from the
    ~/.konduktor/config's kubernetes.pod_spec dict.

    This can be used to add fields to the YAML that are not supported
    yet, or that require simple configuration (e.g., adding an
    imagePullSecrets field).
    Note that new fields are added and existing ones are updated. Nested
    fields are not completely replaced, instead their objects are merged.
    Similarly, if a list is encountered in the config, it will be appended
    to the destination list.
    For example, if the YAML has the following:
    ```
    ...
    node_config:
        spec:
            containers:
                - name: ray
                image: rayproject/ray:nightly
    ```
    and the config has the following:
    ```
    kubernetes:
        pod_config:
            spec:
                imagePullSecrets:
                    - name: my-secret
    ```
    then the resulting YAML will be:
    ```
    ...
    node_config:
        spec:
            containers:
                - name: ray
                image: rayproject/ray:nightly
            imagePullSecrets:
                - name: my-secret
    ```
    """
    with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
        yaml_content = f.read()
    yaml_obj = yaml.safe_load(yaml_content)
    # We don't use override_configs in `konduktor_config.get_nested`, as merging
    # the pod config requires special handling.
    kubernetes_config = config.get_nested(
        ('kubernetes', 'pod_config'), default_value={}, override_configs={}
    )
    override_pod_config = cluster_config_overrides.get('kubernetes', {}).get(
        'pod_config', {}
    )
    # NOTE(review): elsewhere in this file merge_k8s_configs(src, dst)
    # mutates its *second* argument (see combine_metadata_fields), so the
    # merged result appears to land in `kubernetes_config`, while the
    # un-merged `override_pod_config` is what gets written below — confirm
    # the intended merge direction.
    config.merge_k8s_configs(override_pod_config, kubernetes_config)

    yaml_obj['kubernetes']['pod_config'] = override_pod_config

    # Write the updated YAML back to the file
    common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
533
+
534
+
535
def combine_metadata_fields(cluster_yaml_path: str) -> None:
    """Updates the metadata for all Kubernetes objects created with
    fields from the ~/.konduktor/config's kubernetes.custom_metadata dict.

    Obeys the same add or update semantics as combine_pod_config_fields().
    """

    with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
        yaml_content = f.read()
    yaml_obj = yaml.safe_load(yaml_content)
    custom_metadata = config.get_nested(('kubernetes', 'custom_metadata'), {})

    # List of objects in the cluster YAML to be updated
    combination_destinations = [
        # Service accounts
        yaml_obj['provider']['autoscaler_service_account']['metadata'],
        yaml_obj['provider']['autoscaler_role']['metadata'],
        yaml_obj['provider']['autoscaler_role_binding']['metadata'],
        # NOTE(review): duplicate of the first entry above; merging the
        # same dict twice is harmless but looks unintentional — confirm.
        yaml_obj['provider']['autoscaler_service_account']['metadata'],
        # Pod spec
        yaml_obj['available_node_types']['ray_head_default']['node_config']['metadata'],
        # Services for pods
        *[svc['metadata'] for svc in yaml_obj['provider']['services']],
    ]

    for destination in combination_destinations:
        config.merge_k8s_configs(custom_metadata, destination)

    # Write the updated YAML back to the file
    common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
565
+
566
+
567
def merge_custom_metadata(original_metadata: Dict[str, Any]) -> None:
    """Merge `kubernetes.custom_metadata` from the user config into
    `original_metadata`.

    The merge mutates `original_metadata` in place, so nothing is
    returned.
    """
    user_metadata = config.get_nested(('kubernetes', 'custom_metadata'), {})
    config.merge_k8s_configs(user_metadata, original_metadata)
574
+
575
+
576
def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
    """Return True iff the cluster defines a RuntimeClass named 'nvidia'."""
    # Fetch all RuntimeClasses and scan for the NVIDIA one.
    runtime_classes = kube_client.node_api(context).list_runtime_class()
    return any(rc.metadata.name == 'nvidia' for rc in runtime_classes.items)
584
+
585
+
586
def check_secret_exists(
    secret_name: str, namespace: str, context: Optional[str]
) -> Tuple[bool, Union[str, Dict[str, Any]]]:
    """Check whether a secret exists in a namespace.

    Args:
        secret_name: Name of secret to check.
        namespace: Namespace to check.
        context: Kubernetes context to use.

    Returns:
        (True, API response payload) when the secret exists;
        (False, error string) when it does not. Any API error other than
        404 propagates to the caller.
    """
    try:
        payload = kube_client.core_api(context).read_namespaced_secret(
            secret_name, namespace, _request_timeout=kube_client.API_TIMEOUT
        )
        return True, payload
    except kube_client.api_exception() as err:
        # Only 404 means "not found"; anything else is a real failure.
        if err.status == 404:
            return False, str(err)
        raise
609
+
610
+
611
def set_secret(
    secret_name: str,
    namespace: str,
    context: Optional[str],
    data: Dict[str, str],
    secret_metadata: Optional[Dict[str, Any]] = None,
) -> Tuple[bool, Optional[str]]:
    """Create or update an Opaque secret in a namespace.

    Values in `data` must already be base64-encoded, i.e.
    `base64.b64encode(raw).decode()`.

    Args:
        secret_name: Base name of the secret; recorded in the
            SECRET_BASENAME_LABEL label. The actual object name can be
            overridden via secret_metadata['name'].
        namespace: Namespace to create/patch the secret in.
        context: Kubernetes context to use.
        data: Secret payload (values base64-encoded).
        secret_metadata: Optional extra metadata; 'name' overrides the
            object name, 'labels' are merged over the defaults.

    Returns:
        (True, None) on success, (False, error string) on API failure.
    """
    # Serialize secret writes from this machine (file lock shared across
    # processes).
    with _K8s_CLIENT_LOCK:
        user_hash = common_utils.get_user_hash()

        # secret_metadata['name'] (if given) overrides the object name;
        # the original basename is still recorded as a label below.
        full_name = (
            secret_metadata.get('name')
            if secret_metadata and 'name' in secret_metadata
            else secret_name
        )
        assert isinstance(full_name, str), 'Secret name must be a string'

        metadata: Dict[str, Any] = {
            'name': full_name,
            'labels': {
                'parent': 'konduktor',
                backend_constants.SECRET_OWNER_LABEL: user_hash,
                backend_constants.SECRET_BASENAME_LABEL: secret_name,
            },
        }

        if secret_metadata:
            # Caller-supplied labels win over the defaults above.
            metadata['labels'].update(secret_metadata.get('labels', {}))

        # NOTE(review): argument order differs from combine_metadata_fields,
        # which calls merge_k8s_configs(custom_metadata, destination) —
        # confirm the intended merge direction here.
        custom_metadata = config.get_nested(('kubernetes', 'custom_metadata'), {})
        config.merge_k8s_configs(metadata, custom_metadata)

        secret = kubernetes.client.V1Secret(
            metadata=kubernetes.client.V1ObjectMeta(**metadata),
            type='Opaque',
            data=data,
        )

        # Decide between create and patch based on current cluster state.
        secret_exists, _ = check_secret_exists(
            secret_name=full_name,
            namespace=namespace,
            context=context,
        )

        try:
            if secret_exists:
                kube_client.core_api(context).patch_namespaced_secret(
                    full_name, namespace, secret
                )
            else:
                kube_client.core_api(context).create_namespaced_secret(
                    namespace, secret
                )
        except kube_client.api_exception() as e:
            return False, str(e)
        else:
            logger.debug(
                f'Secret {full_name} in namespace {namespace} '
                f'in context {context} created/updated'
            )
            return True, None
679
+
680
+
681
def list_secrets(
    namespace: str,
    context: Optional[str],
    label_filter: Optional[str] = None,
) -> List[kubernetes.client.V1Secret]:
    """List all secrets in a namespace.

    When `label_filter` is given as 'key=value', only secrets carrying
    exactly that label are returned; filtering happens client-side.
    """
    all_secrets = kube_client.core_api(context).list_namespaced_secret(namespace).items
    if not label_filter:
        return all_secrets
    key, expected = label_filter.split('=', 1)
    return [
        secret
        for secret in all_secrets
        if secret.metadata.labels and secret.metadata.labels.get(key) == expected
    ]
696
+
697
+
698
def delete_secret(
    name: str,
    namespace: str,
    context: Optional[str],
) -> Tuple[bool, Optional[str]]:
    """Delete a secret by name in the given namespace/context.

    Returns:
        (True, None) on success, (False, error string) on API failure.
    """
    try:
        kube_client.core_api(context).delete_namespaced_secret(name, namespace)
    except kube_client.api_exception() as err:
        return False, str(err)
    logger.debug(f'Secret {name} deleted from namespace {namespace}')
    return True, None
710
+
711
+
712
def get_secret_kind(secret: kubernetes.client.V1Secret) -> Optional[str]:
    """Return the konduktor-specific 'kind' label of a secret, or None
    when the secret has no labels or no kind label."""
    labels = secret.metadata.labels
    return labels.get(backend_constants.SECRET_KIND_LABEL) if labels else None
717
+
718
+
719
def get_autoscaler_type() -> Optional[kubernetes_enums.KubernetesAutoscalerType]:
    """Read `kubernetes.autoscaler` from the user config.

    Returns the corresponding enum member, or None when unset.
    """
    raw_value = config.get_nested(('kubernetes', 'autoscaler'), None)
    if raw_value is None:
        return None
    return kubernetes_enums.KubernetesAutoscalerType(raw_value)
725
+
726
+
727
+ # TODO(asaiacai): some checks here for CRDs for jobset and Kueue CRDs, queues, etc.
728
def is_label_valid(label_key: str, label_value: str) -> Tuple[bool, Optional[str]]:
    """Validate a Kubernetes label key/value pair.

    Keys may take the '<domain>/<name>' form (domain up to 253 chars,
    name up to 63); values are up to 63 chars of alphanumerics plus
    '-', '_' and '.'.

    Returns:
        (True, None) when both parts are valid, otherwise
        (False, error message).
    """
    key_regex = re.compile(
        # Look-ahead to ensure proper domain formatting up to a slash.
        r'^(?:(?=[a-z0-9]([-a-z0-9.]*[a-z0-9])?\/)'
        # Domain: starts/ends with alphanum, up to 253 chars incl. slash.
        r'[a-z0-9]([-a-z0-9.]{0,251}[a-z0-9])?\/)?'
        # Name: starts/ends with alphanum, up to 63 chars.
        r'[a-z0-9]([-a-z0-9_.]{0,61}[a-z0-9])?$'
    )
    value_regex = re.compile(r'^([a-zA-Z0-9]([-a-zA-Z0-9_.]{0,61}[a-zA-Z0-9])?)?$')
    condition_msg = (
        'Value must consist of alphanumeric characters or '
        "'-', '_', '.', and must be no more than 63 "
        'characters in length.'
    )
    error_msg = None
    if not key_regex.match(label_key):
        error_msg = f'Invalid label key {label_key} for Kubernetes. ' f'{condition_msg}'
    # A bad value overrides any key error (both are reported as one msg).
    if not value_regex.match(label_value):
        error_msg = (
            f'Invalid label value {label_value} for Kubernetes. ' f'{condition_msg}'
        )
    if error_msg is not None:
        return False, error_msg
    return True, None
757
+
758
+
759
def is_k8s_resource_name_valid(name):
    """Return whether a k8s name is valid.

    A valid name consists of lowercase alphanumeric characters or '-',
    and must start and end with an alphanumeric character.

    Bug fix: previously returned the raw `re.Match`/None instead of the
    documented boolean; bool() preserves truthiness for all callers.
    Note the regex does not enforce the 253-char DNS length cap —
    presumably callers enforce length separately; confirm if relied upon.
    """
    return bool(re.match(DNS_SUBDOMAIN_REGEX, name))