konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. konduktor/__init__.py +16 -6
  2. konduktor/adaptors/__init__.py +0 -0
  3. konduktor/adaptors/common.py +88 -0
  4. konduktor/adaptors/gcp.py +112 -0
  5. konduktor/backends/__init__.py +8 -0
  6. konduktor/backends/backend.py +86 -0
  7. konduktor/backends/jobset.py +218 -0
  8. konduktor/backends/jobset_utils.py +447 -0
  9. konduktor/check.py +192 -0
  10. konduktor/cli.py +790 -0
  11. konduktor/cloud_stores.py +158 -0
  12. konduktor/config.py +420 -0
  13. konduktor/constants.py +36 -0
  14. konduktor/controller/constants.py +6 -6
  15. konduktor/controller/launch.py +3 -3
  16. konduktor/controller/node.py +5 -5
  17. konduktor/controller/parse.py +23 -23
  18. konduktor/dashboard/backend/main.py +57 -57
  19. konduktor/dashboard/backend/sockets.py +19 -19
  20. konduktor/data/__init__.py +9 -0
  21. konduktor/data/constants.py +12 -0
  22. konduktor/data/data_utils.py +223 -0
  23. konduktor/data/gcp/__init__.py +19 -0
  24. konduktor/data/gcp/constants.py +42 -0
  25. konduktor/data/gcp/gcs.py +906 -0
  26. konduktor/data/gcp/utils.py +9 -0
  27. konduktor/data/storage.py +799 -0
  28. konduktor/data/storage_utils.py +500 -0
  29. konduktor/execution.py +444 -0
  30. konduktor/kube_client.py +153 -48
  31. konduktor/logging.py +49 -5
  32. konduktor/manifests/dmesg_daemonset.yaml +8 -0
  33. konduktor/manifests/pod_cleanup_controller.yaml +129 -0
  34. konduktor/resource.py +478 -0
  35. konduktor/task.py +867 -0
  36. konduktor/templates/jobset.yaml.j2 +31 -0
  37. konduktor/templates/pod.yaml.j2 +185 -0
  38. konduktor/usage/__init__.py +0 -0
  39. konduktor/usage/constants.py +21 -0
  40. konduktor/utils/__init__.py +0 -0
  41. konduktor/utils/accelerator_registry.py +21 -0
  42. konduktor/utils/annotations.py +62 -0
  43. konduktor/utils/base64_utils.py +93 -0
  44. konduktor/utils/common_utils.py +393 -0
  45. konduktor/utils/constants.py +5 -0
  46. konduktor/utils/env_options.py +55 -0
  47. konduktor/utils/exceptions.py +226 -0
  48. konduktor/utils/kubernetes_enums.py +8 -0
  49. konduktor/utils/kubernetes_utils.py +652 -0
  50. konduktor/utils/log_utils.py +251 -0
  51. konduktor/utils/loki_utils.py +85 -0
  52. konduktor/utils/rich_utils.py +123 -0
  53. konduktor/utils/schemas.py +581 -0
  54. konduktor/utils/subprocess_utils.py +273 -0
  55. konduktor/utils/ux_utils.py +216 -0
  56. konduktor/utils/validator.py +20 -0
  57. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
  58. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
  59. konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
  60. konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
  61. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
  62. {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,652 @@
1
+ # Proprietary Changes made for Trainy under the Trainy Software License
2
+ # Original source: skypilot: https://github.com/skypilot-org/skypilot
3
+ # which is Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+
13
+ """Kubernetes utilities."""
14
+
15
+ import functools
16
+ import math
17
+ import os
18
+ import re
19
+ import typing
20
+ from typing import Any, Dict, List, Optional, Tuple, Union
21
+
22
+ import kubernetes
23
+ import yaml
24
+
25
+ from konduktor import config, kube_client, logging
26
+ from konduktor.utils import common_utils, kubernetes_enums
27
+
28
+ if typing.TYPE_CHECKING:
29
+ pass
30
+
31
+ DEFAULT_NAMESPACE = 'default'
32
+
33
+ DEFAULT_SERVICE_ACCOUNT_NAME = 'konduktor-service-account'
34
+
35
+ MEMORY_SIZE_UNITS = {
36
+ 'B': 1,
37
+ 'K': 2**10,
38
+ 'M': 2**20,
39
+ 'G': 2**30,
40
+ 'T': 2**40,
41
+ 'P': 2**50,
42
+ }
43
+
44
+ # The resource keys used by Kubernetes to track NVIDIA GPUs on
45
+ # nodes. These keys are typically used in the node's status.allocatable
46
+ # or status.capacity fields to indicate the available resources on the node.
47
+ GPU_RESOURCE_KEY = 'nvidia.com/gpu'
48
+
49
+ NO_ACCELERATOR_HELP_MESSAGE = (
50
+ 'If your cluster contains GPUs, make sure '
51
+ f'{GPU_RESOURCE_KEY} resource is available '
52
+ 'on the nodes and the node labels for identifying GPUs '
53
+ '(e.g., `nvidia.com/gpu`) are set up correctly. '
54
+ )
55
+
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+
60
+ class GPULabelFormatter:
61
+ """Base class to define a GPU label formatter for a Kubernetes cluster
62
+
63
+ A GPU label formatter is a class that defines how to use GPU type labels in
64
+ a Kubernetes cluster. It is used by the Kubernetes cloud class to pick the
65
+ key:value pair to use as node selector for GPU nodes.
66
+ """
67
+
68
+ @classmethod
69
+ def get_label_key(cls, accelerator: Optional[str] = None) -> str:
70
+ """Returns the label key for GPU type used by the Kubernetes cluster"""
71
+ raise NotImplementedError
72
+
73
+ @classmethod
74
+ def get_label_keys(cls) -> List[str]:
75
+ """Returns a list of label keys for GPU used by Kubernetes cluster."""
76
+ raise NotImplementedError
77
+
78
+ @classmethod
79
+ def get_label_value(cls, accelerator: str) -> str:
80
+ """Given a GPU type, returns the label value to be used"""
81
+ raise NotImplementedError
82
+
83
+ @classmethod
84
+ def match_label_key(cls, label_key: str) -> bool:
85
+ """Checks if the given label key matches the formatter's label keys"""
86
+ raise NotImplementedError
87
+
88
+ @classmethod
89
+ def get_accelerator_from_label_value(cls, value: str) -> str:
90
+ """Given a label value, returns the GPU type"""
91
+ raise NotImplementedError
92
+
93
+ @classmethod
94
+ def validate_label_value(cls, value: str) -> Tuple[bool, str]:
95
+ """Validates if the specified label value is correct.
96
+
97
+ Used to check if the labelling on the cluster is correct and
98
+ preemptively raise an error if it is not.
99
+
100
+ Returns:
101
+ bool: True if the label value is valid, False otherwise.
102
+ str: Error message if the label value is invalid, None otherwise.
103
+ """
104
+ del value
105
+ return True, ''
106
+
107
+
108
+ def get_gke_accelerator_name(accelerator: str) -> str:
109
+ """Returns the accelerator name for GKE clusters.
110
+
111
+ Uses the format - nvidia-tesla-<accelerator>.
112
+ A100-80GB, L4, H100-80GB and H100-MEGA-80GB are exceptions; they use
113
+ nvidia-<accelerator> instead.
114
+ """
115
+ if accelerator == 'H100':
116
+ # H100 is named as H100-80GB in GKE.
117
+ accelerator = 'H100-80GB'
118
+ if accelerator in ('A100-80GB', 'L4', 'H100-80GB', 'H100-MEGA-80GB'):
119
+ # A100-80GB, L4, H100-80GB and H100-MEGA-80GB
120
+ # have a different name pattern.
121
+ return 'nvidia-{}'.format(accelerator.lower())
122
+ else:
123
+ return 'nvidia-tesla-{}'.format(accelerator.lower())
124
+
125
+
126
+ class GKELabelFormatter(GPULabelFormatter):
127
+ """GKE label formatter
128
+
129
+ GKE nodes by default are populated with `cloud.google.com/gke-accelerator`
130
+ label, which is used to identify the GPU type.
131
+ """
132
+
133
+ GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
134
+ ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'
135
+
136
+ @classmethod
137
+ def get_label_key(cls, accelerator: Optional[str] = None) -> str:
138
+ return cls.GPU_LABEL_KEY
139
+
140
+ @classmethod
141
+ def get_label_keys(cls) -> List[str]:
142
+ return [cls.GPU_LABEL_KEY]
143
+
144
+ @classmethod
145
+ def match_label_key(cls, label_key: str) -> bool:
146
+ return label_key in cls.get_label_keys()
147
+
148
+ @classmethod
149
+ def get_label_value(cls, accelerator: str) -> str:
150
+ return get_gke_accelerator_name(accelerator)
151
+
152
+ @classmethod
153
+ def get_accelerator_from_label_value(cls, value: str) -> str:
154
+ if value.startswith('nvidia-tesla-'):
155
+ return value.replace('nvidia-tesla-', '').upper()
156
+ elif value.startswith('nvidia-'):
157
+ acc = value.replace('nvidia-', '').upper()
158
+ if acc == 'H100-80GB':
159
+ # H100 can be either H100-80GB or H100-MEGA-80GB in GKE
160
+ # we map H100 ---> H100-80GB and keep H100-MEGA-80GB
161
+ # to distinguish between a3-high and a3-mega instances
162
+ return 'H100'
163
+ return acc
164
+ else:
165
+ raise ValueError(f'Invalid accelerator name in GKE cluster: {value}')
166
+
167
+
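The two helpers above are pure string mappings, so they can be exercised without a cluster; a quick sketch (accelerator names are illustrative, assuming the module imports as shown):

```python
from konduktor.utils.kubernetes_utils import (GKELabelFormatter,
                                              get_gke_accelerator_name)

# Konduktor accelerator name -> GKE label value.
assert get_gke_accelerator_name('T4') == 'nvidia-tesla-t4'
assert get_gke_accelerator_name('H100') == 'nvidia-h100-80gb'  # H100 -> H100-80GB

# GKE label value -> accelerator name.
assert GKELabelFormatter.get_accelerator_from_label_value('nvidia-tesla-t4') == 'T4'
assert GKELabelFormatter.get_accelerator_from_label_value('nvidia-h100-80gb') == 'H100'
```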
168
+ class GFDLabelFormatter(GPULabelFormatter):
169
+ """GPU Feature Discovery label formatter
170
+
171
+ NVIDIA GPUs nodes are labeled by GPU feature discovery
172
+ e.g. nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3
173
+ https://github.com/NVIDIA/gpu-feature-discovery
174
+
175
+ GPU feature discovery is included as part of the
176
+ NVIDIA GPU Operator:
177
+ https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html
178
+
179
+ This LabelFormatter can't be used in autoscaling clusters since accelerators
180
+ may map to multiple labels, so `get_label_value` is not implemented.
181
+ """
182
+
183
+ LABEL_KEY = 'nvidia.com/gpu.product'
184
+
185
+ @classmethod
186
+ def get_label_key(cls, accelerator: Optional[str] = None) -> str:
187
+ return cls.LABEL_KEY
188
+
189
+ @classmethod
190
+ def get_label_keys(cls) -> List[str]:
191
+ return [cls.LABEL_KEY]
192
+
193
+ @classmethod
194
+ def get_label_value(cls, accelerator: str) -> str:
195
+ """An accelerator can map to many Nvidia GFD labels
196
+ (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB).
197
+ As a result, we do not support get_label_value for GFDLabelFormatter."""
198
+ raise NotImplementedError
199
+
200
+ @classmethod
201
+ def match_label_key(cls, label_key: str) -> bool:
202
+ return label_key == cls.LABEL_KEY
203
+
204
+ @classmethod
205
+ def get_accelerator_from_label_value(cls, value: str) -> str:
206
+ """Searches against a canonical list of NVIDIA GPUs and pattern
207
+ matches the canonical GPU name against the GFD label.
208
+ """
209
+ canonical_gpu_names = [
210
+ 'A100-80GB',
211
+ 'A100',
212
+ 'A10G',
213
+ 'H100',
214
+ 'K80',
215
+ 'M60',
216
+ 'T4g',
217
+ 'T4',
218
+ 'V100',
219
+ 'A10',
220
+ 'P4000',
221
+ 'P100',
222
+ 'P40',
223
+ 'P4',
224
+ 'L40',
225
+ 'L4',
226
+ ]
227
+ for canonical_name in canonical_gpu_names:
228
+ # A100-80GB accelerator is A100-SXM-80GB or A100-PCIE-80GB
229
+ if canonical_name == 'A100-80GB' and re.search(r'A100.*-80GB', value):
230
+ return canonical_name
231
+ # Use word boundary matching to prevent substring matches
232
+ elif re.search(rf'\b{re.escape(canonical_name)}\b', value):
233
+ return canonical_name
234
+
235
+ # If we didn't find a canonical name:
236
+ # 1. remove 'NVIDIA-' (e.g., 'NVIDIA-RTX-A6000' -> 'RTX-A6000')
237
+ # 2. remove 'GEFORCE-' (e.g., 'NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070')
238
+ # 3. remove 'RTX-' (e.g. 'RTX-6000' -> 'RTX6000')
239
+ return (
240
+ value.upper()
241
+ .replace('NVIDIA-', '')
242
+ .replace('GEFORCE-', '')
243
+ .replace('RTX-', 'RTX')
244
+ )
245
+
246
+
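A hedged sketch of how GFD product strings map back to canonical names (the product strings are examples of what gpu-feature-discovery may publish):

```python
from konduktor.utils.kubernetes_utils import GFDLabelFormatter

assert GFDLabelFormatter.get_accelerator_from_label_value(
    'NVIDIA-H100-80GB-HBM3') == 'H100'
assert GFDLabelFormatter.get_accelerator_from_label_value(
    'NVIDIA-A100-SXM4-80GB') == 'A100-80GB'
# No canonical match: vendor prefixes are stripped instead.
assert GFDLabelFormatter.get_accelerator_from_label_value(
    'NVIDIA-GeForce-RTX-3070') == 'RTX3070'
```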
247
+ # LABEL_FORMATTER_REGISTRY stores the label formatters we will try to
248
+ # discover the accelerator type from. The order of the list is important, as
249
+ # it will be used to determine the priority of the label formats when
250
+ # auto-detecting the GPU label type.
251
+ LABEL_FORMATTER_REGISTRY = [GKELabelFormatter, GFDLabelFormatter]
252
+
253
+ # Mapping of autoscaler type to label formatter
254
+ AUTOSCALER_TO_LABEL_FORMATTER = {
255
+ kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter,
256
+ }
257
+
258
+
259
+ @functools.lru_cache()
260
+ def get_current_kube_config_context_name() -> Optional[str]:
261
+ """Get the current kubernetes context from the kubeconfig file
262
+
263
+ Returns:
264
+ str | None: The current kubernetes context if it exists, None otherwise
265
+ """
266
+ k8s = kubernetes
267
+ try:
268
+ _, current_context = k8s.config.list_kube_config_contexts()
269
+ return current_context['name']
270
+ except k8s.config.config_exception.ConfigException:
271
+ return None
272
+
273
+
274
+ @functools.lru_cache(maxsize=10)
275
+ def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
276
+ """Gets the kubernetes nodes in the context.
277
+
278
+ If context is None, gets the nodes in the current context.
279
+ """
280
+ if context is None:
281
+ context = get_current_kube_config_context_name()
282
+
283
+ nodes = (
284
+ kube_client.core_api(context)
285
+ .list_node(_request_timeout=kube_client.API_TIMEOUT)
286
+ .items
287
+ )
288
+ return nodes
289
+
290
+
291
+ @functools.lru_cache()
292
+ def detect_gpu_label_formatter(
293
+ context: Optional[str],
294
+ ) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]:
295
+ """Detects the GPU label formatter for the Kubernetes cluster
296
+
297
+ Returns:
298
+ GPULabelFormatter: The GPU label formatter for the cluster, if found.
299
+ Dict[str, List[Tuple[str, str]]]: A mapping of nodes and the list of
300
+ labels on each node. E.g., {'node1': [('label1', 'value1')]}
301
+ """
302
+ # Get all labels across all nodes
303
+ node_labels: Dict[str, List[Tuple[str, str]]] = {}
304
+ nodes = get_kubernetes_nodes(context)
305
+ for node in nodes:
306
+ node_labels[node.metadata.name] = []
307
+ for label, value in node.metadata.labels.items():
308
+ node_labels[node.metadata.name].append((label, value))
309
+
310
+ label_formatter = None
311
+
312
+ # Check if the node labels contain any of the GPU label prefixes
313
+ for lf in LABEL_FORMATTER_REGISTRY:
314
+ for _, label_list in node_labels.items():
315
+ for label, _ in label_list:
316
+ if lf.match_label_key(label):
317
+ label_formatter = lf()
318
+ return label_formatter, node_labels
319
+
320
+ return label_formatter, node_labels
321
+
322
+
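As a rough sketch (assuming a reachable cluster whose nodes carry GKE-style labels, since `GFDLabelFormatter` does not implement `get_label_value`), the detected formatter can be turned into a node selector:

```python
from konduktor.utils import kubernetes_utils

formatter, node_labels = kubernetes_utils.detect_gpu_label_formatter(None)
if formatter is None:
    raise RuntimeError(kubernetes_utils.NO_ACCELERATOR_HELP_MESSAGE)

# e.g. {'cloud.google.com/gke-accelerator': 'nvidia-h100-80gb'} on GKE.
node_selector = {
    formatter.get_label_key('H100'): formatter.get_label_value('H100'),
}
```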
323
+ @functools.lru_cache()
324
+ def get_kube_config_context_namespace(context_name: Optional[str] = None) -> str:
325
+ """Get the current kubernetes context namespace from the kubeconfig file
326
+
327
+ Returns:
328
+ str: The current kubernetes context namespace if it exists, else
329
+ the default namespace.
330
+ """
331
+ k8s = kubernetes
332
+ ns_path = '/var/run/secrets/kubernetes.io/serviceaccount/namespace'
333
+ # If using in-cluster context, get the namespace from the service account
334
+ # namespace file. Uses the same logic as adaptors.kubernetes._load_config()
335
+ # to stay consistent with in-cluster config loading.
336
+ if context_name == kube_client.in_cluster_context_name() or context_name is None:
337
+ if os.path.exists(ns_path):
338
+ with open(ns_path, encoding='utf-8') as f:
339
+ return f.read().strip()
340
+ # If not in-cluster, get the namespace from kubeconfig
341
+ try:
342
+ contexts, current_context = k8s.config.list_kube_config_contexts()
343
+ if context_name is None:
344
+ context = current_context
345
+ else:
346
+ context = next((c for c in contexts if c['name'] == context_name), None)
347
+ if context is None:
348
+ return DEFAULT_NAMESPACE
349
+
350
+ if 'namespace' in context['context']:
351
+ return context['context']['namespace']
352
+ else:
353
+ return DEFAULT_NAMESPACE
354
+ except k8s.config.config_exception.ConfigException:
355
+ return DEFAULT_NAMESPACE
356
+
357
+
358
+ def check_credentials(
359
+ context: Optional[str], timeout: int = kube_client.API_TIMEOUT
360
+ ) -> Tuple[bool, Optional[str]]:
361
+ """Check if the credentials in kubeconfig file are valid
362
+
363
+ Args:
364
+ context (Optional[str]): The Kubernetes context to use. If none, uses
365
+ in-cluster auth to check credentials, if available.
366
+ timeout (int): Timeout in seconds for the test API call
367
+
368
+ Returns:
369
+ bool: True if credentials are valid, False otherwise
370
+ str: Error message if credentials are invalid, None otherwise
371
+ """
372
+ try:
373
+ namespace = get_kube_config_context_namespace(context)
374
+ kube_client.core_api(context).list_namespaced_pod(
375
+ namespace, _request_timeout=timeout
376
+ )
377
+ return True, None
378
+ except ImportError:
379
+ return False, (
380
+ '`kubernetes` package is not installed. '
381
+ 'Install it with: pip install kubernetes'
382
+ )
383
+ except kube_client.api_exception() as e:
384
+ # Check if the error is due to invalid credentials
385
+ if e.status == 401:
386
+ return (
387
+ False,
388
+ 'Invalid credentials - do you have permission '
389
+ 'to access the cluster?',
390
+ )
391
+ else:
392
+ return False, f'Failed to communicate with the cluster: {str(e)}'
393
+ except kube_client.config_exception() as e:
394
+ return False, f'Invalid configuration file: {str(e)}'
395
+ except kube_client.max_retry_error():
396
+ return False, (
397
+ 'Failed to communicate with the cluster - timeout. '
398
+ 'Check if your cluster is running and your network '
399
+ 'is stable.'
400
+ )
401
+ except ValueError as e:
402
+ return False, common_utils.format_exception(e)
403
+ except Exception as e: # pylint: disable=broad-except
404
+ return False, (
405
+ 'An error occurred: '
406
+ f'{common_utils.format_exception(e, use_bracket=True)}'
407
+ )
408
+
409
+
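A minimal caller-side sketch: resolve the current context and fail fast if the credential check does not pass (error handling is illustrative):

```python
from konduktor.utils import kubernetes_utils

context = kubernetes_utils.get_current_kube_config_context_name()
valid, reason = kubernetes_utils.check_credentials(context)
if not valid:
    raise RuntimeError(f'Kubernetes credential check failed: {reason}')
```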
410
+ def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
411
+ resource_str = str(resource_qty_str)
412
+ if resource_str[-1] == 'm':
413
+ # For example, '500m' rounds up to 1.
414
+ return math.ceil(int(resource_str[:-1]) / 1000)
415
+ else:
416
+ return float(resource_str)
417
+
418
+
419
+ def parse_memory_resource(resource_qty_str: str, unit: str = 'B') -> Union[int, float]:
420
+ """Returns memory size in chosen units given a resource quantity string."""
421
+ if unit not in MEMORY_SIZE_UNITS:
422
+ valid_units = ', '.join(MEMORY_SIZE_UNITS.keys())
423
+ raise ValueError(f'Invalid unit: {unit}. Valid units are: {valid_units}')
424
+
425
+ resource_str = str(resource_qty_str)
426
+ bytes_value: Union[int, float]
427
+ try:
428
+ bytes_value = int(resource_str)
429
+ except ValueError:
430
+ memory_size = re.sub(r'([KMGTPB]+)', r' \1', resource_str)
431
+ number, unit_index = [item.strip() for item in memory_size.split()]
432
+ unit_index = unit_index[0]
433
+ bytes_value = float(number) * MEMORY_SIZE_UNITS[unit_index]
434
+ return bytes_value / MEMORY_SIZE_UNITS[unit]
435
+
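Hedged examples of the quantity parsing above; the inputs are typical Kubernetes resource strings chosen for illustration:

```python
from konduktor.utils.kubernetes_utils import (parse_cpu_or_gpu_resource,
                                              parse_memory_resource)

assert parse_cpu_or_gpu_resource('500m') == 1          # milli-CPUs round up
assert parse_cpu_or_gpu_resource('2') == 2.0
assert parse_memory_resource('256Mi') == 256 * 2**20   # bytes by default
assert parse_memory_resource('16G', unit='G') == 16.0  # report in GiB
```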
436
+
437
+ def combine_pod_config_fields(
438
+ cluster_yaml_path: str,
439
+ cluster_config_overrides: Dict[str, Any],
440
+ ) -> None:
441
+ """Adds or updates fields in the YAML with fields from the ~/.konduktor/config's
442
+ kubernetes.pod_config dict.
443
+ This can be used to add fields to the YAML that are not supported
444
+ directly yet, or that require simple configuration (e.g., adding an
445
+ imagePullSecrets field).
446
+ Note that new fields are added and existing ones are updated. Nested fields
447
+ are not completely replaced, instead their objects are merged. Similarly,
448
+ if a list is encountered in the config, it will be appended to the
449
+ destination list.
450
+ For example, if the YAML has the following:
451
+ ```
452
+ ...
453
+ node_config:
454
+ spec:
455
+ containers:
456
+ - name: ray
457
+ image: rayproject/ray:nightly
458
+ ```
459
+ and the config has the following:
460
+ ```
461
+ kubernetes:
462
+ pod_config:
463
+ spec:
464
+ imagePullSecrets:
465
+ - name: my-secret
466
+ ```
467
+ then the resulting YAML will be:
468
+ ```
469
+ ...
470
+ node_config:
471
+ spec:
472
+ containers:
473
+ - name: ray
474
+ image: rayproject/ray:nightly
475
+ imagePullSecrets:
476
+ - name: my-secret
477
+ ```
478
+ """
479
+ with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
480
+ yaml_content = f.read()
481
+ yaml_obj = yaml.safe_load(yaml_content)
482
+ # We don't use override_configs in `konduktor_config.get_nested`, as merging
483
+ # the pod config requires special handling.
484
+ kubernetes_config = config.get_nested(
485
+ ('kubernetes', 'pod_config'), default_value={}, override_configs={}
486
+ )
487
+ override_pod_config = cluster_config_overrides.get('kubernetes', {}).get(
488
+ 'pod_config', {}
489
+ )
490
+ config.merge_k8s_configs(override_pod_config, kubernetes_config)
491
+
492
+ # Write the updated YAML back to the file
493
+ common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
494
+
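A hypothetical call site, assuming a generated pod YAML at the path shown; the override dict mirrors the imagePullSecrets example from the docstring:

```python
from konduktor.utils import kubernetes_utils

kubernetes_utils.combine_pod_config_fields(
    '/tmp/konduktor-pod.yaml',  # illustrative path to a generated pod YAML
    cluster_config_overrides={
        'kubernetes': {
            'pod_config': {
                'spec': {'imagePullSecrets': [{'name': 'my-secret'}]},
            }
        }
    },
)
```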
495
+
496
+ def combine_metadata_fields(cluster_yaml_path: str) -> None:
497
+ """Updates the metadata for all Kubernetes objects created with
498
+ fields from the ~/.konduktor/config's kubernetes.custom_metadata dict.
499
+
500
+ Obeys the same add or update semantics as combine_pod_config_fields().
501
+ """
502
+
503
+ with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
504
+ yaml_content = f.read()
505
+ yaml_obj = yaml.safe_load(yaml_content)
506
+ custom_metadata = config.get_nested(('kubernetes', 'custom_metadata'), {})
507
+
508
+ # List of objects in the cluster YAML to be updated
509
+ combination_destinations = [
510
+ # Service accounts
511
+ yaml_obj['provider']['autoscaler_service_account']['metadata'],
512
+ yaml_obj['provider']['autoscaler_role']['metadata'],
513
+ yaml_obj['provider']['autoscaler_role_binding']['metadata'],
514
+ yaml_obj['provider']['autoscaler_service_account']['metadata'],
515
+ # Pod spec
516
+ yaml_obj['available_node_types']['ray_head_default']['node_config']['metadata'],
517
+ # Services for pods
518
+ *[svc['metadata'] for svc in yaml_obj['provider']['services']],
519
+ ]
520
+
521
+ for destination in combination_destinations:
522
+ config.merge_k8s_configs(custom_metadata, destination)
523
+
524
+ # Write the updated YAML back to the file
525
+ common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
526
+
527
+
528
+ def merge_custom_metadata(original_metadata: Dict[str, Any]) -> None:
529
+ """Merges original metadata with custom_metadata from config
530
+
531
+ Merge is done in-place, so return is not required
532
+ """
533
+ custom_metadata = config.get_nested(('kubernetes', 'custom_metadata'), {})
534
+ config.merge_k8s_configs(custom_metadata, original_metadata)
535
+
536
+
537
+ def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
538
+ """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
539
+ # Fetch the list of available RuntimeClasses
540
+ runtime_classes = kube_client.node_api(context).list_runtime_class()
541
+
542
+ # Check if 'nvidia' RuntimeClass exists
543
+ nvidia_exists = any(rc.metadata.name == 'nvidia' for rc in runtime_classes.items)
544
+ return nvidia_exists
545
+
546
+
547
+ def check_secret_exists(
548
+ secret_name: str, namespace: str, context: Optional[str]
549
+ ) -> Tuple[bool, Union[str, Dict[str, Any]]]:
550
+ """Checks if a secret exists in a namespace
551
+
552
+ Args:
553
+ secret_name: Name of secret to check
554
+ namespace: Namespace to check
555
+ Returns:
556
+ bool: True if the secret exists, False otherwise
557
+ str: response payload if True, error string otherwise
558
+ """
559
+
560
+ try:
561
+ response = kube_client.core_api(context).read_namespaced_secret(
562
+ secret_name, namespace, _request_timeout=kube_client.API_TIMEOUT
563
+ )
564
+ except kube_client.api_exception() as e:
565
+ if e.status == 404:
566
+ return False, str(e)
567
+ raise
568
+ else:
569
+ return True, response
570
+
571
+
572
+ def set_secret(
573
+ secret_name: str,
574
+ namespace: str,
575
+ context: Optional[str],
576
+ secret_key: str,
577
+ secret_value: str,
578
+ ) -> Tuple[bool, Optional[str]]:
579
+ """
580
+ Create/update a secret in a namespace. Values must already be base64-encoded.
581
+ """
582
+ secret_exists, response = check_secret_exists(
583
+ secret_name=secret_name,
584
+ namespace=namespace,
585
+ context=context,
586
+ )
587
+
588
+ secret_metadata = {'name': secret_name, 'labels': {'parent': 'konduktor'}}
589
+ custom_metadata = config.get_nested(('kubernetes', 'custom_metadata'), {})
590
+ config.merge_k8s_configs(secret_metadata, custom_metadata)
591
+
592
+ secret = kubernetes.client.V1Secret(
593
+ metadata=kubernetes.client.V1ObjectMeta(**secret_metadata),
594
+ type='Opaque',
595
+ data={secret_key: secret_value},
596
+ )
597
+
598
+ try:
599
+ if secret_exists:
600
+ kube_client.core_api(context).patch_namespaced_secret(
601
+ secret_name, namespace, secret
602
+ )
603
+ else:
604
+ kube_client.core_api(context).create_namespaced_secret(namespace, secret)
605
+ except kube_client.api_exception() as e:
606
+ return False, str(e)
607
+ else:
608
+ logger.debug(
609
+ f'Secret {secret_name} in namespace {namespace} '
610
+ f'in context {context} created/updated'
611
+ )
612
+ return True, None
613
+
614
+
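A hedged usage sketch (secret name and payload are illustrative). Kubernetes expects `V1Secret.data` values to be base64-encoded, so the caller encodes here:

```python
import base64

from konduktor.utils import kubernetes_utils

ok, err = kubernetes_utils.set_secret(
    secret_name='konduktor-gcs-key',   # illustrative
    namespace='default',
    context=None,
    secret_key='credentials.json',
    secret_value=base64.b64encode(b'{"type": "service_account"}').decode(),
)
if not ok:
    raise RuntimeError(f'Failed to write secret: {err}')
```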
615
+ def get_autoscaler_type() -> Optional[kubernetes_enums.KubernetesAutoscalerType]:
616
+ """Returns the autoscaler type by reading from config"""
617
+ autoscaler_type = config.get_nested(('kubernetes', 'autoscaler'), None)
618
+ if autoscaler_type is not None:
619
+ autoscaler_type = kubernetes_enums.KubernetesAutoscalerType(autoscaler_type)
620
+ return autoscaler_type
621
+
622
+
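A short sketch of how the autoscaler setting can be tied back to the label formatter registry above (the lookup is shown defensively, since only GKE is mapped):

```python
from konduktor.utils import kubernetes_utils

autoscaler = kubernetes_utils.get_autoscaler_type()
formatter = (kubernetes_utils.AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler)
             if autoscaler is not None else None)
```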
623
+ # TODO(asaiacai): some checks here for CRDs for jobset and Kueue CRDs, queues, etc.
624
+ def is_label_valid(label_key: str, label_value: str) -> Tuple[bool, Optional[str]]:
625
+ # Kubernetes labels can be of the format <domain>/<key>: <value>
626
+ key_regex = re.compile(
627
+ # Look-ahead to ensure proper domain formatting up to a slash
628
+ r'^(?:(?=[a-z0-9]([-a-z0-9.]*[a-z0-9])?\/)'
629
+ # Match domain: starts and ends with alphanum up to 253 chars
630
+ # including a slash in the domain.
631
+ r'[a-z0-9]([-a-z0-9.]{0,251}[a-z0-9])?\/)?'
632
+ # Match key: starts and ends with alphanum, up to 63 chars.
633
+ r'[a-z0-9]([-a-z0-9_.]{0,61}[a-z0-9])?$'
634
+ )
635
+ value_regex = re.compile(r'^([a-zA-Z0-9]([-a-zA-Z0-9_.]{0,61}[a-zA-Z0-9])?)?$')
636
+ key_valid = bool(key_regex.match(label_key))
637
+ value_valid = bool(value_regex.match(label_value))
638
+ error_msg = None
639
+ condition_msg = (
640
+ 'Value must consist of alphanumeric characters or '
641
+ "'-', '_', '.', and must be no more than 63 "
642
+ 'characters in length.'
643
+ )
644
+ if not key_valid:
645
+ error_msg = f'Invalid label key {label_key} for Kubernetes. ' f'{condition_msg}'
646
+ if not value_valid:
647
+ error_msg = (
648
+ f'Invalid label value {label_value} for Kubernetes. ' f'{condition_msg}'
649
+ )
650
+ if not key_valid or not value_valid:
651
+ return False, error_msg
652
+ return True, None
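For example (keys and values are illustrative), the validator accepts a standard domain-prefixed key and rejects a value with illegal characters:

```python
from konduktor.utils.kubernetes_utils import is_label_valid

ok, err = is_label_valid('trainy.ai/team', 'research')
assert ok and err is None

ok, err = is_label_valid('team', 'not a valid value!')
assert not ok and 'Invalid label value' in err
```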