konduktor-nightly 0.1.0.dev20250209104336__py3-none-any.whl → 0.1.0.dev20250313070642__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +16 -6
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/common.py +88 -0
- konduktor/adaptors/gcp.py +112 -0
- konduktor/backends/__init__.py +8 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/jobset.py +218 -0
- konduktor/backends/jobset_utils.py +447 -0
- konduktor/check.py +192 -0
- konduktor/cli.py +790 -0
- konduktor/cloud_stores.py +158 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/constants.py +6 -6
- konduktor/controller/launch.py +3 -3
- konduktor/controller/node.py +5 -5
- konduktor/controller/parse.py +23 -23
- konduktor/dashboard/backend/main.py +57 -57
- konduktor/dashboard/backend/sockets.py +19 -19
- konduktor/data/__init__.py +9 -0
- konduktor/data/constants.py +12 -0
- konduktor/data/data_utils.py +223 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +906 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/storage.py +799 -0
- konduktor/data/storage_utils.py +500 -0
- konduktor/execution.py +444 -0
- konduktor/kube_client.py +153 -48
- konduktor/logging.py +49 -5
- konduktor/manifests/dmesg_daemonset.yaml +8 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +478 -0
- konduktor/task.py +867 -0
- konduktor/templates/jobset.yaml.j2 +31 -0
- konduktor/templates/pod.yaml.j2 +185 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +21 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +93 -0
- konduktor/utils/common_utils.py +393 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +226 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +652 -0
- konduktor/utils/log_utils.py +251 -0
- konduktor/utils/loki_utils.py +85 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +581 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +216 -0
- konduktor/utils/validator.py +20 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/LICENSE +0 -1
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/METADATA +13 -2
- konduktor_nightly-0.1.0.dev20250313070642.dist-info/RECORD +94 -0
- konduktor_nightly-0.1.0.dev20250209104336.dist-info/RECORD +0 -48
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/WHEEL +0 -0
- {konduktor_nightly-0.1.0.dev20250209104336.dist-info → konduktor_nightly-0.1.0.dev20250313070642.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,652 @@
|
|
1
|
+
# Proprietary Changes made for Trainy under the Trainy Software License
|
2
|
+
# Original source: skypilot: https://github.com/skypilot-org/skypilot
|
3
|
+
# which is Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
# Unless required by applicable law or agreed to in writing, software
|
8
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
9
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
10
|
+
# See the License for the specific language governing permissions and
|
11
|
+
# limitations under the License.
|
12
|
+
|
13
|
+
"""Kubernetes utilities."""
|
14
|
+
|
15
|
+
import functools
|
16
|
+
import math
|
17
|
+
import os
|
18
|
+
import re
|
19
|
+
import typing
|
20
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
21
|
+
|
22
|
+
import kubernetes
|
23
|
+
import yaml
|
24
|
+
|
25
|
+
from konduktor import config, kube_client, logging
|
26
|
+
from konduktor.utils import common_utils, kubernetes_enums
|
27
|
+
|
28
|
+
if typing.TYPE_CHECKING:
|
29
|
+
pass
|
30
|
+
|
31
|
+
# Namespace used when neither the kubeconfig context nor the in-cluster
# service account specifies one.
DEFAULT_NAMESPACE = 'default'

DEFAULT_SERVICE_ACCOUNT_NAME = 'konduktor-service-account'

# Multipliers for converting Kubernetes memory-quantity suffixes to bytes.
MEMORY_SIZE_UNITS = {
    'B': 1,
    'K': 2**10,
    'M': 2**20,
    'G': 2**30,
    'T': 2**40,
    'P': 2**50,
}

# The resource key used by Kubernetes to track NVIDIA GPUs on nodes. This key
# is typically used in the node's status.allocatable or status.capacity fields
# to indicate the available resources on the node.
GPU_RESOURCE_KEY = 'nvidia.com/gpu'

# Fix: the original message had an unclosed parenthesis and a grammar slip
# ('are setup correctly') in this user-facing help text.
NO_ACCELERATOR_HELP_MESSAGE = (
    'If your cluster contains GPUs, make sure '
    f'{GPU_RESOURCE_KEY} resource is available '
    'on the nodes and the node labels for identifying GPUs '
    '(e.g. `nvidia.com/gpu`) are set up correctly. '
)


logger = logging.get_logger(__name__)
60
|
+
class GPULabelFormatter:
    """Base class defining how a Kubernetes cluster labels its GPU nodes.

    A concrete formatter tells the Kubernetes backend which key:value pair
    to use as a node selector when scheduling onto GPU nodes.
    """

    @classmethod
    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
        """Returns the label key for GPU type used by the Kubernetes cluster"""
        raise NotImplementedError

    @classmethod
    def get_label_keys(cls) -> List[str]:
        """Returns a list of label keys for GPU used by Kubernetes cluster."""
        raise NotImplementedError

    @classmethod
    def get_label_value(cls, accelerator: str) -> str:
        """Given a GPU type, returns the label value to be used"""
        raise NotImplementedError

    @classmethod
    def match_label_key(cls, label_key: str) -> bool:
        """Checks if the given label key matches the formatter's label keys"""
        raise NotImplementedError

    @classmethod
    def get_accelerator_from_label_value(cls, value: str) -> str:
        """Given a label value, returns the GPU type"""
        raise NotImplementedError

    @classmethod
    def validate_label_value(cls, value: str) -> Tuple[bool, str]:
        """Validates whether a cluster's GPU label value is well-formed.

        Used to check the labelling on the cluster and pre-emptively raise
        an error if it is wrong. The base implementation accepts everything;
        subclasses may override to be stricter.

        Returns:
            bool: True if the label value is valid, False otherwise.
            str: Error message if the label value is invalid, None otherwise.
        """
        del value  # unused by the permissive default implementation
        return True, ''
108
|
+
def get_gke_accelerator_name(accelerator: str) -> str:
    """Map a canonical accelerator name to GKE's label value.

    GKE labels most GPUs as ``nvidia-tesla-<accelerator>``; A100-80GB, L4,
    H100-80GB and H100-MEGA-80GB instead use ``nvidia-<accelerator>``.
    """
    if accelerator == 'H100':
        # GKE names the plain H100 as H100-80GB.
        accelerator = 'H100-80GB'
    # A100-80GB, L4, H100-80GB and H100-MEGA-80GB follow the shorter
    # 'nvidia-' naming pattern; everything else gets 'nvidia-tesla-'.
    prefix = (
        'nvidia-'
        if accelerator in ('A100-80GB', 'L4', 'H100-80GB', 'H100-MEGA-80GB')
        else 'nvidia-tesla-'
    )
    return prefix + accelerator.lower()
126
|
+
class GKELabelFormatter(GPULabelFormatter):
    """GKE label formatter.

    GKE nodes come pre-populated with the `cloud.google.com/gke-accelerator`
    label, which identifies the GPU type on the node.
    """

    GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
    ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'

    @classmethod
    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
        return cls.GPU_LABEL_KEY

    @classmethod
    def get_label_keys(cls) -> List[str]:
        return [cls.GPU_LABEL_KEY]

    @classmethod
    def match_label_key(cls, label_key: str) -> bool:
        return label_key in cls.get_label_keys()

    @classmethod
    def get_label_value(cls, accelerator: str) -> str:
        return get_gke_accelerator_name(accelerator)

    @classmethod
    def get_accelerator_from_label_value(cls, value: str) -> str:
        """Invert a GKE label value back to the canonical accelerator name."""
        if value.startswith('nvidia-tesla-'):
            return value.replace('nvidia-tesla-', '').upper()
        if value.startswith('nvidia-'):
            acc = value.replace('nvidia-', '').upper()
            # H100 can be either H100-80GB or H100-MEGA-80GB in GKE; map
            # H100-80GB back to H100 and keep H100-MEGA-80GB so a3-high and
            # a3-mega instances stay distinguishable.
            return 'H100' if acc == 'H100-80GB' else acc
        raise ValueError(f'Invalid accelerator name in GKE cluster: {value}')
168
|
+
class GFDLabelFormatter(GPULabelFormatter):
    """GPU Feature Discovery label formatter.

    NVIDIA GPU nodes are labeled by GPU Feature Discovery,
    e.g. nvidia.com/gpu.product=NVIDIA-H100-80GB-HBM3
    https://github.com/NVIDIA/gpu-feature-discovery

    GPU Feature Discovery ships as part of the NVIDIA GPU Operator:
    https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html

    This LabelFormatter can't be used in autoscaling clusters since an
    accelerator may map to multiple labels, so `get_label_value` is not
    implemented.
    """

    LABEL_KEY = 'nvidia.com/gpu.product'

    @classmethod
    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
        return cls.LABEL_KEY

    @classmethod
    def get_label_keys(cls) -> List[str]:
        return [cls.LABEL_KEY]

    @classmethod
    def get_label_value(cls, accelerator: str) -> str:
        """An accelerator can map to many NVIDIA GFD labels
        (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB), so this is unsupported."""
        raise NotImplementedError

    @classmethod
    def match_label_key(cls, label_key: str) -> bool:
        return label_key == cls.LABEL_KEY

    @classmethod
    def get_accelerator_from_label_value(cls, value: str) -> str:
        """Pattern-match a GFD label value against canonical NVIDIA GPU names.

        Ordering matters: more specific names (e.g. A100-80GB) precede their
        prefixes (A100) so the first hit is the most specific one.
        """
        canonical_gpu_names = [
            'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4',
            'V100', 'A10', 'P4000', 'P100', 'P40', 'P4', 'L40', 'L4',
        ]
        for name in canonical_gpu_names:
            # The A100-80GB accelerator appears as A100-SXM-80GB or
            # A100-PCIE-80GB, so match it with a wildcard in the middle.
            if name == 'A100-80GB' and re.search(r'A100.*-80GB', value):
                return name
            # Word-boundary matching prevents substring matches
            # (e.g. 'T4' inside 'T4g').
            elif re.search(rf'\b{re.escape(name)}\b', value):
                return name

        # No canonical match — normalize the raw label instead:
        # 1. drop 'NVIDIA-'  ('NVIDIA-RTX-A6000' -> 'RTX-A6000')
        # 2. drop 'GEFORCE-' ('NVIDIA-GEFORCE-RTX-3070' -> 'RTX-3070')
        # 3. fuse 'RTX-'     ('RTX-6000' -> 'RTX6000')
        normalized = value.upper()
        for old, new in (('NVIDIA-', ''), ('GEFORCE-', ''), ('RTX-', 'RTX')):
            normalized = normalized.replace(old, new)
        return normalized
247
|
+
# LABEL_FORMATTER_REGISTRY stores the label formats that will try to
# discover the accelerator type from. The order of the list is important, as
# it will be used to determine the priority of the label formats when
# auto-detecting the GPU label type: GKE's native label is preferred over
# the generic GPU Feature Discovery label.
LABEL_FORMATTER_REGISTRY = [GKELabelFormatter, GFDLabelFormatter]

# Mapping of autoscaler type to label formatter. Only GKE is mapped here;
# other clusters fall back to auto-detection via LABEL_FORMATTER_REGISTRY.
AUTOSCALER_TO_LABEL_FORMATTER = {
    kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter,
}
|
258
|
+
|
259
|
+
@functools.lru_cache()
def get_current_kube_config_context_name() -> Optional[str]:
    """Get the current kubernetes context from the kubeconfig file.

    Returns:
        str | None: The current kubernetes context if it exists, None otherwise
    """
    try:
        _, active_context = kubernetes.config.list_kube_config_contexts()
    except kubernetes.config.config_exception.ConfigException:
        # No (valid) kubeconfig available.
        return None
    return active_context['name']
274
|
+
@functools.lru_cache(maxsize=10)
def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
    """Gets the kubernetes nodes in the context.

    If context is None, gets the nodes in the current context.

    Returns:
        List[Any]: the node objects reported by the cluster's list_node call.
    """
    if context is None:
        context = get_current_kube_config_context_name()

    # Fix: the timeout constant lives in konduktor.kube_client (it is already
    # used as `kube_client.API_TIMEOUT` in check_credentials below); the
    # `kubernetes` client package itself has no `API_TIMEOUT` attribute, so
    # the original `kubernetes.API_TIMEOUT` raised AttributeError at runtime.
    nodes = (
        kube_client.core_api(context)
        .list_node(_request_timeout=kube_client.API_TIMEOUT)
        .items
    )
    return nodes
291
|
+
@functools.lru_cache()
def detect_gpu_label_formatter(
    context: Optional[str],
) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]:
    """Detects the GPU label formatter for the Kubernetes cluster.

    Returns:
        GPULabelFormatter: The GPU label formatter for the cluster, if found.
        Dict[str, List[Tuple[str, str]]]: A mapping of nodes and the list of
            labels on each node. E.g., {'node1': [('label1', 'value1')]}
    """
    # Collect every (label, value) pair across all nodes.
    node_labels: Dict[str, List[Tuple[str, str]]] = {
        node.metadata.name: list(node.metadata.labels.items())
        for node in get_kubernetes_nodes(context)
    }

    # Probe the registered formatters in priority order; the first formatter
    # whose label key appears on any node wins.
    for formatter_cls in LABEL_FORMATTER_REGISTRY:
        for labels in node_labels.values():
            if any(formatter_cls.match_label_key(key) for key, _ in labels):
                return formatter_cls(), node_labels

    return None, node_labels
323
|
+
@functools.lru_cache()
def get_kube_config_context_namespace(context_name: Optional[str] = None) -> str:
    """Get the namespace for a kubernetes context.

    Returns:
        str: The context's namespace if one is configured, otherwise the
            in-cluster service-account namespace (when running in a pod),
            otherwise the default namespace.
    """
    in_cluster_ns_file = '/var/run/secrets/kubernetes.io/serviceaccount/namespace'
    # For in-cluster auth, read the namespace mounted into the pod. This
    # mirrors adaptors.kubernetes._load_config() to stay consistent with
    # in-cluster config loading.
    if context_name == kube_client.in_cluster_context_name() or context_name is None:
        if os.path.exists(in_cluster_ns_file):
            with open(in_cluster_ns_file, encoding='utf-8') as f:
                return f.read().strip()
    # Not in-cluster (or namespace file absent): consult the kubeconfig.
    try:
        contexts, active_context = kubernetes.config.list_kube_config_contexts()
    except kubernetes.config.config_exception.ConfigException:
        return DEFAULT_NAMESPACE
    if context_name is None:
        selected = active_context
    else:
        selected = next((c for c in contexts if c['name'] == context_name), None)
    if selected is None:
        return DEFAULT_NAMESPACE
    return selected['context'].get('namespace', DEFAULT_NAMESPACE)
358
|
+
def check_credentials(
    context: Optional[str], timeout: int = kube_client.API_TIMEOUT
) -> Tuple[bool, Optional[str]]:
    """Check if the credentials in kubeconfig file are valid.

    Performs a lightweight `list pods` call in the context's namespace and
    maps any failure to a human-readable error message.

    Args:
        context (Optional[str]): The Kubernetes context to use. If none, uses
            in-cluster auth to check credentials, if available.
        timeout (int): Timeout in seconds for the test API call

    Returns:
        bool: True if credentials are valid, False otherwise
        str: Error message if credentials are invalid, None otherwise
    """
    try:
        namespace = get_kube_config_context_namespace(context)
        kube_client.core_api(context).list_namespaced_pod(
            namespace, _request_timeout=timeout
        )
        return True, None
    except ImportError:
        # The kubernetes client library is an optional dependency.
        return False, (
            '`kubernetes` package is not installed. '
            'Install it with: pip install kubernetes'
        )
    except kube_client.api_exception() as e:
        # Check if the error is due to invalid credentials
        if e.status == 401:
            return (
                False,
                'Invalid credentials - do you have permission '
                'to access the cluster?',
            )
        else:
            return False, f'Failed to communicate with the cluster: {str(e)}'
    except kube_client.config_exception() as e:
        # Malformed or missing kubeconfig.
        return False, f'Invalid configuration file: {str(e)}'
    except kube_client.max_retry_error():
        # urllib3 exhausted its retries - the API server is unreachable.
        return False, (
            'Failed to communicate with the cluster - timeout. '
            'Check if your cluster is running and your network '
            'is stable.'
        )
    except ValueError as e:
        return False, common_utils.format_exception(e)
    except Exception as e:  # pylint: disable=broad-except
        # Catch-all so credential checks never crash the caller; the error
        # is reported in the returned message instead.
        return False, (
            'An error occurred: '
            f'{common_utils.format_exception(e, use_bracket=True)}'
        )
410
|
+
def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
    """Convert a Kubernetes CPU/GPU quantity (e.g. '500m', '2') to a number.

    Milli-quantities are rounded up to the next whole unit ('500m' -> 1).
    """
    quantity = str(resource_qty_str)
    if quantity[-1] == 'm':
        # Milli-units: divide by 1000 and round up, so '500m' becomes 1.
        return math.ceil(int(quantity[:-1]) / 1000)
    return float(quantity)
419
|
+
def parse_memory_resource(resource_qty_str: str, unit: str = 'B') -> Union[int, float]:
    """Returns memory size in chosen units given a resource quantity string."""
    if unit not in MEMORY_SIZE_UNITS:
        valid_units = ', '.join(MEMORY_SIZE_UNITS.keys())
        raise ValueError(f'Invalid unit: {unit}. Valid units are: {valid_units}')

    quantity = str(resource_qty_str)
    size_in_bytes: Union[int, float]
    try:
        # Plain integer quantities are already in bytes.
        size_in_bytes = int(quantity)
    except ValueError:
        # Split the number from its suffix, e.g. '1.5Gi' -> ('1.5', 'Gi');
        # only the suffix's first character selects the multiplier.
        spaced = re.sub(r'([KMGTPB]+)', r' \1', quantity)
        number, suffix = (part.strip() for part in spaced.split())
        size_in_bytes = float(number) * MEMORY_SIZE_UNITS[suffix[0]]
    return size_in_bytes / MEMORY_SIZE_UNITS[unit]
437
|
+
def combine_pod_config_fields(
    cluster_yaml_path: str,
    cluster_config_overrides: Dict[str, Any],
) -> None:
    """Adds or updates fields in the YAML with fields from the ~/.konduktor/config's
    kubernetes.pod_spec dict.
    This can be used to add fields to the YAML that are not supported by
    yet, or require simple configuration (e.g., adding an
    imagePullSecrets field).
    Note that new fields are added and existing ones are updated. Nested fields
    are not completely replaced, instead their objects are merged. Similarly,
    if a list is encountered in the config, it will be appended to the
    destination list.
    For example, if the YAML has the following:
    ```
    ...
    node_config:
        spec:
            containers:
                - name: ray
                image: rayproject/ray:nightly
    ```
    and the config has the following:
    ```
    kubernetes:
        pod_config:
            spec:
                imagePullSecrets:
                    - name: my-secret
    ```
    then the resulting YAML will be:
    ```
    ...
    node_config:
        spec:
            containers:
                - name: ray
                image: rayproject/ray:nightly
            imagePullSecrets:
                - name: my-secret
    ```
    """
    with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
        yaml_content = f.read()
    yaml_obj = yaml.safe_load(yaml_content)
    # We don't use override_configs in `konduktor_config.get_nested`, as merging
    # the pod config requires special handling.
    kubernetes_config = config.get_nested(
        ('kubernetes', 'pod_config'), default_value={}, override_configs={}
    )
    override_pod_config = cluster_config_overrides.get('kubernetes', {}).get(
        'pod_config', {}
    )
    # NOTE(review): the overrides are merged into `kubernetes_config` below,
    # but the merged result is never written into `yaml_obj`, so the dump at
    # the end writes the file back effectively unchanged. It looks like a
    # merge step into the pod spec inside `yaml_obj` was dropped — confirm
    # against the upstream implementation this file was ported from.
    config.merge_k8s_configs(override_pod_config, kubernetes_config)

    # Write the updated YAML back to the file
    common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
496
|
+
def combine_metadata_fields(cluster_yaml_path: str) -> None:
    """Updates the metadata for all Kubernetes objects created with
    fields from the ~/.konduktor/config's kubernetes.custom_metadata dict.

    Obeys the same add or update semantics as combine_pod_config_fields().
    """

    with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
        yaml_content = f.read()
    yaml_obj = yaml.safe_load(yaml_content)
    custom_metadata = config.get_nested(('kubernetes', 'custom_metadata'), {})

    # List of objects in the cluster YAML to be updated
    combination_destinations = [
        # Service accounts.
        # Fix: `autoscaler_service_account` was listed twice, which made the
        # merge below run twice on the same metadata and (per the documented
        # merge semantics) append list-valued custom metadata to it twice.
        yaml_obj['provider']['autoscaler_service_account']['metadata'],
        yaml_obj['provider']['autoscaler_role']['metadata'],
        yaml_obj['provider']['autoscaler_role_binding']['metadata'],
        # Pod spec.
        # NOTE(review): the 'ray_head_default' key looks inherited from the
        # upstream project this file was ported from — confirm it matches
        # konduktor's cluster YAML schema.
        yaml_obj['available_node_types']['ray_head_default']['node_config']['metadata'],
        # Services for pods
        *[svc['metadata'] for svc in yaml_obj['provider']['services']],
    ]

    for destination in combination_destinations:
        config.merge_k8s_configs(custom_metadata, destination)

    # Write the updated YAML back to the file
    common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
528
|
+
def merge_custom_metadata(original_metadata: Dict[str, Any]) -> None:
    """Merge kubernetes.custom_metadata from config into ``original_metadata``.

    The merge happens in place, so nothing is returned.
    """
    extra_metadata = config.get_nested(('kubernetes', 'custom_metadata'), {})
    config.merge_k8s_configs(extra_metadata, original_metadata)
537
|
+
def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
    """Returns True iff the cluster defines an 'nvidia' RuntimeClass."""
    available_classes = kube_client.node_api(context).list_runtime_class()
    return any(rc.metadata.name == 'nvidia' for rc in available_classes.items)
547
|
+
def check_secret_exists(
    secret_name: str, namespace: str, context: Optional[str]
) -> Tuple[bool, Union[str, Dict[str, Any]]]:
    """Checks if a secret exists in a namespace.

    Args:
        secret_name: Name of secret to check
        namespace: Namespace to check
    Returns:
        bool: True if the secret exists, False otherwise
        str: response payload if True, error string otherwise
    """

    try:
        payload = kube_client.core_api(context).read_namespaced_secret(
            secret_name, namespace, _request_timeout=kube_client.API_TIMEOUT
        )
    except kube_client.api_exception() as e:
        # 404 means "not found" — anything else is a real failure.
        if e.status != 404:
            raise
        return False, str(e)
    return True, payload
572
|
+
def set_secret(
    secret_name: str,
    namespace: str,
    context: Optional[str],
    secret_key: str,
    secret_value: str,
) -> Tuple[bool, Optional[str]]:
    """Create or update a single-key secret in a namespace.

    NOTE(review): ``secret_value`` is stored under ``secret_key`` as given;
    the Kubernetes Secret ``data`` field expects base64-encoded values, so
    callers are expected to pass an already-encoded value — confirm at the
    call sites.

    Returns:
        bool: True on success, False otherwise.
        str: Error message on failure, None otherwise.
    """
    secret_exists, _ = check_secret_exists(
        secret_name=secret_name,
        namespace=namespace,
        context=context,
    )

    # Tag the secret so konduktor-created secrets are identifiable, then
    # apply the user's kubernetes.custom_metadata from ~/.konduktor/config.
    secret_metadata = {'name': secret_name, 'labels': {'parent': 'konduktor'}}
    # Fix: the original called
    # `config.merge_k8s_configs(secret_metadata, custom_metadata)`, i.e. with
    # the arguments swapped relative to every other call site in this file
    # (merge_custom_metadata and combine_metadata_fields both pass the custom
    # metadata first and the destination second) — so the custom metadata was
    # merged into a throwaway dict and never applied to the secret.
    merge_custom_metadata(secret_metadata)

    secret = kubernetes.client.V1Secret(
        metadata=kubernetes.client.V1ObjectMeta(**secret_metadata),
        type='Opaque',
        data={secret_key: secret_value},
    )

    try:
        if secret_exists:
            kube_client.core_api(context).patch_namespaced_secret(
                secret_name, namespace, secret
            )
        else:
            kube_client.core_api(context).create_namespaced_secret(namespace, secret)
    except kube_client.api_exception() as e:
        return False, str(e)
    else:
        logger.debug(
            f'Secret {secret_name} in namespace {namespace} '
            f'in context {context} created/updated'
        )
        return True, None
615
|
+
def get_autoscaler_type() -> Optional[kubernetes_enums.KubernetesAutoscalerType]:
    """Returns the autoscaler type by reading from config."""
    raw_type = config.get_nested(('kubernetes', 'autoscaler'), None)
    if raw_type is None:
        return None
    return kubernetes_enums.KubernetesAutoscalerType(raw_type)
623
|
+
# TODO(asaiacai): some checks here for CRDs for jobset and Kueue CRDs, queues, etc.
|
624
|
+
def is_label_valid(label_key: str, label_value: str) -> Tuple[bool, Optional[str]]:
    """Validate a Kubernetes label key/value pair.

    Returns (True, None) when both parts are valid, otherwise
    (False, error message). When both parts are invalid, the message
    reports the value (matching the original precedence).
    """
    # Kubernetes labels can be of the format <domain>/<key>: <value>
    key_regex = re.compile(
        # Look-ahead to ensure proper domain formatting up to a slash
        r'^(?:(?=[a-z0-9]([-a-z0-9.]*[a-z0-9])?\/)'
        # Match domain: starts and ends with alphanum up to 253 chars
        # including a slash in the domain.
        r'[a-z0-9]([-a-z0-9.]{0,251}[a-z0-9])?\/)?'
        # Match key: starts and ends with alphanum, upto to 63 chars.
        r'[a-z0-9]([-a-z0-9_.]{0,61}[a-z0-9])?$'
    )
    value_regex = re.compile(r'^([a-zA-Z0-9]([-a-zA-Z0-9_.]{0,61}[a-zA-Z0-9])?)?$')
    condition_msg = (
        'Value must consist of alphanumeric characters or '
        "'-', '_', '.', and must be no more than 63 "
        'characters in length.'
    )
    error_msg = None
    if not key_regex.match(label_key):
        error_msg = f'Invalid label key {label_key} for Kubernetes. ' f'{condition_msg}'
    if not value_regex.match(label_value):
        error_msg = (
            f'Invalid label value {label_value} for Kubernetes. ' f'{condition_msg}'
        )
    if error_msg is not None:
        return False, error_msg
    return True, None