skypilot-nightly 1.0.0.dev20241112__py3-none-any.whl → 1.0.0.dev20241113__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +1 -0
- sky/cli.py +18 -6
- sky/clouds/cloud.py +2 -0
- sky/clouds/kubernetes.py +19 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +83 -65
- sky/clouds/utils/gcp_utils.py +5 -1
- sky/jobs/core.py +2 -0
- sky/optimizer.py +2 -0
- sky/provision/__init__.py +2 -0
- sky/provision/kubernetes/instance.py +125 -55
- sky/provision/kubernetes/utils.py +348 -101
- sky/resources.py +38 -27
- sky/serve/serve_utils.py +79 -78
- sky/skylet/log_lib.py +1 -4
- sky/templates/kubernetes-ray.yml.j2 +29 -3
- sky/utils/kubernetes/gpu_labeler.py +2 -2
- sky/utils/log_utils.py +52 -1
- sky/utils/timeline.py +3 -1
- {skypilot_nightly-1.0.0.dev20241112.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20241112.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/RECORD +25 -25
- {skypilot_nightly-1.0.0.dev20241112.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241112.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241112.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241112.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/top_level.txt +0 -0
The hunks below are from sky/provision/kubernetes/utils.py.

@@ -48,10 +48,18 @@ MEMORY_SIZE_UNITS = {
     'T': 2**40,
     'P': 2**50,
 }
-
-
-
-
+
+# The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on
+# nodes. These keys are typically used in the node's status.allocatable
+# or status.capacity fields to indicate the available resources on the node.
+GPU_RESOURCE_KEY = 'nvidia.com/gpu'
+TPU_RESOURCE_KEY = 'google.com/tpu'
+
+NO_ACCELERATOR_HELP_MESSAGE = (
+    'If your cluster contains GPUs or TPUs, make sure '
+    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
+    'on the nodes and the node labels for identifying GPUs/TPUs '
+    '(e.g., skypilot.co/accelerator) are setup correctly. ')
 
 KUBERNETES_AUTOSCALER_NOTE = (
     'Note: Kubernetes cluster autoscaling is enabled. '
@@ -74,6 +82,17 @@ PORT_FORWARD_PROXY_CMD_VERSION = 2
 PORT_FORWARD_PROXY_CMD_PATH = ('~/.sky/kubernetes-port-forward-proxy-command-'
                                f'v{PORT_FORWARD_PROXY_CMD_VERSION}.sh')
 
+# Mapping used to get generation for TPU accelerator name.
+# https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run
+GKE_TPU_ACCELERATOR_TO_GENERATION = {
+    'tpu-v4-podslice': 'v4',
+    # Only Single-host v5e TPU configurations are allowed.
+    'tpu-v5-lite-device': 'v5e',
+    # Multi-host compatible v5e TPU configurations allowed.
+    'tpu-v5-lite-podslice': 'v5e',
+    'tpu-v5p-slice': 'v5p',
+}
+
 POD_STATUSES = {
     'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown', 'Terminating'
 }
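For reference, the new GKE_TPU_ACCELERATOR_TO_GENERATION mapping is keyed by the GKE TPU accelerator names and is later used to recognize TPU requests (see is_tpu_on_gke further down in this diff). A small illustrative sketch of such a lookup; the tpu_generation helper below is hypothetical and not part of the package:

    # Illustrative only: mirrors how the new mapping can be queried.
    GKE_TPU_ACCELERATOR_TO_GENERATION = {
        'tpu-v4-podslice': 'v4',
        'tpu-v5-lite-device': 'v5e',
        'tpu-v5-lite-podslice': 'v5e',
        'tpu-v5p-slice': 'v5p',
    }

    def tpu_generation(accelerator: str) -> str:
        # Raises KeyError for non-TPU accelerators such as 'A100'.
        return GKE_TPU_ACCELERATOR_TO_GENERATION[accelerator]

    assert tpu_generation('tpu-v5p-slice') == 'v5p'
    # Membership in the mapping is what is_tpu_on_gke checks later in the diff.
    assert 'tpu-v4-podslice' in GKE_TPU_ACCELERATOR_TO_GENERATION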
@@ -96,15 +115,25 @@ class GPULabelFormatter:
     """
 
     @classmethod
-    def get_label_key(cls) -> str:
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         """Returns the label key for GPU type used by the Kubernetes cluster"""
         raise NotImplementedError
 
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        """Returns a list of label keys for GPU used by Kubernetes cluster."""
+        raise NotImplementedError
+
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
         """Given a GPU type, returns the label value to be used"""
         raise NotImplementedError
 
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        """Checks if the given label key matches the formatter's label keys"""
+        raise NotImplementedError
+
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
         """Given a label value, returns the GPU type"""
@@ -126,10 +155,11 @@ class GPULabelFormatter:
 
 
 def get_gke_accelerator_name(accelerator: str) -> str:
-    """Returns the accelerator name for GKE clusters
+    """Returns the accelerator name for GKE clusters.
 
     Uses the format - nvidia-tesla-<accelerator>.
-    A100-80GB, H100-80GB
+    A100-80GB, H100-80GB, L4 are an exception. They use nvidia-<accelerator>.
+    TPU types are an exception as well keeping the given name.
     """
     if accelerator == 'H100':
         # H100 is named as H100-80GB in GKE.
@@ -138,6 +168,8 @@ def get_gke_accelerator_name(accelerator: str) -> str:
         # A100-80GB, L4, H100-80GB and H100-MEGA-80GB
         # have a different name pattern.
         return 'nvidia-{}'.format(accelerator.lower())
+    elif accelerator.startswith('tpu-'):
+        return accelerator
     else:
         return 'nvidia-tesla-{}'.format(accelerator.lower())
 
@@ -152,15 +184,23 @@ class SkyPilotLabelFormatter(GPULabelFormatter):
     LABEL_KEY = 'skypilot.co/accelerator'
 
     @classmethod
-    def get_label_key(cls) -> str:
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
 
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        return [cls.LABEL_KEY]
+
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
         # For SkyPilot formatter, we use the accelerator str directly.
         # See sky.utils.kubernetes.gpu_labeler.
         return accelerator.lower()
 
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        return label_key == cls.LABEL_KEY
+
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
         return value.upper()
@@ -184,13 +224,21 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
     LABEL_KEY = 'gpu.nvidia.com/class'
 
     @classmethod
-    def get_label_key(cls) -> str:
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
 
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        return [cls.LABEL_KEY]
+
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
         return accelerator.upper()
 
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        return label_key == cls.LABEL_KEY
+
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
         return value
@@ -203,11 +251,28 @@ class GKELabelFormatter(GPULabelFormatter):
     label, which is used to identify the GPU type.
     """
 
-
+    GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
+    TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'
+    ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'
+    TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology'
 
     @classmethod
-    def get_label_key(cls) -> str:
-
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
+        if accelerator is not None and accelerator.startswith('tpu-'):
+            return cls.TPU_LABEL_KEY
+        return cls.GPU_LABEL_KEY
+
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        return [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY]
+
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        return label_key in cls.get_label_keys()
+
+    @classmethod
+    def get_tpu_topology_label_key(cls) -> str:
+        return cls.TPU_TOPOLOGY_LABEL_KEY
 
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
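A compact way to see what the extended GKE formatter now returns is the standalone sketch below, which reimplements just the label-key selection and matching from the hunk above (it is not an import of the SkyPilot class):

    from typing import Optional

    GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
    TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'

    def get_label_key(accelerator: Optional[str] = None) -> str:
        # Mirrors GKELabelFormatter.get_label_key above: TPU requests use the
        # gke-tpu-accelerator label, everything else uses gke-accelerator.
        if accelerator is not None and accelerator.startswith('tpu-'):
            return TPU_LABEL_KEY
        return GPU_LABEL_KEY

    def match_label_key(label_key: str) -> bool:
        # Mirrors GKELabelFormatter.match_label_key above.
        return label_key in (GPU_LABEL_KEY, TPU_LABEL_KEY)

    assert get_label_key('tpu-v5p-slice') == TPU_LABEL_KEY
    assert get_label_key('H100') == GPU_LABEL_KEY
    assert match_label_key(TPU_LABEL_KEY)
    assert not match_label_key('nvidia.com/gpu.product')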
@@ -225,6 +290,8 @@ class GKELabelFormatter(GPULabelFormatter):
                 # to distinguish between a3-high and a3-mega instances
                 return 'H100'
             return acc
+        elif is_tpu_on_gke(value):
+            return value
         else:
             raise ValueError(
                 f'Invalid accelerator name in GKE cluster: {value}')
@@ -248,9 +315,13 @@ class GFDLabelFormatter(GPULabelFormatter):
     LABEL_KEY = 'nvidia.com/gpu.product'
 
     @classmethod
-    def get_label_key(cls) -> str:
+    def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
 
+    @classmethod
+    def get_label_keys(cls) -> List[str]:
+        return [cls.LABEL_KEY]
+
     @classmethod
     def get_label_value(cls, accelerator: str) -> str:
         """An accelerator can map to many Nvidia GFD labels
@@ -258,6 +329,10 @@ class GFDLabelFormatter(GPULabelFormatter):
         As a result, we do not support get_label_value for GFDLabelFormatter."""
         raise NotImplementedError
 
+    @classmethod
+    def match_label_key(cls, label_key: str) -> bool:
+        return label_key == cls.LABEL_KEY
+
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
         """Searches against a canonical list of NVIDIA GPUs and pattern
@@ -335,10 +410,9 @@ def detect_gpu_label_formatter(
 
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
-        label_key = lf.get_label_key()
         for _, label_list in node_labels.items():
             for label, _ in label_list:
-                if
+                if lf.match_label_key(label):
                     label_formatter = lf()
                     return label_formatter, node_labels
 
@@ -346,24 +420,28 @@
 
 
 @functools.lru_cache(maxsize=10)
-def
-
+def detect_accelerator_resource(
+        context: Optional[str]) -> Tuple[bool, Set[str]]:
+    """Checks if the Kubernetes cluster has GPU/TPU resource.
 
-
-
-
+    Two types of accelerator resources are available which are each checked
+    with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is
+    missing, that typically means that the Kubernetes cluster does not have
+    GPUs or the nvidia GPU operator and/or device drivers are not installed.
 
     Returns:
-        bool: True if the cluster has
+        bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY
+            resource, False otherwise.
     """
     # Get the set of resources across all nodes
     cluster_resources: Set[str] = set()
     nodes = get_kubernetes_nodes(context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-
+    has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or
+                       TPU_RESOURCE_KEY in cluster_resources)
 
-    return
+    return has_accelerator, cluster_resources
 
 
 @functools.lru_cache(maxsize=10)
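In effect, the renamed helper reports whether either resource key appears in any node's status.allocatable. A rough standalone approximation, assuming plain dicts of allocatable resources rather than live Kubernetes node objects (the function name below is illustrative, not the SkyPilot API):

    from typing import Dict, List, Set, Tuple

    GPU_RESOURCE_KEY = 'nvidia.com/gpu'
    TPU_RESOURCE_KEY = 'google.com/tpu'

    def detect_accelerator_resource_from_allocatable(
            allocatable_per_node: List[Dict[str, str]]) -> Tuple[bool, Set[str]]:
        """Standalone approximation of the logic above."""
        cluster_resources: Set[str] = set()
        for allocatable in allocatable_per_node:
            cluster_resources.update(allocatable.keys())
        has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or
                           TPU_RESOURCE_KEY in cluster_resources)
        return has_accelerator, cluster_resources

    # Example: one CPU-only node and one node exposing 4 TPU chips.
    print(detect_accelerator_resource_from_allocatable(
        [{'cpu': '8', 'memory': '32Gi'},
         {'cpu': '16', 'google.com/tpu': '4'}]))  # (True, {...})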
@@ -451,16 +529,52 @@ def check_instance_fits(context: Optional[str],
                        'Maximum resources found on a single node: '
                        f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
 
+    def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
+                       node_list: List[Any]) -> Tuple[bool, Optional[str]]:
+        """Checks if the instance fits on the cluster based on requested TPU.
+
+        It checks if the TPU type and count on each node match the required
+        number of TPU chips for the instance. In the case of multi-host TPU
+        podslice, the function ensures that the number of TPU chips on a single
+        node (node_tpu_chip_count) and the total TPU chips across the entire
+        podslice (topology_chip_count) are correctly handled.
+        """
+        acc_type = candidate_instance_type.accelerator_type
+        acc_count = candidate_instance_type.accelerator_count
+        tpu_list_in_cluster = []
+        for node in node_list:
+            if acc_type == node.metadata.labels[
+                    GKELabelFormatter.TPU_LABEL_KEY]:
+                # TODO(Doyoung): Update the logic when adding support for
+                # multi-host TPUs.
+                if is_multi_host_tpu(node.metadata.labels):
+                    continue
+                node_tpu_chip_count = int(node.metadata.labels[
+                    GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY])
+                tpu_type = f'{acc_type}:{node_tpu_chip_count}'
+                tpu_list_in_cluster.append(tpu_type)
+                if node_tpu_chip_count == acc_count:
+                    return True, None
+        tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster)
+        # TODO(Doyoung): Update the error message raised with the multi-host
+        # TPU support.
+        return False, ('Requested TPU type was not found in the cluster. TPU '
+                       'types found in the cluster: '
+                       f'{tpu_list_in_cluster_str}. Note that multi-host TPU '
+                       'podslices are currently not unsupported.')
+
     nodes = get_kubernetes_nodes(context)
     k8s_instance_type = KubernetesInstanceType.\
         from_instance_type(instance)
     acc_type = k8s_instance_type.accelerator_type
+    acc_count = k8s_instance_type.accelerator_count
     if acc_type is not None:
-        # If
-        # check if CPU and memory requirements on the specific node are
+        # If GPU/TPUs are requested, check if GPU/TPU type is available, and
+        # if so, check if CPU and memory requirements on the specific node are
+        # met.
         try:
-            gpu_label_key, gpu_label_val =
-                context, acc_type)
+            gpu_label_key, gpu_label_val, _, _ = (
+                get_accelerator_label_key_value(context, acc_type, acc_count))
         except exceptions.ResourcesUnavailableError as e:
            # If GPU not found, return empty list and error message.
            return False, str(e)
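The new TPU fit check boils down to comparing the requested chip count against the gke-accelerator-count label on each candidate node (ignoring the multi-host skip). A simplified standalone rendering, with plain dicts standing in for node label metadata; the names here are illustrative, not the SkyPilot API:

    from typing import Dict, List, Optional, Tuple

    TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'
    ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'

    def tpu_fits(acc_type: str, acc_count: int,
                 node_labels_list: List[Dict[str, str]]) -> Tuple[bool, Optional[str]]:
        """Simplified stand-in for the nested check_tpu_fits above."""
        found = []
        for labels in node_labels_list:
            if labels.get(TPU_LABEL_KEY) == acc_type:
                node_chips = int(labels[ACCELERATOR_COUNT_LABEL_KEY])
                found.append(f'{acc_type}:{node_chips}')
                if node_chips == acc_count:
                    return True, None
        return False, ('Requested TPU type was not found in the cluster. '
                       f'TPU types found: {",".join(found)}')

    nodes = [{TPU_LABEL_KEY: 'tpu-v5-lite-podslice',
              ACCELERATOR_COUNT_LABEL_KEY: '4'}]
    print(tpu_fits('tpu-v5-lite-podslice', 4, nodes))  # (True, None)
    print(tpu_fits('tpu-v5p-slice', 8, nodes))         # (False, '...')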
@@ -470,6 +584,13 @@ def check_instance_fits(context: Optional[str],
             node.metadata.labels[gpu_label_key] == gpu_label_val
         ]
         assert len(gpu_nodes) > 0, 'GPU nodes not found'
+        if is_tpu_on_gke(acc_type):
+            # If requested accelerator is a TPU type, check if the cluster
+            # has sufficient TPU resource to meet the requirement.
+            fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes)
+            if reason is not None:
+                return fits, reason
+
         candidate_nodes = gpu_nodes
         not_fit_reason_prefix = (
             f'GPU nodes with {acc_type} do not have '
@@ -481,7 +602,7 @@ def check_instance_fits(context: Optional[str],
             f'CPU (> {k8s_instance_type.cpus} CPUs) '
             'and/or memory '
             f'(> {k8s_instance_type.memory} G). ')
-        # Check if
+        # Check if CPU and memory requirements are met on at least one
         # candidate node.
         fits, reason = check_cpu_mem_fits(k8s_instance_type, candidate_nodes)
         if not fits:
@@ -492,25 +613,33 @@ def check_instance_fits(context: Optional[str],
     return fits, reason
 
 
-def
-
-
-
+def get_accelerator_label_key_value(
+    context: Optional[str],
+    acc_type: str,
+    acc_count: Optional[int],
+    check_mode=False
+) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
+    """Returns the label key and value for the given GPU/TPU type.
 
     Args:
-        acc_type: The GPU type required by the task.
-
-
-
-
+        acc_type: The GPU/TPU type required by the task.
+        acc_count: Number of GPU/TPUs required by the task.
+        check_mode: If True, only checks if the cluster has GPU/TPU resources
+            and labels are setup on the cluster. acc_type is ignore does not
+            return the label key and value. Useful for checking if GPUs are
+            configured correctly on the cluster without explicitly requesting
+            a acc_type.
     Returns:
-        A tuple of the label key
-
+        A tuple of the accelerator label key, value, topology label key, and
+        topology value. The topology label key and value are populated only if
+        the requested accelerator type is TPU. Returns None if check_mode is
+        True.
     Raises:
         ResourcesUnavailableError: Can be raised from the following conditions:
-            - The cluster does not have GPU resources
-
-            - The cluster
+            - The cluster does not have GPU/TPU resources
+                (nvidia.com/gpu, google.com/tpu)
+            - The cluster does not have GPU/TPU labels setup correctly
+            - The cluster doesn't have any nodes with acc_type GPU/TPU
     """
     # Check if the cluster has GPU resources
     # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
@@ -529,13 +658,14 @@ def get_gpu_label_key_value(context: Optional[str],
             # If check mode is enabled and autoscaler is set, we can return
             # early since we assume the cluster autoscaler will handle GPU
             # node provisioning.
-            return
+            return None, None, None, None
         formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
         assert formatter is not None, ('Unsupported autoscaler type:'
                                        f' {autoscaler_type}')
-        return formatter.get_label_key(), formatter.get_label_value(
+        return formatter.get_label_key(acc_type), formatter.get_label_value(
+            acc_type), None, None
 
-    has_gpus, cluster_resources =
+    has_gpus, cluster_resources = detect_accelerator_resource(context)
     if has_gpus:
         # Check if the cluster has GPU labels setup correctly
         label_formatter, node_labels = \
@@ -544,8 +674,10 @@ def get_gpu_label_key_value(context: Optional[str],
             # If none of the GPU labels from LABEL_FORMATTER_REGISTRY are
             # detected, raise error
             with ux_utils.print_exception_no_traceback():
-                supported_formats = ', '.join(
-
+                supported_formats = ', '.join([
+                    key for f in LABEL_FORMATTER_REGISTRY
+                    for key in f.get_label_keys()
+                ])
                 suffix = ''
                 if env_options.Options.SHOW_DEBUG_INFO.get():
                     suffix = f' Found node labels: {node_labels}'
@@ -561,7 +693,7 @@ def get_gpu_label_key_value(context: Optional[str],
             # correctly setup and will behave as expected.
             for node_name, label_list in node_labels.items():
                 for label, value in label_list:
-                    if
+                    if label_formatter.match_label_key(label):
                         is_valid, reason = label_formatter.validate_label_value(
                             value)
                         if not is_valid:
@@ -571,8 +703,7 @@ def get_gpu_label_key_value(context: Optional[str],
             if check_mode:
                 # If check mode is enabled and we reached so far, we can
                 # conclude that the cluster is setup correctly and return.
-                return
-            k8s_acc_label_key = label_formatter.get_label_key()
+                return None, None, None, None
             # Search in node_labels to see if any node has the requested
             # GPU type.
             # Note - this only checks if the label is available on a
@@ -580,11 +711,38 @@ def get_gpu_label_key_value(context: Optional[str],
             # quantity is available since that is dynamic and can change
             # during scheduling.
             for node_name, label_list in node_labels.items():
+                node_metadata_labels = dict(label_list)
+                # TODO(Doyoung): Update the logic when adding support for
+                # multi-host TPUs.
+                if is_multi_host_tpu(node_metadata_labels):
+                    continue
                 for label, value in label_list:
-                    if (label
+                    if (label_formatter.match_label_key(label) and
                             label_formatter.get_accelerator_from_label_value(
                                 value) == acc_type):
-
+                        if is_tpu_on_gke(acc_type):
+                            assert isinstance(label_formatter,
+                                              GKELabelFormatter)
+                            if node_metadata_labels.get(
+                                    label_formatter.TPU_LABEL_KEY) == acc_type:
+                                topology_label_key = (
+                                    label_formatter.TPU_TOPOLOGY_LABEL_KEY)
+                                topology_value = node_metadata_labels.get(
+                                    topology_label_key)
+                                assert topology_value is not None
+                                tpu_topology_chip_count = reduce_tpu_topology(
+                                    topology_value)
+                                # For single-host TPUs, there aren't multiple
+                                # different topologies that maps to identical
+                                # number of TPU chips.
+                                if tpu_topology_chip_count == acc_count:
+                                    return (label, value, topology_label_key,
+                                            topology_value)
+                                else:
+                                    continue
+                        else:
+                            return label, value, None, None
+
             # If no node is found with the requested acc_type, raise error
             with ux_utils.print_exception_no_traceback():
                 suffix = ''
@@ -592,15 +750,19 @@ def get_gpu_label_key_value(context: Optional[str],
                     all_labels = []
                     for node_name, label_list in node_labels.items():
                         all_labels.extend(label_list)
-
-
-                    suffix =
+                    acc_available = set(v for k, v in all_labels
+                                        if label_formatter.match_label_key(k))
+                    suffix = (' Available GPU/TPUs on the cluster: '
+                              f'{acc_available}')
+                # TODO(Doyoung): Update the error message raised with the
+                # multi-host TPU support.
                 raise exceptions.ResourcesUnavailableError(
                     'Could not find any node in the Kubernetes cluster '
-                    f'with {acc_type}
-                    f'
-                    '
-                    f'
+                    f'with {acc_type}. Please ensure at least one node in the '
+                    f'cluster has {acc_type} and node labels are setup '
+                    'correctly. Please refer to the documentration for more. '
+                    f'{suffix}. Note that multi-host TPU podslices are '
+                    'currently not unsupported.')
     else:
         # If GPU resources are not detected, raise error
         with ux_utils.print_exception_no_traceback():
@@ -609,13 +771,14 @@ def get_gpu_label_key_value(context: Optional[str],
                 suffix = (' Available resources on the cluster: '
                           f'{cluster_resources}')
             raise exceptions.ResourcesUnavailableError(
-                'Could not detect GPU resources (
-                'Kubernetes cluster. If this cluster
-                'ensure GPU drivers are installed on
-                'GPUs are setup correctly by running
-                'and looking for the
-                '
-
+                f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
+                f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
+                ' contains GPUs, please ensure GPU drivers are installed on '
+                'the node. Check if the GPUs are setup correctly by running '
+                '`kubectl describe nodes` and looking for the '
+                f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
+                'Please refer to the documentation on how to set up GPUs.'
+                f'{suffix}')
 
 
 def get_head_ssh_port(cluster_name: str, namespace: str,
@@ -710,7 +873,10 @@ def check_credentials(context: Optional[str],
     # provider if their cluster GPUs are not setup correctly.
     gpu_msg = ''
     try:
-
+        get_accelerator_label_key_value(context,
+                                        acc_type='',
+                                        acc_count=0,
+                                        check_mode=True)
     except exceptions.ResourcesUnavailableError as e:
         # If GPUs are not available, we return cluster as enabled (since it can
         # be a CPU-only cluster) but we also return the exception message which
@@ -1787,7 +1953,7 @@ def dict_to_k8s_object(object_dict: Dict[str, Any], object_type: 'str') -> Any:
 class KubernetesNodeInfo:
     """Dataclass to store Kubernetes node information."""
     name: str
-
+    accelerator_type: Optional[str]
     # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
     total: Dict[str, int]
     free: Dict[str, int]
@@ -1818,47 +1984,54 @@ def get_kubernetes_node_info(
         else:
             raise
 
-
-    if not
+    lf, _ = detect_gpu_label_formatter(context)
+    if not lf:
         label_key = None
     else:
-
+        label_keys = lf.get_label_keys()
 
     node_info_dict: Dict[str, KubernetesNodeInfo] = {}
 
-    for
-
-
-
-
-
-
+    for label_key in label_keys:
+        for node in nodes:
+            allocated_qty = 0
+            if lf is not None and label_key in node.metadata.labels:
+                accelerator_name = lf.get_accelerator_from_label_value(
+                    node.metadata.labels.get(label_key))
+            else:
+                accelerator_name = None
 
-
-
+            accelerator_count = get_node_accelerator_count(
+                node.status.allocatable)
 
-
-
+            if pods is None:
+                accelerators_available = -1
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            else:
+                for pod in pods:
+                    # Get all the pods running on the node
+                    if (pod.spec.node_name == node.metadata.name and
+                            pod.status.phase in ['Running', 'Pending']):
+                        # Iterate over all the containers in the pod and sum the
+                        # GPU requests
+                        for container in pod.spec.containers:
+                            if container.resources.requests:
+                                allocated_qty += get_node_accelerator_count(
+                                    container.resources.requests)
+
+                accelerators_available = accelerator_count - allocated_qty
+
+            # Exclude multi-host TPUs from being processed.
+            # TODO(Doyoung): Remove the logic when adding support for
+            # multi-host TPUs.
+            if is_multi_host_tpu(node.metadata.labels):
+                continue
+
+            node_info_dict[node.metadata.name] = KubernetesNodeInfo(
+                name=node.metadata.name,
+                accelerator_type=accelerator_name,
+                total={'accelerator_count': int(accelerator_count)},
+                free={'accelerators_available': int(accelerators_available)})
 
     return node_info_dict
 
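The rewritten loop derives the per-node free count by subtracting the accelerators requested by Running/Pending pods from the node's allocatable count. A small self-contained illustration of that arithmetic, with plain dicts standing in for the Kubernetes objects:

    GPU_RESOURCE_KEY = 'nvidia.com/gpu'

    # One node exposing 8 GPUs, with two pods requesting 2 and 4 GPUs.
    node_allocatable = {GPU_RESOURCE_KEY: '8'}
    pod_requests_on_node = [{GPU_RESOURCE_KEY: '2'}, {GPU_RESOURCE_KEY: '4'}, {}]

    accelerator_count = int(node_allocatable.get(GPU_RESOURCE_KEY, 0))
    allocated_qty = sum(
        int(req.get(GPU_RESOURCE_KEY, 0)) for req in pod_requests_on_node)
    accelerators_available = accelerator_count - allocated_qty
    print(accelerators_available)  # 2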
@@ -2040,6 +2213,80 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
     return pods
 
 
+def is_tpu_on_gke(accelerator: str) -> bool:
+    """Determins if the given accelerator is a TPU supported on GKE."""
+    return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
+
+
+def get_node_accelerator_count(attribute_dict: dict) -> int:
+    """Retrieves the count of accelerators from a node's resource dictionary.
+
+    This method checks the node's allocatable resources or the accelerators
+    already deployed on the node, using pod objects that describe resource
+    requests.
+
+    Args:
+        attribute_dict: Containing resource information from a node, such as
+            allocatable or requested resources.
+
+    Returns:
+        Number of accelerators allocated or available from the node. If no
+        resource is found, it returns 0.
+    """
+    assert not (GPU_RESOURCE_KEY in attribute_dict and
+                TPU_RESOURCE_KEY in attribute_dict)
+    if GPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[GPU_RESOURCE_KEY])
+    elif TPU_RESOURCE_KEY in attribute_dict:
+        return int(attribute_dict[TPU_RESOURCE_KEY])
+    return 0
+
+
+def reduce_tpu_topology(topology: str) -> int:
+    """Computes the number of TPU chips from its topology string."""
+    chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
+    # tpu_topology_chip_count represents the total number of TPU chips in the
+    # entire podslice, whether it is a single-host or multi-host TPU podslice.
+    tpu_topology_chip_count = functools.reduce(lambda x, y: x * y,
+                                               chip_dimensions)
+    return tpu_topology_chip_count
+
+
+def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
+    """Determines whether the given node is a multi-host TPU configuration."""
+    if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
+        assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
+        topology_value = (
+            node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY])
+        accelerator_count_label_key = (
+            GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
+        assert accelerator_count_label_key in node_metadata_labels
+        # node_tpu_chip_count represents the number of TPU chips
+        # available in this node. If the node is part of a node pool
+        # forming a multi-host TPU podslice, it only reflects the
+        # number of TPU chips in this individual node, not the entire
+        # multi-host TPU podslice.
+        node_tpu_chip_count = int(
+            node_metadata_labels[accelerator_count_label_key])
+        topology_chip_count = reduce_tpu_topology(topology_value)
+        # For multi-host TPU podslices, topology_chip_count and
+        # node_tpu_chip_count will differ, as topology_chip_count
+        # reflects the total across all hosts, while
+        # node_tpu_chip_count reflects only the chips in a single node.
+        if node_tpu_chip_count != topology_chip_count:
+            return True
+    return False
+
+
+def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
+    """Checks if there exists a multi-host TPU within the cluster."""
+    nodes = get_kubernetes_nodes(context)
+    for node in nodes:
+        if is_multi_host_tpu(node.metadata.labels):
+            return True
+    return False
+
+
 @dataclasses.dataclass
 class KubernetesSkyPilotClusterInfo:
     cluster_name_on_cloud: str