skypilot-nightly 1.0.0.dev20241112__py3-none-any.whl → 1.0.0.dev20241113__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -48,10 +48,18 @@ MEMORY_SIZE_UNITS = {
48
48
  'T': 2**40,
49
49
  'P': 2**50,
50
50
  }
51
- NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure '
52
- 'nvidia.com/gpu resource is available on the nodes and '
53
- 'the node labels for identifying GPUs '
54
- '(e.g., skypilot.co/accelerator) are setup correctly. ')
51
+
52
+ # The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on
53
+ # nodes. These keys are typically used in the node's status.allocatable
54
+ # or status.capacity fields to indicate the available resources on the node.
55
+ GPU_RESOURCE_KEY = 'nvidia.com/gpu'
56
+ TPU_RESOURCE_KEY = 'google.com/tpu'
57
+
58
+ NO_ACCELERATOR_HELP_MESSAGE = (
59
+ 'If your cluster contains GPUs or TPUs, make sure '
60
+ f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
61
+ 'on the nodes and the node labels for identifying GPUs/TPUs '
62
+ '(e.g., skypilot.co/accelerator) are setup correctly. ')
55
63
 
56
64
  KUBERNETES_AUTOSCALER_NOTE = (
57
65
  'Note: Kubernetes cluster autoscaling is enabled. '
@@ -74,6 +82,17 @@ PORT_FORWARD_PROXY_CMD_VERSION = 2
74
82
  PORT_FORWARD_PROXY_CMD_PATH = ('~/.sky/kubernetes-port-forward-proxy-command-'
75
83
  f'v{PORT_FORWARD_PROXY_CMD_VERSION}.sh')
76
84
 
85
+ # Mapping from TPU accelerator name to its generation.
86
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/tpus#run
87
+ GKE_TPU_ACCELERATOR_TO_GENERATION = {
88
+ 'tpu-v4-podslice': 'v4',
89
+ # Only Single-host v5e TPU configurations are allowed.
90
+ 'tpu-v5-lite-device': 'v5e',
91
+ # Multi-host compatible v5e TPU configurations are allowed.
92
+ 'tpu-v5-lite-podslice': 'v5e',
93
+ 'tpu-v5p-slice': 'v5p',
94
+ }
95
+
77
96
  POD_STATUSES = {
78
97
  'Pending', 'Running', 'Succeeded', 'Failed', 'Unknown', 'Terminating'
79
98
  }
@@ -96,15 +115,25 @@ class GPULabelFormatter:
96
115
  """
97
116
 
98
117
  @classmethod
99
- def get_label_key(cls) -> str:
118
+ def get_label_key(cls, accelerator: Optional[str] = None) -> str:
100
119
  """Returns the label key for GPU type used by the Kubernetes cluster"""
101
120
  raise NotImplementedError
102
121
 
122
+ @classmethod
123
+ def get_label_keys(cls) -> List[str]:
124
+ """Returns a list of label keys for GPU used by Kubernetes cluster."""
125
+ raise NotImplementedError
126
+
103
127
  @classmethod
104
128
  def get_label_value(cls, accelerator: str) -> str:
105
129
  """Given a GPU type, returns the label value to be used"""
106
130
  raise NotImplementedError
107
131
 
132
+ @classmethod
133
+ def match_label_key(cls, label_key: str) -> bool:
134
+ """Checks if the given label key matches the formatter's label keys"""
135
+ raise NotImplementedError
136
+
108
137
  @classmethod
109
138
  def get_accelerator_from_label_value(cls, value: str) -> str:
110
139
  """Given a label value, returns the GPU type"""
@@ -126,10 +155,11 @@ class GPULabelFormatter:
126
155
 
127
156
 
128
157
  def get_gke_accelerator_name(accelerator: str) -> str:
129
- """Returns the accelerator name for GKE clusters
158
+ """Returns the accelerator name for GKE clusters.
130
159
 
131
160
  Uses the format - nvidia-tesla-<accelerator>.
132
- A100-80GB, H100-80GB and L4 are an exception. They use nvidia-<accelerator>.
161
+ A100-80GB, H100-80GB and L4 are exceptions. They use nvidia-<accelerator>.
162
+ TPU types are also an exception; they keep the given name.
133
163
  """
134
164
  if accelerator == 'H100':
135
165
  # H100 is named as H100-80GB in GKE.
@@ -138,6 +168,8 @@ def get_gke_accelerator_name(accelerator: str) -> str:
138
168
  # A100-80GB, L4, H100-80GB and H100-MEGA-80GB
139
169
  # have a different name pattern.
140
170
  return 'nvidia-{}'.format(accelerator.lower())
171
+ elif accelerator.startswith('tpu-'):
172
+ return accelerator
141
173
  else:
142
174
  return 'nvidia-tesla-{}'.format(accelerator.lower())
143
175
 
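To make the naming rules above concrete, a short sketch of the expected outputs, inferred from the docstring and the branches shown in this hunk (accelerator names are illustrative):

    get_gke_accelerator_name('T4')               # -> 'nvidia-tesla-t4' (default rule)
    get_gke_accelerator_name('A100-80GB')        # -> 'nvidia-a100-80gb' (exception list)
    get_gke_accelerator_name('tpu-v4-podslice')  # -> 'tpu-v4-podslice' (new TPU branch)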
@@ -152,15 +184,23 @@ class SkyPilotLabelFormatter(GPULabelFormatter):
152
184
  LABEL_KEY = 'skypilot.co/accelerator'
153
185
 
154
186
  @classmethod
155
- def get_label_key(cls) -> str:
187
+ def get_label_key(cls, accelerator: Optional[str] = None) -> str:
156
188
  return cls.LABEL_KEY
157
189
 
190
+ @classmethod
191
+ def get_label_keys(cls) -> List[str]:
192
+ return [cls.LABEL_KEY]
193
+
158
194
  @classmethod
159
195
  def get_label_value(cls, accelerator: str) -> str:
160
196
  # For SkyPilot formatter, we use the accelerator str directly.
161
197
  # See sky.utils.kubernetes.gpu_labeler.
162
198
  return accelerator.lower()
163
199
 
200
+ @classmethod
201
+ def match_label_key(cls, label_key: str) -> bool:
202
+ return label_key == cls.LABEL_KEY
203
+
164
204
  @classmethod
165
205
  def get_accelerator_from_label_value(cls, value: str) -> str:
166
206
  return value.upper()
@@ -184,13 +224,21 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
184
224
  LABEL_KEY = 'gpu.nvidia.com/class'
185
225
 
186
226
  @classmethod
187
- def get_label_key(cls) -> str:
227
+ def get_label_key(cls, accelerator: Optional[str] = None) -> str:
188
228
  return cls.LABEL_KEY
189
229
 
230
+ @classmethod
231
+ def get_label_keys(cls) -> List[str]:
232
+ return [cls.LABEL_KEY]
233
+
190
234
  @classmethod
191
235
  def get_label_value(cls, accelerator: str) -> str:
192
236
  return accelerator.upper()
193
237
 
238
+ @classmethod
239
+ def match_label_key(cls, label_key: str) -> bool:
240
+ return label_key == cls.LABEL_KEY
241
+
194
242
  @classmethod
195
243
  def get_accelerator_from_label_value(cls, value: str) -> str:
196
244
  return value
@@ -203,11 +251,28 @@ class GKELabelFormatter(GPULabelFormatter):
203
251
  label, which is used to identify the GPU type.
204
252
  """
205
253
 
206
- LABEL_KEY = 'cloud.google.com/gke-accelerator'
254
+ GPU_LABEL_KEY = 'cloud.google.com/gke-accelerator'
255
+ TPU_LABEL_KEY = 'cloud.google.com/gke-tpu-accelerator'
256
+ ACCELERATOR_COUNT_LABEL_KEY = 'cloud.google.com/gke-accelerator-count'
257
+ TPU_TOPOLOGY_LABEL_KEY = 'cloud.google.com/gke-tpu-topology'
207
258
 
208
259
  @classmethod
209
- def get_label_key(cls) -> str:
210
- return cls.LABEL_KEY
260
+ def get_label_key(cls, accelerator: Optional[str] = None) -> str:
261
+ if accelerator is not None and accelerator.startswith('tpu-'):
262
+ return cls.TPU_LABEL_KEY
263
+ return cls.GPU_LABEL_KEY
264
+
265
+ @classmethod
266
+ def get_label_keys(cls) -> List[str]:
267
+ return [cls.GPU_LABEL_KEY, cls.TPU_LABEL_KEY]
268
+
269
+ @classmethod
270
+ def match_label_key(cls, label_key: str) -> bool:
271
+ return label_key in cls.get_label_keys()
272
+
273
+ @classmethod
274
+ def get_tpu_topology_label_key(cls) -> str:
275
+ return cls.TPU_TOPOLOGY_LABEL_KEY
211
276
 
212
277
  @classmethod
213
278
  def get_label_value(cls, accelerator: str) -> str:
@@ -225,6 +290,8 @@ class GKELabelFormatter(GPULabelFormatter):
225
290
  # to distinguish between a3-high and a3-mega instances
226
291
  return 'H100'
227
292
  return acc
293
+ elif is_tpu_on_gke(value):
294
+ return value
228
295
  else:
229
296
  raise ValueError(
230
297
  f'Invalid accelerator name in GKE cluster: {value}')
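A minimal sketch of how the new GKE key selection behaves, based only on the constants and branches shown above:

    GKELabelFormatter.get_label_key('A100')
    # -> 'cloud.google.com/gke-accelerator'
    GKELabelFormatter.get_label_key('tpu-v5-lite-podslice')
    # -> 'cloud.google.com/gke-tpu-accelerator'
    GKELabelFormatter.get_label_keys()
    # -> ['cloud.google.com/gke-accelerator', 'cloud.google.com/gke-tpu-accelerator']
    GKELabelFormatter.match_label_key('cloud.google.com/gke-tpu-accelerator')
    # -> True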
@@ -248,9 +315,13 @@ class GFDLabelFormatter(GPULabelFormatter):
248
315
  LABEL_KEY = 'nvidia.com/gpu.product'
249
316
 
250
317
  @classmethod
251
- def get_label_key(cls) -> str:
318
+ def get_label_key(cls, accelerator: Optional[str] = None) -> str:
252
319
  return cls.LABEL_KEY
253
320
 
321
+ @classmethod
322
+ def get_label_keys(cls) -> List[str]:
323
+ return [cls.LABEL_KEY]
324
+
254
325
  @classmethod
255
326
  def get_label_value(cls, accelerator: str) -> str:
256
327
  """An accelerator can map to many Nvidia GFD labels
@@ -258,6 +329,10 @@ class GFDLabelFormatter(GPULabelFormatter):
258
329
  As a result, we do not support get_label_value for GFDLabelFormatter."""
259
330
  raise NotImplementedError
260
331
 
332
+ @classmethod
333
+ def match_label_key(cls, label_key: str) -> bool:
334
+ return label_key == cls.LABEL_KEY
335
+
261
336
  @classmethod
262
337
  def get_accelerator_from_label_value(cls, value: str) -> str:
263
338
  """Searches against a canonical list of NVIDIA GPUs and pattern
@@ -335,10 +410,9 @@ def detect_gpu_label_formatter(
335
410
 
336
411
  # Check if the node labels contain any of the GPU label prefixes
337
412
  for lf in LABEL_FORMATTER_REGISTRY:
338
- label_key = lf.get_label_key()
339
413
  for _, label_list in node_labels.items():
340
414
  for label, _ in label_list:
341
- if label.startswith(label_key):
415
+ if lf.match_label_key(label):
342
416
  label_formatter = lf()
343
417
  return label_formatter, node_labels
344
418
 
@@ -346,24 +420,28 @@ def detect_gpu_label_formatter(
346
420
 
347
421
 
348
422
  @functools.lru_cache(maxsize=10)
349
- def detect_gpu_resource(context: Optional[str]) -> Tuple[bool, Set[str]]:
350
- """Checks if the Kubernetes cluster has nvidia.com/gpu resource.
423
+ def detect_accelerator_resource(
424
+ context: Optional[str]) -> Tuple[bool, Set[str]]:
425
+ """Checks if the Kubernetes cluster has GPU/TPU resource.
351
426
 
352
- If nvidia.com/gpu resource is missing, that typically means that the
353
- Kubernetes cluster does not have GPUs or the nvidia GPU operator and/or
354
- device drivers are not installed.
427
+ Two types of accelerator resources are checked: nvidia.com/gpu and
428
+ google.com/tpu. If the nvidia.com/gpu resource is
429
+ missing, that typically means that the Kubernetes cluster does not have
430
+ GPUs or the nvidia GPU operator and/or device drivers are not installed.
355
431
 
356
432
  Returns:
357
- bool: True if the cluster has nvidia.com/gpu resource, False otherwise.
433
+ bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY
434
+ resource, False otherwise.
358
435
  """
359
436
  # Get the set of resources across all nodes
360
437
  cluster_resources: Set[str] = set()
361
438
  nodes = get_kubernetes_nodes(context)
362
439
  for node in nodes:
363
440
  cluster_resources.update(node.status.allocatable.keys())
364
- has_gpu = 'nvidia.com/gpu' in cluster_resources
441
+ has_accelerator = (GPU_RESOURCE_KEY in cluster_resources or
442
+ TPU_RESOURCE_KEY in cluster_resources)
365
443
 
366
- return has_gpu, cluster_resources
444
+ return has_accelerator, cluster_resources
367
445
 
368
446
 
369
447
  @functools.lru_cache(maxsize=10)
@@ -451,16 +529,52 @@ def check_instance_fits(context: Optional[str],
451
529
  'Maximum resources found on a single node: '
452
530
  f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
453
531
 
532
+ def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
533
+ node_list: List[Any]) -> Tuple[bool, Optional[str]]:
534
+ """Checks if the instance fits on the cluster based on requested TPU.
535
+
536
+ It checks if the TPU type and count on each node match the required
537
+ number of TPU chips for the instance. In the case of multi-host TPU
538
+ podslice, the function ensures that the number of TPU chips on a single
539
+ node (node_tpu_chip_count) and the total TPU chips across the entire
540
+ podslice (topology_chip_count) are correctly handled.
541
+ """
542
+ acc_type = candidate_instance_type.accelerator_type
543
+ acc_count = candidate_instance_type.accelerator_count
544
+ tpu_list_in_cluster = []
545
+ for node in node_list:
546
+ if acc_type == node.metadata.labels[
547
+ GKELabelFormatter.TPU_LABEL_KEY]:
548
+ # TODO(Doyoung): Update the logic when adding support for
549
+ # multi-host TPUs.
550
+ if is_multi_host_tpu(node.metadata.labels):
551
+ continue
552
+ node_tpu_chip_count = int(node.metadata.labels[
553
+ GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY])
554
+ tpu_type = f'{acc_type}:{node_tpu_chip_count}'
555
+ tpu_list_in_cluster.append(tpu_type)
556
+ if node_tpu_chip_count == acc_count:
557
+ return True, None
558
+ tpu_list_in_cluster_str = ','.join(tpu_list_in_cluster)
559
+ # TODO(Doyoung): Update the error message raised with the multi-host
560
+ # TPU support.
561
+ return False, ('Requested TPU type was not found in the cluster. TPU '
562
+ 'types found in the cluster: '
563
+ f'{tpu_list_in_cluster_str}. Note that multi-host TPU '
564
+ 'podslices are currently not supported.')
565
+
454
566
  nodes = get_kubernetes_nodes(context)
455
567
  k8s_instance_type = KubernetesInstanceType.\
456
568
  from_instance_type(instance)
457
569
  acc_type = k8s_instance_type.accelerator_type
570
+ acc_count = k8s_instance_type.accelerator_count
458
571
  if acc_type is not None:
459
- # If GPUs are requested, check if GPU type is available, and if so,
460
- # check if CPU and memory requirements on the specific node are met.
572
+ # If GPU/TPUs are requested, check if GPU/TPU type is available, and
573
+ # if so, check if CPU and memory requirements on the specific node are
574
+ # met.
461
575
  try:
462
- gpu_label_key, gpu_label_val = get_gpu_label_key_value(
463
- context, acc_type)
576
+ gpu_label_key, gpu_label_val, _, _ = (
577
+ get_accelerator_label_key_value(context, acc_type, acc_count))
464
578
  except exceptions.ResourcesUnavailableError as e:
465
579
  # If GPU not found, return empty list and error message.
466
580
  return False, str(e)
@@ -470,6 +584,13 @@ def check_instance_fits(context: Optional[str],
470
584
  node.metadata.labels[gpu_label_key] == gpu_label_val
471
585
  ]
472
586
  assert len(gpu_nodes) > 0, 'GPU nodes not found'
587
+ if is_tpu_on_gke(acc_type):
588
+ # If requested accelerator is a TPU type, check if the cluster
589
+ # has sufficient TPU resource to meet the requirement.
590
+ fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes)
591
+ if reason is not None:
592
+ return fits, reason
593
+
473
594
  candidate_nodes = gpu_nodes
474
595
  not_fit_reason_prefix = (
475
596
  f'GPU nodes with {acc_type} do not have '
@@ -481,7 +602,7 @@ def check_instance_fits(context: Optional[str],
481
602
  f'CPU (> {k8s_instance_type.cpus} CPUs) '
482
603
  'and/or memory '
483
604
  f'(> {k8s_instance_type.memory} G). ')
484
- # Check if CPU and memory requirements are met on at least one
605
+ # Check if CPU and memory requirements are met on at least one
485
606
  # candidate node.
486
607
  fits, reason = check_cpu_mem_fits(k8s_instance_type, candidate_nodes)
487
608
  if not fits:
@@ -492,25 +613,33 @@ def check_instance_fits(context: Optional[str],
492
613
  return fits, reason
493
614
 
494
615
 
495
- def get_gpu_label_key_value(context: Optional[str],
496
- acc_type: str,
497
- check_mode=False) -> Tuple[str, str]:
498
- """Returns the label key and value for the given GPU type.
616
+ def get_accelerator_label_key_value(
617
+ context: Optional[str],
618
+ acc_type: str,
619
+ acc_count: Optional[int],
620
+ check_mode=False
621
+ ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
622
+ """Returns the label key and value for the given GPU/TPU type.
499
623
 
500
624
  Args:
501
- acc_type: The GPU type required by the task.
502
- check_mode: If True, only checks if the cluster has GPU resources and
503
- labels are setup on the cluster. acc_type is ignore does not return
504
- the label key and value. Useful for checking if GPUs are configured
505
- correctly on the cluster without explicitly requesting a acc_type.
625
+ acc_type: The GPU/TPU type required by the task.
626
+ acc_count: Number of GPU/TPUs required by the task.
627
+ check_mode: If True, only checks if the cluster has GPU/TPU resources
628
+ and labels are set up on the cluster. acc_type is ignored and does not
629
+ return the label key and value. Useful for checking if GPUs are
630
+ configured correctly on the cluster without explicitly requesting
631
+ an acc_type.
506
632
  Returns:
507
- A tuple of the label key and value. Returns empty strings if check_mode
508
- is True.
633
+ A tuple of the accelerator label key, value, topology label key, and
634
+ topology value. The topology label key and value are populated only if
635
+ the requested accelerator type is TPU. Returns None if check_mode is
636
+ True.
509
637
  Raises:
510
638
  ResourcesUnavailableError: Can be raised from the following conditions:
511
- - The cluster does not have GPU resources (nvidia.com/gpu)
512
- - The cluster does not have GPU labels setup correctly
513
- - The cluster doesn't have any nodes with acc_type GPU
639
+ - The cluster does not have GPU/TPU resources
640
+ (nvidia.com/gpu, google.com/tpu)
641
+ - The cluster does not have GPU/TPU labels setup correctly
642
+ - The cluster doesn't have any nodes with acc_type GPU/TPU
514
643
  """
515
644
  # Check if the cluster has GPU resources
516
645
  # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
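To illustrate the new return shape documented above, a hedged sketch of how a caller might consume the 4-tuple when building node selectors (the variable names here are hypothetical, not taken from the codebase):

    context = None  # or a kubeconfig context name
    label_key, label_value, topology_key, topology_value = (
        get_accelerator_label_key_value(context, 'tpu-v5-lite-podslice', 4))
    node_selector = {label_key: label_value}
    if topology_key is not None:
        # Only populated for TPU requests on GKE.
        node_selector[topology_key] = topology_value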
@@ -529,13 +658,14 @@ def get_gpu_label_key_value(context: Optional[str],
529
658
  # If check mode is enabled and autoscaler is set, we can return
530
659
  # early since we assume the cluster autoscaler will handle GPU
531
660
  # node provisioning.
532
- return '', ''
661
+ return None, None, None, None
533
662
  formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type)
534
663
  assert formatter is not None, ('Unsupported autoscaler type:'
535
664
  f' {autoscaler_type}')
536
- return formatter.get_label_key(), formatter.get_label_value(acc_type)
665
+ return formatter.get_label_key(acc_type), formatter.get_label_value(
666
+ acc_type), None, None
537
667
 
538
- has_gpus, cluster_resources = detect_gpu_resource(context)
668
+ has_gpus, cluster_resources = detect_accelerator_resource(context)
539
669
  if has_gpus:
540
670
  # Check if the cluster has GPU labels setup correctly
541
671
  label_formatter, node_labels = \
@@ -544,8 +674,10 @@ def get_gpu_label_key_value(context: Optional[str],
544
674
  # If none of the GPU labels from LABEL_FORMATTER_REGISTRY are
545
675
  # detected, raise error
546
676
  with ux_utils.print_exception_no_traceback():
547
- supported_formats = ', '.join(
548
- [f.get_label_key() for f in LABEL_FORMATTER_REGISTRY])
677
+ supported_formats = ', '.join([
678
+ key for f in LABEL_FORMATTER_REGISTRY
679
+ for key in f.get_label_keys()
680
+ ])
549
681
  suffix = ''
550
682
  if env_options.Options.SHOW_DEBUG_INFO.get():
551
683
  suffix = f' Found node labels: {node_labels}'
@@ -561,7 +693,7 @@ def get_gpu_label_key_value(context: Optional[str],
561
693
  # correctly setup and will behave as expected.
562
694
  for node_name, label_list in node_labels.items():
563
695
  for label, value in label_list:
564
- if label == label_formatter.get_label_key():
696
+ if label_formatter.match_label_key(label):
565
697
  is_valid, reason = label_formatter.validate_label_value(
566
698
  value)
567
699
  if not is_valid:
@@ -571,8 +703,7 @@ def get_gpu_label_key_value(context: Optional[str],
571
703
  if check_mode:
572
704
  # If check mode is enabled and we reached so far, we can
573
705
  # conclude that the cluster is setup correctly and return.
574
- return '', ''
575
- k8s_acc_label_key = label_formatter.get_label_key()
706
+ return None, None, None, None
576
707
  # Search in node_labels to see if any node has the requested
577
708
  # GPU type.
578
709
  # Note - this only checks if the label is available on a
@@ -580,11 +711,38 @@ def get_gpu_label_key_value(context: Optional[str],
580
711
  # quantity is available since that is dynamic and can change
581
712
  # during scheduling.
582
713
  for node_name, label_list in node_labels.items():
714
+ node_metadata_labels = dict(label_list)
715
+ # TODO(Doyoung): Update the logic when adding support for
716
+ # multi-host TPUs.
717
+ if is_multi_host_tpu(node_metadata_labels):
718
+ continue
583
719
  for label, value in label_list:
584
- if (label == k8s_acc_label_key and
720
+ if (label_formatter.match_label_key(label) and
585
721
  label_formatter.get_accelerator_from_label_value(
586
722
  value) == acc_type):
587
- return label, value
723
+ if is_tpu_on_gke(acc_type):
724
+ assert isinstance(label_formatter,
725
+ GKELabelFormatter)
726
+ if node_metadata_labels.get(
727
+ label_formatter.TPU_LABEL_KEY) == acc_type:
728
+ topology_label_key = (
729
+ label_formatter.TPU_TOPOLOGY_LABEL_KEY)
730
+ topology_value = node_metadata_labels.get(
731
+ topology_label_key)
732
+ assert topology_value is not None
733
+ tpu_topology_chip_count = reduce_tpu_topology(
734
+ topology_value)
735
+ # For single-host TPUs, there aren't multiple
736
+ # different topologies that map to an identical
737
+ # number of TPU chips.
738
+ if tpu_topology_chip_count == acc_count:
739
+ return (label, value, topology_label_key,
740
+ topology_value)
741
+ else:
742
+ continue
743
+ else:
744
+ return label, value, None, None
745
+
588
746
  # If no node is found with the requested acc_type, raise error
589
747
  with ux_utils.print_exception_no_traceback():
590
748
  suffix = ''
@@ -592,15 +750,19 @@ def get_gpu_label_key_value(context: Optional[str],
592
750
  all_labels = []
593
751
  for node_name, label_list in node_labels.items():
594
752
  all_labels.extend(label_list)
595
- gpus_available = set(
596
- v for k, v in all_labels if k == k8s_acc_label_key)
597
- suffix = f' Available GPUs on the cluster: {gpus_available}'
753
+ acc_available = set(v for k, v in all_labels
754
+ if label_formatter.match_label_key(k))
755
+ suffix = (' Available GPUs/TPUs on the cluster: '
756
+ f'{acc_available}')
757
+ # TODO(Doyoung): Update the error message raised with the
758
+ # multi-host TPU support.
598
759
  raise exceptions.ResourcesUnavailableError(
599
760
  'Could not find any node in the Kubernetes cluster '
600
- f'with {acc_type} GPU. Please ensure at least '
601
- f'one node in the cluster has {acc_type} GPU and node '
602
- 'labels are setup correctly. '
603
- f'Please refer to the documentation for more. {suffix}')
761
+ f'with {acc_type}. Please ensure at least one node in the '
762
+ f'cluster has {acc_type} and node labels are setup '
763
+ 'correctly. Please refer to the documentation for more. '
764
+ f'{suffix}. Note that multi-host TPU podslices are '
765
+ 'currently not supported.')
604
766
  else:
605
767
  # If GPU resources are not detected, raise error
606
768
  with ux_utils.print_exception_no_traceback():
@@ -609,13 +771,14 @@ def get_gpu_label_key_value(context: Optional[str],
609
771
  suffix = (' Available resources on the cluster: '
610
772
  f'{cluster_resources}')
611
773
  raise exceptions.ResourcesUnavailableError(
612
- 'Could not detect GPU resources (`nvidia.com/gpu`) in '
613
- 'Kubernetes cluster. If this cluster contains GPUs, please '
614
- 'ensure GPU drivers are installed on the node. Check if the '
615
- 'GPUs are setup correctly by running `kubectl describe nodes` '
616
- 'and looking for the nvidia.com/gpu resource. '
617
- 'Please refer to the documentation on how '
618
- f'to set up GPUs.{suffix}')
774
+ f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
775
+ f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
776
+ ' contains GPUs, please ensure GPU drivers are installed on '
777
+ 'the node. Check if the GPUs are setup correctly by running '
778
+ '`kubectl describe nodes` and looking for the '
779
+ f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
780
+ 'Please refer to the documentation on how to set up GPUs.'
781
+ f'{suffix}')
619
782
 
620
783
 
621
784
  def get_head_ssh_port(cluster_name: str, namespace: str,
@@ -710,7 +873,10 @@ def check_credentials(context: Optional[str],
710
873
  # provider if their cluster GPUs are not setup correctly.
711
874
  gpu_msg = ''
712
875
  try:
713
- _, _ = get_gpu_label_key_value(context, acc_type='', check_mode=True)
876
+ get_accelerator_label_key_value(context,
877
+ acc_type='',
878
+ acc_count=0,
879
+ check_mode=True)
714
880
  except exceptions.ResourcesUnavailableError as e:
715
881
  # If GPUs are not available, we return cluster as enabled (since it can
716
882
  # be a CPU-only cluster) but we also return the exception message which
@@ -1787,7 +1953,7 @@ def dict_to_k8s_object(object_dict: Dict[str, Any], object_type: 'str') -> Any:
1787
1953
  class KubernetesNodeInfo:
1788
1954
  """Dataclass to store Kubernetes node information."""
1789
1955
  name: str
1790
- gpu_type: Optional[str]
1956
+ accelerator_type: Optional[str]
1791
1957
  # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
1792
1958
  total: Dict[str, int]
1793
1959
  free: Dict[str, int]
@@ -1818,47 +1984,54 @@ def get_kubernetes_node_info(
1818
1984
  else:
1819
1985
  raise
1820
1986
 
1821
- label_formatter, _ = detect_gpu_label_formatter(context)
1822
- if not label_formatter:
1987
+ lf, _ = detect_gpu_label_formatter(context)
1988
+ if not lf:
1823
1989
  label_key = None
1824
1990
  else:
1825
- label_key = label_formatter.get_label_key()
1991
+ label_keys = lf.get_label_keys()
1826
1992
 
1827
1993
  node_info_dict: Dict[str, KubernetesNodeInfo] = {}
1828
1994
 
1829
- for node in nodes:
1830
- allocated_qty = 0
1831
- if label_formatter is not None and label_key in node.metadata.labels:
1832
- accelerator_name = label_formatter.get_accelerator_from_label_value(
1833
- node.metadata.labels.get(label_key))
1834
- else:
1835
- accelerator_name = None
1995
+ for label_key in label_keys:
1996
+ for node in nodes:
1997
+ allocated_qty = 0
1998
+ if lf is not None and label_key in node.metadata.labels:
1999
+ accelerator_name = lf.get_accelerator_from_label_value(
2000
+ node.metadata.labels.get(label_key))
2001
+ else:
2002
+ accelerator_name = None
1836
2003
 
1837
- accelerator_count = int(node.status.allocatable.get(
1838
- 'nvidia.com/gpu', 0))
2004
+ accelerator_count = get_node_accelerator_count(
2005
+ node.status.allocatable)
1839
2006
 
1840
- if pods is None:
1841
- accelerators_available = -1
2007
+ if pods is None:
2008
+ accelerators_available = -1
1842
2009
 
1843
- else:
1844
- for pod in pods:
1845
- # Get all the pods running on the node
1846
- if (pod.spec.node_name == node.metadata.name and
1847
- pod.status.phase in ['Running', 'Pending']):
1848
- # Iterate over all the containers in the pod and sum the
1849
- # GPU requests
1850
- for container in pod.spec.containers:
1851
- if container.resources.requests:
1852
- allocated_qty += int(
1853
- container.resources.requests.get(
1854
- 'nvidia.com/gpu', 0))
1855
- accelerators_available = accelerator_count - allocated_qty
1856
-
1857
- node_info_dict[node.metadata.name] = KubernetesNodeInfo(
1858
- name=node.metadata.name,
1859
- gpu_type=accelerator_name,
1860
- total={'nvidia.com/gpu': int(accelerator_count)},
1861
- free={'nvidia.com/gpu': int(accelerators_available)})
2010
+ else:
2011
+ for pod in pods:
2012
+ # Get all the pods running on the node
2013
+ if (pod.spec.node_name == node.metadata.name and
2014
+ pod.status.phase in ['Running', 'Pending']):
2015
+ # Iterate over all the containers in the pod and sum the
2016
+ # GPU requests
2017
+ for container in pod.spec.containers:
2018
+ if container.resources.requests:
2019
+ allocated_qty += get_node_accelerator_count(
2020
+ container.resources.requests)
2021
+
2022
+ accelerators_available = accelerator_count - allocated_qty
2023
+
2024
+ # Exclude multi-host TPUs from being processed.
2025
+ # TODO(Doyoung): Remove the logic when adding support for
2026
+ # multi-host TPUs.
2027
+ if is_multi_host_tpu(node.metadata.labels):
2028
+ continue
2029
+
2030
+ node_info_dict[node.metadata.name] = KubernetesNodeInfo(
2031
+ name=node.metadata.name,
2032
+ accelerator_type=accelerator_name,
2033
+ total={'accelerator_count': int(accelerator_count)},
2034
+ free={'accelerators_available': int(accelerators_available)})
1862
2035
 
1863
2036
  return node_info_dict
1864
2037
 
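For reference, an illustrative entry in the dict returned by get_kubernetes_node_info after this change (the node name and counts are hypothetical):

    node_info_dict['gke-tpu-node-0'] = KubernetesNodeInfo(
        name='gke-tpu-node-0',
        accelerator_type='tpu-v5-lite-podslice',
        total={'accelerator_count': 4},
        free={'accelerators_available': 2})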
@@ -2040,6 +2213,80 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
2040
2213
  return pods
2041
2214
 
2042
2215
 
2216
+ def is_tpu_on_gke(accelerator: str) -> bool:
2217
+ """Determins if the given accelerator is a TPU supported on GKE."""
2218
+ return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
2219
+
2220
+
2221
+ def get_node_accelerator_count(attribute_dict: dict) -> int:
2222
+ """Retrieves the count of accelerators from a node's resource dictionary.
2223
+
2224
+ This function checks the node's allocatable resources or the accelerators
2225
+ already deployed on the node, using pod objects that describe resource
2226
+ requests.
2227
+
2228
+ Args:
2229
+ attribute_dict: Dict containing resource information from a node, such as
2230
+ allocatable or requested resources.
2231
+
2232
+ Returns:
2233
+ Number of accelerators allocated or available from the node. If no
2234
+ resource is found, it returns 0.
2235
+ """
2236
+ assert not (GPU_RESOURCE_KEY in attribute_dict and
2237
+ TPU_RESOURCE_KEY in attribute_dict)
2238
+ if GPU_RESOURCE_KEY in attribute_dict:
2239
+ return int(attribute_dict[GPU_RESOURCE_KEY])
2240
+ elif TPU_RESOURCE_KEY in attribute_dict:
2241
+ return int(attribute_dict[TPU_RESOURCE_KEY])
2242
+ return 0
2243
+
2244
+
2245
+ def reduce_tpu_topology(topology: str) -> int:
2246
+ """Computes the number of TPU chips from its topology string."""
2247
+ chip_dimensions = [int(chip_count) for chip_count in topology.split('x')]
2248
+ # tpu_topology_chip_count represents the total number of TPU chips in the
2249
+ # entire podslice, whether it is a single-host or multi-host TPU podslice.
2250
+ tpu_topology_chip_count = functools.reduce(lambda x, y: x * y,
2251
+ chip_dimensions)
2252
+ return tpu_topology_chip_count
2253
+
2254
+
2255
+ def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
2256
+ """Determines whether the given node is a multi-host TPU configuration."""
2257
+ if GKELabelFormatter.TPU_LABEL_KEY in node_metadata_labels:
2258
+ assert GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY in node_metadata_labels
2259
+ topology_value = (
2260
+ node_metadata_labels[GKELabelFormatter.TPU_TOPOLOGY_LABEL_KEY])
2261
+ accelerator_count_label_key = (
2262
+ GKELabelFormatter.ACCELERATOR_COUNT_LABEL_KEY)
2263
+ assert accelerator_count_label_key in node_metadata_labels
2264
+ # node_tpu_chip_count represents the number of TPU chips
2265
+ # available in this node. If the node is part of a node pool
2266
+ # forming a multi-host TPU podslice, it only reflects the
2267
+ # number of TPU chips in this individual node, not the entire
2268
+ # multi-host TPU podslice.
2269
+ node_tpu_chip_count = int(
2270
+ node_metadata_labels[accelerator_count_label_key])
2271
+ topology_chip_count = reduce_tpu_topology(topology_value)
2272
+ # For multi-host TPU podslices, topology_chip_count and
2273
+ # node_tpu_chip_count will differ, as topology_chip_count
2274
+ # reflects the total across all hosts, while
2275
+ # node_tpu_chip_count reflects only the chips in a single node.
2276
+ if node_tpu_chip_count != topology_chip_count:
2277
+ return True
2278
+ return False
2279
+
2280
+
2281
+ def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
2282
+ """Checks if there exists a multi-host TPU within the cluster."""
2283
+ nodes = get_kubernetes_nodes(context)
2284
+ for node in nodes:
2285
+ if is_multi_host_tpu(node.metadata.labels):
2286
+ return True
2287
+ return False
2288
+
2289
+
2043
2290
  @dataclasses.dataclass
2044
2291
  class KubernetesSkyPilotClusterInfo:
2045
2292
  cluster_name_on_cloud: str