skypilot-nightly 1.0.0.dev20241111__py3-none-any.whl → 1.0.0.dev20241113__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '91323d86baaeb1341c6953e15bbf19f2896b67ad'
8
+ _SKYPILOT_COMMIT_SHA = 'eea13cc624a10bd4319eace0f48dcceb9d0287cd'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241111'
38
+ __version__ = '1.0.0.dev20241113'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -1418,6 +1418,7 @@ def check_network_connection():
1418
1418
  'Network seems down.') from e
1419
1419
 
1420
1420
 
1421
+ @timeline.event
1421
1422
  def check_owner_identity(cluster_name: str) -> None:
1422
1423
  """Check if current user is the same as the user who created the cluster.
1423
1424
 
sky/cli.py CHANGED
@@ -3102,6 +3102,7 @@ def show_gpus(
3102
3102
  kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
3103
3103
  kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
3104
3104
  sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
3105
+ no_permissions_str = '<no permissions>'
3105
3106
 
3106
3107
  def _list_to_str(lst):
3107
3108
  return ', '.join([str(e) for e in lst])
@@ -3142,13 +3143,16 @@ def show_gpus(
3142
3143
  'in Kubernetes cluster. ')
3143
3144
  debug_msg = ('To show available accelerators on kubernetes,'
3144
3145
  ' run: sky show-gpus --cloud kubernetes ')
3145
- full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE +
3146
+ full_err_msg = (err_msg +
3147
+ kubernetes_utils.NO_ACCELERATOR_HELP_MESSAGE +
3146
3148
  debug_msg)
3147
3149
  raise ValueError(full_err_msg)
3148
3150
  for gpu, _ in sorted(counts.items()):
3151
+ available_qty = available[gpu] if available[gpu] != -1 else (
3152
+ no_permissions_str)
3149
3153
  realtime_gpu_table.add_row([
3150
3154
  gpu,
3151
- _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu]
3155
+ _list_to_str(counts.pop(gpu)), capacity[gpu], available_qty
3152
3156
  ])
3153
3157
  return realtime_gpu_table
3154
3158
 
@@ -3158,10 +3162,12 @@ def show_gpus(
3158
3162
 
3159
3163
  node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
3160
3164
  for node_name, node_info in node_info_dict.items():
3165
+ available = node_info.free[
3166
+ 'accelerators_available'] if node_info.free[
3167
+ 'accelerators_available'] != -1 else no_permissions_str
3161
3168
  node_table.add_row([
3162
- node_name, node_info.gpu_type,
3163
- node_info.total['nvidia.com/gpu'],
3164
- node_info.free['nvidia.com/gpu']
3169
+ node_name, node_info.accelerator_type,
3170
+ node_info.total['accelerator_count'], available
3165
3171
  ])
3166
3172
  return node_table
3167
3173
 
@@ -3216,8 +3222,18 @@ def show_gpus(
3216
3222
  yield from k8s_realtime_table.get_string()
3217
3223
  k8s_node_table = _get_kubernetes_node_info_table(context)
3218
3224
  yield '\n\n'
3225
+ # TODO(Doyoung): Update the message with the multi-host TPU
3226
+ # support.
3227
+ k8s_per_node_acc_message = (
3228
+ 'Kubernetes per node accelerator availability ')
3229
+ if kubernetes_utils.multi_host_tpu_exists_in_cluster(
3230
+ context):
3231
+ k8s_per_node_acc_message += (
3232
+ '(Note: Multi-host TPUs are detected and excluded '
3233
+ 'from the display as multi-host TPUs are not '
3234
+ 'supported.)')
3219
3235
  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3220
- f'Kubernetes per node GPU availability'
3236
+ f'{k8s_per_node_acc_message}'
3221
3237
  f'{colorama.Style.RESET_ALL}\n')
3222
3238
  yield from k8s_node_table.get_string()
3223
3239
  if kubernetes_autoscaling:
sky/clouds/cloud.py CHANGED
@@ -18,6 +18,7 @@ from sky import skypilot_config
18
18
  from sky.clouds import service_catalog
19
19
  from sky.utils import log_utils
20
20
  from sky.utils import resources_utils
21
+ from sky.utils import timeline
21
22
  from sky.utils import ux_utils
22
23
 
23
24
  if typing.TYPE_CHECKING:
@@ -366,6 +367,7 @@ class Cloud:
366
367
  del label_key, label_value
367
368
  return True, None
368
369
 
370
+ @timeline.event
369
371
  def get_feasible_launchable_resources(
370
372
  self,
371
373
  resources: 'resources_lib.Resources',
sky/clouds/kubernetes.py CHANGED
@@ -362,11 +362,23 @@ class Kubernetes(clouds.Cloud):
362
362
 
363
363
  k8s_acc_label_key = None
364
364
  k8s_acc_label_value = None
365
+ k8s_topology_label_key = None
366
+ k8s_topology_label_value = None
367
+ k8s_resource_key = None
368
+ tpu_requested = False
365
369
 
366
- # If GPUs are requested, set node label to match the GPU type.
370
+ # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
367
371
  if acc_count > 0 and acc_type is not None:
368
- k8s_acc_label_key, k8s_acc_label_value = \
369
- kubernetes_utils.get_gpu_label_key_value(context, acc_type)
372
+ (k8s_acc_label_key, k8s_acc_label_value, k8s_topology_label_key,
373
+ k8s_topology_label_value) = (
374
+ kubernetes_utils.get_accelerator_label_key_value(
375
+ context, acc_type, acc_count))
376
+ if (k8s_acc_label_key ==
377
+ kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
378
+ tpu_requested = True
379
+ k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
380
+ else:
381
+ k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY
370
382
 
371
383
  port_mode = network_utils.get_port_mode(None)
372
384
 
@@ -428,6 +440,10 @@ class Kubernetes(clouds.Cloud):
428
440
  'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
429
441
  'k8s_spot_label_key': spot_label_key,
430
442
  'k8s_spot_label_value': spot_label_value,
443
+ 'tpu_requested': tpu_requested,
444
+ 'k8s_topology_label_key': k8s_topology_label_key,
445
+ 'k8s_topology_label_value': k8s_topology_label_value,
446
+ 'k8s_resource_key': k8s_resource_key,
431
447
  'image_id': image_id,
432
448
  }
433
449
 
@@ -10,6 +10,7 @@ from typing import Dict, List, Optional, Set, Tuple
10
10
  from sky import check as sky_check
11
11
  from sky import sky_logging
12
12
  from sky.adaptors import common as adaptors_common
13
+ from sky.adaptors import kubernetes
13
14
  from sky.clouds import Kubernetes
14
15
  from sky.clouds.service_catalog import CloudFilter
15
16
  from sky.clouds.service_catalog import common
@@ -22,6 +23,8 @@ if typing.TYPE_CHECKING:
22
23
  else:
23
24
  pd = adaptors_common.LazyImport('pandas')
24
25
 
26
+ logger = sky_logging.init_logger(__name__)
27
+
25
28
  _PULL_FREQUENCY_HOURS = 7
26
29
 
27
30
  # We keep pull_frequency_hours so we can remotely update the default image paths
@@ -77,6 +80,11 @@ def list_accelerators_realtime(
77
80
  require_price: bool = True
78
81
  ) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
79
82
  int]]:
83
+ """List accelerators in the Kubernetes cluster.
84
+
85
+ If the user does not have sufficient permissions to list pods in all
86
+ namespaces, the function will return free GPUs as -1.
87
+ """
80
88
  # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
81
89
  # function from kubernetes_utils.
82
90
  del all_regions, require_price # Unused.
@@ -96,19 +104,29 @@ def list_accelerators_realtime(
96
104
  ) or not kubernetes_utils.check_credentials(context)[0]:
97
105
  return {}, {}, {}
98
106
 
99
- has_gpu = kubernetes_utils.detect_gpu_resource(context)
107
+ has_gpu = kubernetes_utils.detect_accelerator_resource(context)
100
108
  if not has_gpu:
101
109
  return {}, {}, {}
102
110
 
103
- label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter(context)
104
- if not label_formatter:
111
+ lf, _ = kubernetes_utils.detect_gpu_label_formatter(context)
112
+ if not lf:
105
113
  return {}, {}, {}
106
114
 
107
115
  accelerators_qtys: Set[Tuple[str, int]] = set()
108
- key = label_formatter.get_label_key()
116
+ keys = lf.get_label_keys()
109
117
  nodes = kubernetes_utils.get_kubernetes_nodes(context)
110
118
  # Get the pods to get the real-time GPU usage
111
- pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
119
+ try:
120
+ pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
121
+ except kubernetes.api_exception() as e:
122
+ if e.status == 403:
123
+ logger.warning('Failed to get pods in the Kubernetes cluster '
124
+ '(forbidden). Please check if your account has '
125
+ 'necessary permissions to list pods. Realtime GPU '
126
+ 'availability information may be incorrect.')
127
+ pods = None
128
+ else:
129
+ raise
112
130
  # Total number of GPUs in the cluster
113
131
  total_accelerators_capacity: Dict[str, int] = {}
114
132
  # Total number of GPUs currently available in the cluster
@@ -116,62 +134,85 @@ def list_accelerators_realtime(
116
134
  min_quantity_filter = quantity_filter if quantity_filter else 1
117
135
 
118
136
  for node in nodes:
119
- if key in node.metadata.labels:
120
- allocated_qty = 0
121
- accelerator_name = label_formatter.get_accelerator_from_label_value(
122
- node.metadata.labels.get(key))
123
-
124
- # Check if name_filter regex matches the accelerator_name
125
- regex_flags = 0 if case_sensitive else re.IGNORECASE
126
- if name_filter and not re.match(
127
- name_filter, accelerator_name, flags=regex_flags):
128
- continue
129
-
130
- accelerator_count = int(
131
- node.status.allocatable.get('nvidia.com/gpu', 0))
132
-
133
- # Generate the GPU quantities for the accelerators
134
- if accelerator_name and accelerator_count > 0:
135
- count = 1
136
- while count <= accelerator_count:
137
- accelerators_qtys.add((accelerator_name, count))
138
- count *= 2
139
- # Add the accelerator count if it's not already in the set
140
- # (e.g., if there's 12 GPUs, we should have qtys 1, 2, 4, 8, 12)
141
- if accelerator_count not in accelerators_qtys:
142
- accelerators_qtys.add((accelerator_name, accelerator_count))
143
-
144
- for pod in pods:
145
- # Get all the pods running on the node
146
- if (pod.spec.node_name == node.metadata.name and
147
- pod.status.phase in ['Running', 'Pending']):
148
- # Iterate over all the containers in the pod and sum the
149
- # GPU requests
150
- for container in pod.spec.containers:
151
- if container.resources.requests:
152
- allocated_qty += int(
153
- container.resources.requests.get(
154
- 'nvidia.com/gpu', 0))
155
-
156
- accelerators_available = accelerator_count - allocated_qty
157
-
158
- if accelerator_count >= min_quantity_filter:
159
- quantized_count = (min_quantity_filter *
160
- (accelerator_count // min_quantity_filter))
161
- if accelerator_name not in total_accelerators_capacity:
162
- total_accelerators_capacity[
163
- accelerator_name] = quantized_count
164
- else:
165
- total_accelerators_capacity[
166
- accelerator_name] += quantized_count
167
-
168
- if accelerator_name not in total_accelerators_available:
169
- total_accelerators_available[accelerator_name] = 0
170
- if accelerators_available >= min_quantity_filter:
171
- quantized_availability = min_quantity_filter * (
172
- accelerators_available // min_quantity_filter)
173
- total_accelerators_available[
174
- accelerator_name] += quantized_availability
137
+ for key in keys:
138
+ if key in node.metadata.labels:
139
+ allocated_qty = 0
140
+ accelerator_name = lf.get_accelerator_from_label_value(
141
+ node.metadata.labels.get(key))
142
+
143
+ # Exclude multi-host TPUs from being processed.
144
+ # TODO(Doyoung): Remove the logic when adding support for
145
+ # multi-host TPUs.
146
+ if kubernetes_utils.is_multi_host_tpu(node.metadata.labels):
147
+ continue
148
+
149
+ # Check if name_filter regex matches the accelerator_name
150
+ regex_flags = 0 if case_sensitive else re.IGNORECASE
151
+ if name_filter and not re.match(
152
+ name_filter, accelerator_name, flags=regex_flags):
153
+ continue
154
+
155
+ # Generate the accelerator quantities
156
+ accelerator_count = (
157
+ kubernetes_utils.get_node_accelerator_count(
158
+ node.status.allocatable))
159
+
160
+ if accelerator_name and accelerator_count > 0:
161
+ # TPUs are counted in a different way compared to GPUs.
162
+ # Multi-node GPUs can be split into smaller units and be
163
+ # provisioned, but TPUs are considered as an atomic unit.
164
+ if kubernetes_utils.is_tpu_on_gke(accelerator_name):
165
+ accelerators_qtys.add(
166
+ (accelerator_name, accelerator_count))
167
+ else:
168
+ count = 1
169
+ while count <= accelerator_count:
170
+ accelerators_qtys.add((accelerator_name, count))
171
+ count *= 2
172
+ # Add the accelerator count if it's not already in the
173
+ # set (e.g., if there's 12 GPUs, we should have qtys 1,
174
+ # 2, 4, 8, 12)
175
+ if accelerator_count not in accelerators_qtys:
176
+ accelerators_qtys.add(
177
+ (accelerator_name, accelerator_count))
178
+
179
+ if accelerator_count >= min_quantity_filter:
180
+ quantized_count = (
181
+ min_quantity_filter *
182
+ (accelerator_count // min_quantity_filter))
183
+ if accelerator_name not in total_accelerators_capacity:
184
+ total_accelerators_capacity[
185
+ accelerator_name] = quantized_count
186
+ else:
187
+ total_accelerators_capacity[
188
+ accelerator_name] += quantized_count
189
+
190
+ if pods is None:
191
+ # If we can't get the pods, we can't get the GPU usage
192
+ total_accelerators_available[accelerator_name] = -1
193
+ continue
194
+
195
+ for pod in pods:
196
+ # Get all the pods running on the node
197
+ if (pod.spec.node_name == node.metadata.name and
198
+ pod.status.phase in ['Running', 'Pending']):
199
+ # Iterate over all the containers in the pod and sum
200
+ # the GPU requests
201
+ for container in pod.spec.containers:
202
+ if container.resources.requests:
203
+ allocated_qty += (
204
+ kubernetes_utils.get_node_accelerator_count(
205
+ container.resources.requests))
206
+
207
+ accelerators_available = accelerator_count - allocated_qty
208
+
209
+ if accelerator_name not in total_accelerators_available:
210
+ total_accelerators_available[accelerator_name] = 0
211
+ if accelerators_available >= min_quantity_filter:
212
+ quantized_availability = min_quantity_filter * (
213
+ accelerators_available // min_quantity_filter)
214
+ total_accelerators_available[
215
+ accelerator_name] += quantized_availability
175
216
 
176
217
  result = []
177
218
 
@@ -17,6 +17,7 @@ import cachetools
17
17
  from sky import sky_logging
18
18
  from sky import skypilot_config
19
19
  from sky.provision.gcp import constants
20
+ from sky.provision.kubernetes import utils as kubernetes_utils
20
21
  from sky.utils import subprocess_utils
21
22
 
22
23
  if typing.TYPE_CHECKING:
@@ -35,7 +36,10 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool:
35
36
  def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
36
37
  if not is_tpu(resources):
37
38
  return False
38
- assert resources is not None
39
+ assert (resources is not None and len(resources.accelerators) == 1)
40
+ acc, _ = list(resources.accelerators.items())[0]
41
+ if kubernetes_utils.is_tpu_on_gke(acc):
42
+ return False
39
43
  if resources.accelerator_args is None:
40
44
  return True
41
45
  return resources.accelerator_args.get('tpu_vm', True)
sky/jobs/core.py CHANGED
@@ -26,9 +26,11 @@ from sky.utils import controller_utils
26
26
  from sky.utils import dag_utils
27
27
  from sky.utils import rich_utils
28
28
  from sky.utils import subprocess_utils
29
+ from sky.utils import timeline
29
30
  from sky.utils import ux_utils
30
31
 
31
32
 
33
+ @timeline.event
32
34
  @usage_lib.entrypoint
33
35
  def launch(
34
36
  task: Union['sky.Task', 'sky.Dag'],
sky/optimizer.py CHANGED
@@ -22,6 +22,7 @@ from sky.utils import log_utils
22
22
  from sky.utils import resources_utils
23
23
  from sky.utils import rich_utils
24
24
  from sky.utils import subprocess_utils
25
+ from sky.utils import timeline
25
26
  from sky.utils import ux_utils
26
27
 
27
28
  if typing.TYPE_CHECKING:
@@ -105,6 +106,7 @@ class Optimizer:
105
106
  return egress_time
106
107
 
107
108
  @staticmethod
109
+ @timeline.event
108
110
  def optimize(dag: 'dag_lib.Dag',
109
111
  minimize: OptimizeTarget = OptimizeTarget.COST,
110
112
  blocked_resources: Optional[Iterable[
sky/provision/__init__.py CHANGED
@@ -24,6 +24,7 @@ from sky.provision import oci
24
24
  from sky.provision import runpod
25
25
  from sky.provision import vsphere
26
26
  from sky.utils import command_runner
27
+ from sky.utils import timeline
27
28
 
28
29
  logger = sky_logging.init_logger(__name__)
29
30
 
@@ -59,6 +60,7 @@ def _route_to_cloud_impl(func):
59
60
  # pylint: disable=unused-argument
60
61
 
61
62
 
63
+ @timeline.event
62
64
  @_route_to_cloud_impl
63
65
  def query_instances(
64
66
  provider_name: str,
@@ -2,7 +2,7 @@
2
2
  import copy
3
3
  import json
4
4
  import time
5
- from typing import Any, Callable, Dict, List, Optional
5
+ from typing import Any, Callable, Dict, List, Optional, Union
6
6
  import uuid
7
7
 
8
8
  from sky import exceptions
@@ -47,6 +47,72 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]:
47
47
  return {'component': f'{cluster_name}-head'}
48
48
 
49
49
 
50
+ def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
51
+ # Returns a formatted string of resource requirements for a pod.
52
+ resource_requirements = {}
53
+
54
+ if isinstance(pod_or_spec, dict):
55
+ containers = pod_or_spec.get('spec', {}).get('containers', [])
56
+ else:
57
+ containers = pod_or_spec.spec.containers
58
+
59
+ for container in containers:
60
+ if isinstance(container, dict):
61
+ resources = container.get('resources', {})
62
+ requests = resources.get('requests', {})
63
+ else:
64
+ resources = container.resources
65
+ requests = resources.requests or {}
66
+
67
+ for resource, value in requests.items():
68
+ if resource not in resource_requirements:
69
+ resource_requirements[resource] = 0
70
+ if resource == 'memory':
71
+ int_value = kubernetes_utils.parse_memory_resource(value)
72
+ else:
73
+ int_value = kubernetes_utils.parse_cpu_or_gpu_resource(value)
74
+ resource_requirements[resource] += int(int_value)
75
+ return ', '.join(f'{resource}={value}'
76
+ for resource, value in resource_requirements.items())
77
+
78
+
79
+ def _formatted_node_selector(pod_or_spec: Union[Any, dict]) -> Optional[str]:
80
+ # Returns a formatted string of node selectors for a pod.
81
+ node_selectors = []
82
+
83
+ if isinstance(pod_or_spec, dict):
84
+ selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {})
85
+ else:
86
+ selectors = pod_or_spec.spec.node_selector
87
+
88
+ if not selectors:
89
+ return None
90
+
91
+ for label_key, label_value in selectors.items():
92
+ node_selectors.append(f'{label_key}={label_value}')
93
+ return ', '.join(node_selectors)
94
+
95
+
96
+ def _lack_resource_msg(resource: str,
97
+ pod_or_spec: Union[Any, dict],
98
+ extra_msg: Optional[str] = None,
99
+ details: Optional[str] = None) -> str:
100
+ resource_requirements = _formatted_resource_requirements(pod_or_spec)
101
+ node_selectors = _formatted_node_selector(pod_or_spec)
102
+ node_selector_str = f' and labels ({node_selectors})' if (
103
+ node_selectors) else ''
104
+ msg = (f'Insufficient {resource} capacity on the cluster. '
105
+ f'Required resources ({resource_requirements}){node_selector_str} '
106
+ 'were not found in a single node. Other SkyPilot tasks or pods may '
107
+ 'be using resources. Check resource usage by running '
108
+ '`kubectl describe nodes`.')
109
+ if extra_msg:
110
+ msg += f' {extra_msg}'
111
+ if details:
112
+ msg += f'\nFull error: {details}'
113
+ return msg
114
+
115
+
50
116
  def _raise_pod_scheduling_errors(namespace, context, new_nodes):
51
117
  """Raise pod scheduling failure reason.
52
118
 
@@ -54,52 +120,6 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
54
120
  are recorded as events. This function retrieves those events and raises
55
121
  descriptive errors for better debugging and user feedback.
56
122
  """
57
-
58
- def _formatted_resource_requirements(pod):
59
- # Returns a formatted string of resource requirements for a pod.
60
- resource_requirements = {}
61
- for container in pod.spec.containers:
62
- for resource, value in container.resources.requests.items():
63
- if resource not in resource_requirements:
64
- resource_requirements[resource] = 0
65
- if resource == 'memory':
66
- int_value = kubernetes_utils.parse_memory_resource(value)
67
- else:
68
- int_value = kubernetes_utils.parse_cpu_or_gpu_resource(
69
- value)
70
- resource_requirements[resource] += int_value
71
- return ', '.join(f'{resource}={value}'
72
- for resource, value in resource_requirements.items())
73
-
74
- def _formatted_node_selector(pod) -> Optional[str]:
75
- # Returns a formatted string of node selectors for a pod.
76
- node_selectors = []
77
- if pod.spec.node_selector is None:
78
- return None
79
- for label_key, label_value in pod.spec.node_selector.items():
80
- node_selectors.append(f'{label_key}={label_value}')
81
- return ', '.join(node_selectors)
82
-
83
- def _lack_resource_msg(resource: str,
84
- pod,
85
- extra_msg: Optional[str] = None,
86
- details: Optional[str] = None) -> str:
87
- resource_requirements = _formatted_resource_requirements(pod)
88
- node_selectors = _formatted_node_selector(pod)
89
- node_selector_str = f' and labels ({node_selectors})' if (
90
- node_selectors) else ''
91
- msg = (
92
- f'Insufficient {resource} capacity on the cluster. '
93
- f'Required resources ({resource_requirements}){node_selector_str} '
94
- 'were not found in a single node. Other SkyPilot tasks or pods may '
95
- 'be using resources. Check resource usage by running '
96
- '`kubectl describe nodes`.')
97
- if extra_msg:
98
- msg += f' {extra_msg}'
99
- if details:
100
- msg += f'\nFull error: {details}'
101
- return msg
102
-
103
123
  for new_node in new_nodes:
104
124
  pod = kubernetes.core_api(context).read_namespaced_pod(
105
125
  new_node.metadata.name, namespace)
@@ -148,8 +168,8 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
148
168
  '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long
149
169
  f' Full error: {event_message}')
150
170
  gpu_lf_keys = [
151
- lf.get_label_key()
152
- for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
171
+ key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
172
+ for key in lf.get_label_keys()
153
173
  ]
154
174
  if pod.spec.node_selector:
155
175
  for label_key in pod.spec.node_selector.keys():
@@ -157,10 +177,24 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
157
177
  # TODO(romilb): We may have additional node
158
178
  # affinity selectors in the future - in that
159
179
  # case we will need to update this logic.
160
- if (('Insufficient nvidia.com/gpu'
161
- in event_message) or
162
- ('didn\'t match Pod\'s node affinity/selector'
163
- in event_message)):
180
+ # TODO(Doyoung): Update the error message raised
181
+ # with the multi-host TPU support.
182
+ if 'Insufficient google.com/tpu' in event_message:
183
+ extra_msg = (
184
+ f'Verify if '
185
+ f'{pod.spec.node_selector[label_key]}'
186
+ ' is available in the cluster. Note '
187
+ 'that multi-host TPU podslices are '
188
+ 'currently not unsupported.')
189
+ raise config_lib.KubernetesError(
190
+ _lack_resource_msg('TPU',
191
+ pod,
192
+ extra_msg,
193
+ details=event_message))
194
+ elif (('Insufficient nvidia.com/gpu'
195
+ in event_message) or
196
+ ('didn\'t match Pod\'s node affinity/selector'
197
+ in event_message)):
164
198
  extra_msg = (
165
199
  f'Verify if '
166
200
  f'{pod.spec.node_selector[label_key]}'
@@ -553,6 +587,20 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
553
587
  logger.info('Failed to create Pod without AppArmor annotation: '
554
588
  f'{retry_exception}')
555
589
  raise retry_exception
590
+ # Unlike errors caused by a lack of CPU/GPU/memory resources, the TPU
591
+ # shortage error is raised when the pod is attempted to be created.
592
+ # TODO(Doyoung): Update the error message raised with the multi-host
593
+ # TPU support.
594
+ elif 'Invalid resource requests for google.com/tpu.' in error_message:
595
+ extra_message = ('Verify if the cluster has a TPU slice node with '
596
+ 'a topology matching the number of TPU(s) '
597
+ 'requested. Note that multi-host TPU podslices '
598
+ 'are currently not unsupported.')
599
+ raise config_lib.KubernetesError(
600
+ _lack_resource_msg('TPU',
601
+ pod_spec,
602
+ details=error_message,
603
+ extra_msg=extra_message))
556
604
  else:
557
605
  # Re-raise the exception if it's a different error
558
606
  raise e
@@ -633,8 +681,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
633
681
  'override runtimeClassName in ~/.sky/config.yaml. '
634
682
  'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html') # pylint: disable=line-too-long
635
683
 
636
- needs_gpus = (pod_spec['spec']['containers'][0].get('resources', {}).get(
637
- 'limits', {}).get('nvidia.com/gpu', 0) > 0)
684
+ needs_gpus = False
685
+ limits = pod_spec['spec']['containers'][0].get('resources',
686
+ {}).get('limits')
687
+ if limits is not None:
688
+ needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0
689
+
690
+ # TPU pods provisioned on GKE use the default containerd runtime.
691
+ # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
638
692
  if nvidia_runtime_exists and needs_gpus:
639
693
  pod_spec['spec']['runtimeClassName'] = 'nvidia'
640
694
 
@@ -679,6 +733,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
679
733
  }
680
734
  }
681
735
 
736
+ # TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
737
+ # This is to prevent non-TPU workloads from being scheduled on TPU
738
+ # slice nodes. We need this toleration to allow the pod to be scheduled
739
+ # on TPU nodes.
740
+ # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work # pylint: disable=line-too-long
741
+ tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY
742
+ if tpu_label in config.node_config.get('spec',
743
+ {}).get('nodeSelector', {}):
744
+ tpu_toleration = {
745
+ 'key': kubernetes_utils.TPU_RESOURCE_KEY,
746
+ 'operator': 'Equal',
747
+ 'value': 'present',
748
+ 'effect': 'NoSchedule'
749
+ }
750
+ pod_spec['spec']['tolerations'] = [tpu_toleration]
751
+
682
752
  pod = _create_namespaced_pod_with_retries(namespace, pod_spec, context)
683
753
  created_pods[pod.metadata.name] = pod
684
754
  if head_pod_name is None: