skypilot-nightly 1.0.0.dev20241111__py3-none-any.whl → 1.0.0.dev20241113__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +1 -0
- sky/cli.py +22 -6
- sky/clouds/cloud.py +2 -0
- sky/clouds/kubernetes.py +19 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +102 -61
- sky/clouds/utils/gcp_utils.py +5 -1
- sky/jobs/core.py +2 -0
- sky/optimizer.py +2 -0
- sky/provision/__init__.py +2 -0
- sky/provision/kubernetes/instance.py +125 -55
- sky/provision/kubernetes/utils.py +361 -102
- sky/resources.py +38 -27
- sky/serve/serve_utils.py +79 -78
- sky/skylet/log_lib.py +1 -4
- sky/templates/kubernetes-ray.yml.j2 +29 -3
- sky/utils/kubernetes/generate_kubeconfig.sh +3 -0
- sky/utils/kubernetes/gpu_labeler.py +2 -2
- sky/utils/log_utils.py +52 -1
- sky/utils/timeline.py +3 -1
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/RECORD +26 -26
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241111.dist-info → skypilot_nightly-1.0.0.dev20241113.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'eea13cc624a10bd4319eace0f48dcceb9d0287cd'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241111'
+__version__ = '1.0.0.dev20241113'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
sky/backends/backend_utils.py
CHANGED
sky/cli.py
CHANGED
@@ -3102,6 +3102,7 @@ def show_gpus(
     kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
     kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
         sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
+    no_permissions_str = '<no permissions>'
 
     def _list_to_str(lst):
         return ', '.join([str(e) for e in lst])
@@ -3142,13 +3143,16 @@ def show_gpus(
                     'in Kubernetes cluster. ')
             debug_msg = ('To show available accelerators on kubernetes,'
                          ' run: sky show-gpus --cloud kubernetes ')
-            full_err_msg = (err_msg +
+            full_err_msg = (err_msg +
+                            kubernetes_utils.NO_ACCELERATOR_HELP_MESSAGE +
                             debug_msg)
             raise ValueError(full_err_msg)
         for gpu, _ in sorted(counts.items()):
+            available_qty = available[gpu] if available[gpu] != -1 else (
+                no_permissions_str)
             realtime_gpu_table.add_row([
                 gpu,
-                _list_to_str(counts.pop(gpu)), capacity[gpu],
+                _list_to_str(counts.pop(gpu)), capacity[gpu], available_qty
             ])
         return realtime_gpu_table
 
@@ -3158,10 +3162,12 @@ def show_gpus(
 
         node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
         for node_name, node_info in node_info_dict.items():
+            available = node_info.free[
+                'accelerators_available'] if node_info.free[
+                    'accelerators_available'] != -1 else no_permissions_str
             node_table.add_row([
-                node_name, node_info.
-                node_info.total['
-                node_info.free['nvidia.com/gpu']
+                node_name, node_info.accelerator_type,
+                node_info.total['accelerator_count'], available
             ])
         return node_table
 
@@ -3216,8 +3222,18 @@ def show_gpus(
             yield from k8s_realtime_table.get_string()
             k8s_node_table = _get_kubernetes_node_info_table(context)
             yield '\n\n'
+            # TODO(Doyoung): Update the message with the multi-host TPU
+            # support.
+            k8s_per_node_acc_message = (
+                'Kubernetes per node accelerator availability ')
+            if kubernetes_utils.multi_host_tpu_exists_in_cluster(
+                    context):
+                k8s_per_node_acc_message += (
+                    '(Note: Multi-host TPUs are detected and excluded '
+                    'from the display as multi-host TPUs are not '
+                    'supported.)')
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                   f'
+                   f'{k8s_per_node_acc_message}'
                    f'{colorama.Style.RESET_ALL}\n')
             yield from k8s_node_table.get_string()
             if kubernetes_autoscaling:
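
The new availability column uses -1 as a sentinel for "could not list pods", and the CLI maps it to a placeholder string rather than printing a misleading zero. A minimal standalone illustration of that mapping (not SkyPilot's actual helper):

    NO_PERMISSIONS_STR = '<no permissions>'

    def format_available(qty: int) -> str:
        # -1 is the sentinel set when pod listing was forbidden (HTTP 403).
        return NO_PERMISSIONS_STR if qty == -1 else str(qty)

    assert format_available(-1) == '<no permissions>'
    assert format_available(4) == '4'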
sky/clouds/cloud.py
CHANGED
@@ -18,6 +18,7 @@ from sky import skypilot_config
 from sky.clouds import service_catalog
 from sky.utils import log_utils
 from sky.utils import resources_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -366,6 +367,7 @@ class Cloud:
         del label_key, label_value
         return True, None
 
+    @timeline.event
     def get_feasible_launchable_resources(
         self,
         resources: 'resources_lib.Resources',
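
sky/utils/timeline.py itself changes by only a few lines in this release (+3 -1), so the decorator's internals are not shown here. Conceptually, @timeline.event is a timing decorator that records when the wrapped call starts and ends. A minimal sketch of the idea, with an in-memory sink standing in for whatever SkyPilot actually writes to:

    import functools
    import time

    _events = []  # stand-in sink; the real decorator presumably logs elsewhere

    def event(func):
        """Record wall-clock start/end timestamps around each call."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            try:
                return func(*args, **kwargs)
            finally:
                _events.append((func.__qualname__, start, time.time()))
        return wrapper

    @event
    def get_feasible_launchable_resources():
        time.sleep(0.01)  # placeholder for real work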
sky/clouds/kubernetes.py
CHANGED
@@ -362,11 +362,23 @@ class Kubernetes(clouds.Cloud):
 
         k8s_acc_label_key = None
         k8s_acc_label_value = None
+        k8s_topology_label_key = None
+        k8s_topology_label_value = None
+        k8s_resource_key = None
+        tpu_requested = False
 
-        # If
+        # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
         if acc_count > 0 and acc_type is not None:
-            k8s_acc_label_key, k8s_acc_label_value
-            ... (elided in the source diff)
+            (k8s_acc_label_key, k8s_acc_label_value, k8s_topology_label_key,
+             k8s_topology_label_value) = (
+                 kubernetes_utils.get_accelerator_label_key_value(
+                     context, acc_type, acc_count))
+            if (k8s_acc_label_key ==
+                    kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
+                tpu_requested = True
+                k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
+            else:
+                k8s_resource_key = kubernetes_utils.GPU_RESOURCE_KEY
 
         port_mode = network_utils.get_port_mode(None)
 
@@ -428,6 +440,10 @@ class Kubernetes(clouds.Cloud):
             'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
             'k8s_spot_label_key': spot_label_key,
             'k8s_spot_label_value': spot_label_value,
+            'tpu_requested': tpu_requested,
+            'k8s_topology_label_key': k8s_topology_label_key,
+            'k8s_topology_label_value': k8s_topology_label_value,
+            'k8s_resource_key': k8s_resource_key,
             'image_id': image_id,
         }
 
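
The four new deploy variables feed sky/templates/kubernetes-ray.yml.j2 (also changed in this release, +29 -3). The template itself is not shown in this diff; the fragment below is an illustrative guess at how such variables typically drive manifest rendering, not the actual file contents:

    import jinja2

    manifest = jinja2.Template("""\
    resources:
      limits:
        {{ k8s_resource_key }}: {{ acc_count }}
    {% if tpu_requested %}nodeSelector:
      {{ k8s_topology_label_key }}: {{ k8s_topology_label_value }}
    {% endif %}""").render(
        k8s_resource_key='google.com/tpu',
        acc_count=4,
        tpu_requested=True,
        # Label key/value below are examples of GKE TPU topology labels.
        k8s_topology_label_key='cloud.google.com/gke-tpu-topology',
        k8s_topology_label_value='2x2x1')
    print(manifest)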
sky/clouds/service_catalog/kubernetes_catalog.py
CHANGED
@@ -10,6 +10,7 @@ from typing import Dict, List, Optional, Set, Tuple
 from sky import check as sky_check
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
+from sky.adaptors import kubernetes
 from sky.clouds import Kubernetes
 from sky.clouds.service_catalog import CloudFilter
 from sky.clouds.service_catalog import common
@@ -22,6 +23,8 @@ if typing.TYPE_CHECKING:
 else:
     pd = adaptors_common.LazyImport('pandas')
 
+logger = sky_logging.init_logger(__name__)
+
 _PULL_FREQUENCY_HOURS = 7
 
 # We keep pull_frequency_hours so we can remotely update the default image paths
@@ -77,6 +80,11 @@ def list_accelerators_realtime(
     require_price: bool = True
 ) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
                                                                           int]]:
+    """List accelerators in the Kubernetes cluster.
+
+    If the user does not have sufficient permissions to list pods in all
+    namespaces, the function will return free GPUs as -1.
+    """
     # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
     # function from kubernetes_utils.
     del all_regions, require_price  # Unused.
@@ -96,19 +104,29 @@ def list_accelerators_realtime(
        ) or not kubernetes_utils.check_credentials(context)[0]:
         return {}, {}, {}
 
-    has_gpu = kubernetes_utils.
+    has_gpu = kubernetes_utils.detect_accelerator_resource(context)
     if not has_gpu:
         return {}, {}, {}
 
-
-    if not
+    lf, _ = kubernetes_utils.detect_gpu_label_formatter(context)
+    if not lf:
         return {}, {}, {}
 
     accelerators_qtys: Set[Tuple[str, int]] = set()
-
+    keys = lf.get_label_keys()
     nodes = kubernetes_utils.get_kubernetes_nodes(context)
     # Get the pods to get the real-time GPU usage
-
+    try:
+        pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(context)
+    except kubernetes.api_exception() as e:
+        if e.status == 403:
+            logger.warning('Failed to get pods in the Kubernetes cluster '
+                           '(forbidden). Please check if your account has '
+                           'necessary permissions to list pods. Realtime GPU '
+                           'availability information may be incorrect.')
+            pods = None
+        else:
+            raise
     # Total number of GPUs in the cluster
     total_accelerators_capacity: Dict[str, int] = {}
     # Total number of GPUs currently available in the cluster
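
The permission fallback hinges on the Kubernetes API returning HTTP 403 when the account lacks cluster-wide pod-list rights. Against the raw kubernetes Python client (SkyPilot wraps it in sky.adaptors.kubernetes), the same pattern looks like this:

    from kubernetes import client, config
    from kubernetes.client.rest import ApiException

    config.load_kube_config()
    v1 = client.CoreV1Api()
    try:
        pods = v1.list_pod_for_all_namespaces().items
    except ApiException as e:
        if e.status == 403:  # RBAC forbids listing pods cluster-wide
            pods = None      # downstream code reports availability as -1
        else:
            raise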
@@ -116,62 +134,85 @@ def list_accelerators_realtime(
     min_quantity_filter = quantity_filter if quantity_filter else 1
 
     for node in nodes:
-        ... (old per-node counting logic, original lines 119-174, elided in
-        the source diff)
+        for key in keys:
+            if key in node.metadata.labels:
+                allocated_qty = 0
+                accelerator_name = lf.get_accelerator_from_label_value(
+                    node.metadata.labels.get(key))
+
+                # Exclude multi-host TPUs from being processed.
+                # TODO(Doyoung): Remove the logic when adding support for
+                # multi-host TPUs.
+                if kubernetes_utils.is_multi_host_tpu(node.metadata.labels):
+                    continue
+
+                # Check if name_filter regex matches the accelerator_name
+                regex_flags = 0 if case_sensitive else re.IGNORECASE
+                if name_filter and not re.match(
+                        name_filter, accelerator_name, flags=regex_flags):
+                    continue
+
+                # Generate the accelerator quantities
+                accelerator_count = (
+                    kubernetes_utils.get_node_accelerator_count(
+                        node.status.allocatable))
+
+                if accelerator_name and accelerator_count > 0:
+                    # TPUs are counted in a different way compared to GPUs.
+                    # Multi-node GPUs can be split into smaller units and be
+                    # provisioned, but TPUs are considered as an atomic unit.
+                    if kubernetes_utils.is_tpu_on_gke(accelerator_name):
+                        accelerators_qtys.add(
+                            (accelerator_name, accelerator_count))
+                    else:
+                        count = 1
+                        while count <= accelerator_count:
+                            accelerators_qtys.add((accelerator_name, count))
+                            count *= 2
+                        # Add the accelerator count if it's not already in the
+                        # set (e.g., if there's 12 GPUs, we should have qtys 1,
+                        # 2, 4, 8, 12)
+                        if accelerator_count not in accelerators_qtys:
+                            accelerators_qtys.add(
+                                (accelerator_name, accelerator_count))
+
+                    if accelerator_count >= min_quantity_filter:
+                        quantized_count = (
+                            min_quantity_filter *
+                            (accelerator_count // min_quantity_filter))
+                        if accelerator_name not in total_accelerators_capacity:
+                            total_accelerators_capacity[
+                                accelerator_name] = quantized_count
+                        else:
+                            total_accelerators_capacity[
+                                accelerator_name] += quantized_count
+
+                    if pods is None:
+                        # If we can't get the pods, we can't get the GPU usage
+                        total_accelerators_available[accelerator_name] = -1
+                        continue
+
+                    for pod in pods:
+                        # Get all the pods running on the node
+                        if (pod.spec.node_name == node.metadata.name and
+                                pod.status.phase in ['Running', 'Pending']):
+                            # Iterate over all the containers in the pod and sum
+                            # the GPU requests
+                            for container in pod.spec.containers:
+                                if container.resources.requests:
+                                    allocated_qty += (
+                                        kubernetes_utils.get_node_accelerator_count(
+                                            container.resources.requests))
+
+                    accelerators_available = accelerator_count - allocated_qty
+
+                    if accelerator_name not in total_accelerators_available:
+                        total_accelerators_available[accelerator_name] = 0
+                    if accelerators_available >= min_quantity_filter:
+                        quantized_availability = min_quantity_filter * (
+                            accelerators_available // min_quantity_filter)
+                        total_accelerators_available[
+                            accelerator_name] += quantized_availability
 
     result = []
 
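
The quantity generation above enumerates power-of-two GPU counts and then tops up with the node's full capacity; TPU slices skip this and are offered only as whole units. The GPU branch, extracted as a standalone sketch:

    def gpu_quantities(total_count: int) -> set:
        """Power-of-two counts up to total_count, plus total_count itself.

        A 12-GPU node yields {1, 2, 4, 8, 12}.
        """
        qtys = set()
        count = 1
        while count <= total_count:
            qtys.add(count)
            count *= 2
        qtys.add(total_count)  # include the full node capacity
        return qtys

    assert gpu_quantities(12) == {1, 2, 4, 8, 12}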
sky/clouds/utils/gcp_utils.py
CHANGED
@@ -17,6 +17,7 @@ import cachetools
 from sky import sky_logging
 from sky import skypilot_config
 from sky.provision.gcp import constants
+from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.utils import subprocess_utils
 
 if typing.TYPE_CHECKING:
@@ -35,7 +36,10 @@ def is_tpu(resources: Optional['resources_lib.Resources']) -> bool:
 def is_tpu_vm(resources: Optional['resources_lib.Resources']) -> bool:
     if not is_tpu(resources):
         return False
-    assert resources is not None
+    assert (resources is not None and len(resources.accelerators) == 1)
+    acc, _ = list(resources.accelerators.items())[0]
+    if kubernetes_utils.is_tpu_on_gke(acc):
+        return False
     if resources.accelerator_args is None:
         return True
     return resources.accelerator_args.get('tpu_vm', True)
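
The new guard relies on kubernetes_utils.is_tpu_on_gke (defined in sky/provision/kubernetes/utils.py, changed in this release) to tell GKE TPU slice names apart from GCP TPU-VM names, so that is_tpu_vm does not misclassify Kubernetes TPUs. A rough illustrative heuristic of the distinction; this is not SkyPilot's actual implementation:

    def looks_like_gke_tpu(accelerator: str) -> bool:
        # GKE advertises TPUs with slice-style names such as
        # 'tpu-v4-podslice' or 'tpu-v5-lite-podslice', while GCP TPU VMs
        # use names such as 'tpu-v4-8'. Heuristic for illustration only.
        return 'podslice' in accelerator

    assert looks_like_gke_tpu('tpu-v4-podslice')
    assert not looks_like_gke_tpu('tpu-v4-8')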
sky/jobs/core.py
CHANGED
@@ -26,9 +26,11 @@ from sky.utils import controller_utils
 from sky.utils import dag_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 
+@timeline.event
 @usage_lib.entrypoint
 def launch(
     task: Union['sky.Task', 'sky.Dag'],
sky/optimizer.py
CHANGED
@@ -22,6 +22,7 @@ from sky.utils import log_utils
 from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
+from sky.utils import timeline
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -105,6 +106,7 @@ class Optimizer:
         return egress_time
 
     @staticmethod
+    @timeline.event
     def optimize(dag: 'dag_lib.Dag',
                  minimize: OptimizeTarget = OptimizeTarget.COST,
                  blocked_resources: Optional[Iterable[
sky/provision/__init__.py
CHANGED
@@ -24,6 +24,7 @@ from sky.provision import oci
 from sky.provision import runpod
 from sky.provision import vsphere
 from sky.utils import command_runner
+from sky.utils import timeline
 
 logger = sky_logging.init_logger(__name__)
 
@@ -59,6 +60,7 @@ def _route_to_cloud_impl(func):
 # pylint: disable=unused-argument
 
 
+@timeline.event
 @_route_to_cloud_impl
 def query_instances(
     provider_name: str,
sky/provision/kubernetes/instance.py
CHANGED
@@ -2,7 +2,7 @@
 import copy
 import json
 import time
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 import uuid
 
 from sky import exceptions
@@ -47,6 +47,72 @@ def head_service_selector(cluster_name: str) -> Dict[str, str]:
     return {'component': f'{cluster_name}-head'}
 
 
+def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
+    # Returns a formatted string of resource requirements for a pod.
+    resource_requirements = {}
+
+    if isinstance(pod_or_spec, dict):
+        containers = pod_or_spec.get('spec', {}).get('containers', [])
+    else:
+        containers = pod_or_spec.spec.containers
+
+    for container in containers:
+        if isinstance(container, dict):
+            resources = container.get('resources', {})
+            requests = resources.get('requests', {})
+        else:
+            resources = container.resources
+            requests = resources.requests or {}
+
+        for resource, value in requests.items():
+            if resource not in resource_requirements:
+                resource_requirements[resource] = 0
+            if resource == 'memory':
+                int_value = kubernetes_utils.parse_memory_resource(value)
+            else:
+                int_value = kubernetes_utils.parse_cpu_or_gpu_resource(value)
+            resource_requirements[resource] += int(int_value)
+    return ', '.join(f'{resource}={value}'
+                     for resource, value in resource_requirements.items())
+
+
+def _formatted_node_selector(pod_or_spec: Union[Any, dict]) -> Optional[str]:
+    # Returns a formatted string of node selectors for a pod.
+    node_selectors = []
+
+    if isinstance(pod_or_spec, dict):
+        selectors = pod_or_spec.get('spec', {}).get('nodeSelector', {})
+    else:
+        selectors = pod_or_spec.spec.node_selector
+
+    if not selectors:
+        return None
+
+    for label_key, label_value in selectors.items():
+        node_selectors.append(f'{label_key}={label_value}')
+    return ', '.join(node_selectors)
+
+
+def _lack_resource_msg(resource: str,
+                       pod_or_spec: Union[Any, dict],
+                       extra_msg: Optional[str] = None,
+                       details: Optional[str] = None) -> str:
+    resource_requirements = _formatted_resource_requirements(pod_or_spec)
+    node_selectors = _formatted_node_selector(pod_or_spec)
+    node_selector_str = f' and labels ({node_selectors})' if (
+        node_selectors) else ''
+    msg = (f'Insufficient {resource} capacity on the cluster. '
+           f'Required resources ({resource_requirements}){node_selector_str} '
+           'were not found in a single node. Other SkyPilot tasks or pods may '
+           'be using resources. Check resource usage by running '
+           '`kubectl describe nodes`.')
+    if extra_msg:
+        msg += f' {extra_msg}'
+    if details:
+        msg += f'\nFull error: {details}'
+    return msg
+
+
 def _raise_pod_scheduling_errors(namespace, context, new_nodes):
     """Raise pod scheduling failure reason.
 
@@ -54,52 +120,6 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
     are recorded as events. This function retrieves those events and raises
     descriptive errors for better debugging and user feedback.
     """
-
-    def _formatted_resource_requirements(pod):
-        # Returns a formatted string of resource requirements for a pod.
-        resource_requirements = {}
-        for container in pod.spec.containers:
-            for resource, value in container.resources.requests.items():
-                if resource not in resource_requirements:
-                    resource_requirements[resource] = 0
-                if resource == 'memory':
-                    int_value = kubernetes_utils.parse_memory_resource(value)
-                else:
-                    int_value = kubernetes_utils.parse_cpu_or_gpu_resource(
-                        value)
-                resource_requirements[resource] += int_value
-        return ', '.join(f'{resource}={value}'
-                         for resource, value in resource_requirements.items())
-
-    def _formatted_node_selector(pod) -> Optional[str]:
-        # Returns a formatted string of node selectors for a pod.
-        node_selectors = []
-        if pod.spec.node_selector is None:
-            return None
-        for label_key, label_value in pod.spec.node_selector.items():
-            node_selectors.append(f'{label_key}={label_value}')
-        return ', '.join(node_selectors)
-
-    def _lack_resource_msg(resource: str,
-                           pod,
-                           extra_msg: Optional[str] = None,
-                           details: Optional[str] = None) -> str:
-        resource_requirements = _formatted_resource_requirements(pod)
-        node_selectors = _formatted_node_selector(pod)
-        node_selector_str = f' and labels ({node_selectors})' if (
-            node_selectors) else ''
-        msg = (
-            f'Insufficient {resource} capacity on the cluster. '
-            f'Required resources ({resource_requirements}){node_selector_str} '
-            'were not found in a single node. Other SkyPilot tasks or pods may '
-            'be using resources. Check resource usage by running '
-            '`kubectl describe nodes`.')
-        if extra_msg:
-            msg += f' {extra_msg}'
-        if details:
-            msg += f'\nFull error: {details}'
-        return msg
-
     for new_node in new_nodes:
         pod = kubernetes.core_api(context).read_namespaced_pod(
             new_node.metadata.name, namespace)
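
Hoisting the helpers to module level, with dict support, is what lets the pod-creation path further down build the same error message from a raw pod-spec dict before any V1Pod object exists. An illustrative call with a dict spec (output shape approximate):

    pod_spec = {
        'spec': {
            'containers': [{
                'resources': {'requests': {'cpu': '4', 'memory': '16Gi'}}
            }],
            'nodeSelector': {'cloud.google.com/gke-tpu-topology': '2x2x1'},
        }
    }
    # Roughly: 'Insufficient TPU capacity on the cluster. Required resources
    # (cpu=4, memory=16) and labels (cloud.google.com/gke-tpu-topology=2x2x1)
    # were not found in a single node. ...'
    print(_lack_resource_msg('TPU', pod_spec))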
@@ -148,8 +168,8 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.'  # pylint: disable=line-too-long
                     f' Full error: {event_message}')
                 gpu_lf_keys = [
-                    lf.
-                    for
+                    key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
+                    for key in lf.get_label_keys()
                 ]
                 if pod.spec.node_selector:
                     for label_key in pod.spec.node_selector.keys():
@@ -157,10 +177,24 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                         # TODO(romilb): We may have additional node
                         # affinity selectors in the future - in that
                         # case we will need to update this logic.
-                        ... (old condition, original lines 160-163, elided
-                        in the source diff)
+                        # TODO(Doyoung): Update the error message raised
+                        # with the multi-host TPU support.
+                        if 'Insufficient google.com/tpu' in event_message:
+                            extra_msg = (
+                                f'Verify if '
+                                f'{pod.spec.node_selector[label_key]}'
+                                ' is available in the cluster. Note '
+                                'that multi-host TPU podslices are '
+                                'currently not unsupported.')
+                            raise config_lib.KubernetesError(
+                                _lack_resource_msg('TPU',
+                                                   pod,
+                                                   extra_msg,
+                                                   details=event_message))
+                        elif (('Insufficient nvidia.com/gpu'
+                               in event_message) or
+                              ('didn\'t match Pod\'s node affinity/selector'
+                               in event_message)):
                             extra_msg = (
                                 f'Verify if '
                                 f'{pod.spec.node_selector[label_key]}'
@@ -553,6 +587,20 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
             logger.info('Failed to create Pod without AppArmor annotation: '
                         f'{retry_exception}')
             raise retry_exception
+    # Unlike other error from resource lackage on CPU/GPU/Memory, TPU
+    # lackage error is raised when pod is attemtped to be created.
+    # TODO(Doyoung): Update the error message raised with the multi-host
+    # TPU support.
+    elif 'Invalid resource requests for google.com/tpu.' in error_message:
+        extra_message = ('Verify if the cluster has a TPU slice node with '
+                         'a topology matching the number of TPU(s) '
+                         'requested. Note that multi-host TPU podslices '
+                         'are currently not unsupported.')
+        raise config_lib.KubernetesError(
+            _lack_resource_msg('TPU',
+                               pod_spec,
+                               details=error_message,
+                               extra_msg=extra_message))
     else:
         # Re-raise the exception if it's a different error
         raise e
@@ -633,8 +681,14 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             'override runtimeClassName in ~/.sky/config.yaml. '
             'For more details, refer to https://skypilot.readthedocs.io/en/latest/reference/config.html')  # pylint: disable=line-too-long
 
-    needs_gpus =
-        ... (elided in the source diff)
+    needs_gpus = False
+    limits = pod_spec['spec']['containers'][0].get('resources',
+                                                   {}).get('limits')
+    if limits is not None:
+        needs_gpus = limits.get(kubernetes_utils.GPU_RESOURCE_KEY, 0) > 0
+
+    # TPU pods provisioned on GKE use the default containerd runtime.
+    # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview  # pylint: disable=line-too-long
     if nvidia_runtime_exists and needs_gpus:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
 
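
The new needs_gpus lookup is plain dict traversal over the rendered pod spec. A standalone equivalent, spelling out the 'nvidia.com/gpu' key that GPU_RESOURCE_KEY evidently refers to here:

    pod_spec = {
        'spec': {
            'containers': [{
                'resources': {'limits': {'nvidia.com/gpu': 2}}
            }]
        }
    }
    limits = pod_spec['spec']['containers'][0].get('resources', {}).get('limits')
    needs_gpus = limits is not None and limits.get('nvidia.com/gpu', 0) > 0
    assert needs_gpus  # only then is runtimeClassName set to 'nvidia'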
@@ -679,6 +733,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 }
             }
 
+        # TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
+        # This is to prevent from non-TPU workloads from being scheduled on TPU
+        # slice nodes. We need this toleration to allow the pod to be scheduled
+        # on TPU nodes.
+        # Reference: https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work  # pylint: disable=line-too-long
+        tpu_label = kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY
+        if tpu_label in config.node_config.get('spec',
+                                               {}).get('nodeSelector', {}):
+            tpu_toleration = {
+                'key': kubernetes_utils.TPU_RESOURCE_KEY,
+                'operator': 'Equal',
+                'value': 'present',
+                'effect': 'NoSchedule'
+            }
+            pod_spec['spec']['tolerations'] = [tpu_toleration]
+
         pod = _create_namespaced_pod_with_retries(namespace, pod_spec, context)
         created_pods[pod.metadata.name] = pod
         if head_pod_name is None: