skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py
CHANGED
@@ -3,7 +3,6 @@ import copy
 import json
 import time
 from typing import Any, Callable, Dict, List, Optional, Union
-import uuid

 from sky import exceptions
 from sky import sky_logging
@@ -15,6 +14,7 @@ from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.kubernetes import volume
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import config_utils
@@ -240,7 +240,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     extra_msg,
                     details=event_message))
         raise config_lib.KubernetesError(f'{timeout_err_msg} '
-                                         f'Pod status: {pod_status}'
+                                         f'Pod status: {pod_status} '
                                          f'Details: \'{event_message}\' ')
     raise config_lib.KubernetesError(f'{timeout_err_msg}')

@@ -673,21 +673,6 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
         raise e


-def _create_persistent_volume_claim(namespace: str, context: Optional[str],
-                                    pvc_spec: Dict[str, Any]) -> None:
-    """Creates a persistent volume claim for SkyServe controller."""
-    try:
-        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
-            name=pvc_spec['metadata']['name'], namespace=namespace)
-        return
-    except kubernetes.api_exception() as e:
-        if e.status != 404:  # Not found
-            raise
-
-    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
-        namespace=namespace, body=pvc_spec)
-
-
 @timeline.event
 def _wait_for_deployment_pod(context,
                              namespace,
@@ -832,9 +817,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
                 constants.WORKER_NODE_TAGS)
-            pod_name = f'{cluster_name_on_cloud}-worker{i}'
-            pod_spec_copy['metadata']['name'] = pod_name
-            pod_spec_copy['metadata']['labels']['component'] = pod_name
+            pod_name = f'{cluster_name_on_cloud}-worker{i}'
+            if pod_name in running_pods:
+                # If the pod is already running, we skip creating it.
+                return
+            pod_spec_copy['metadata']['name'] = pod_name
+            pod_spec_copy['metadata']['labels']['component'] = pod_name
         # For multi-node support, we put a soft-constraint to schedule
         # worker pods on different nodes than the head pod.
         # This is not set as a hard constraint because if different nodes
@@ -888,7 +876,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     ]

     if to_create_deployment:
-        _create_persistent_volume_claim(namespace, context, pvc_spec)
+        volume.create_persistent_volume_claim(namespace, context, pvc_spec)

         # It's safe to directly modify the template spec in the deployment spec
         # because controller pod is singleton, i in [0].
@@ -910,6 +898,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             print('Deployment failed', e)
             raise e

+    # Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
+    # is used by any pod in the namespace.
+    volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
+
     return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
                                                context)

@@ -1012,40 +1004,6 @@ def stop_instances(
     raise NotImplementedError()


-def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
-                                    resource_name: str) -> None:
-    """Helper to delete Kubernetes resources with 404 handling and retries.
-
-    Args:
-        delete_func: Function to call to delete the resource
-        resource_type: Type of resource being deleted (e.g. 'service'),
-            used in logging
-        resource_name: Name of the resource being deleted, used in logging
-    """
-    max_retries = 3
-    retry_delay = 5  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            delete_func()
-            return
-        except kubernetes.api_exception() as e:
-            if e.status == 404:
-                logger.warning(
-                    f'terminate_instances: Tried to delete {resource_type} '
-                    f'{resource_name}, but the {resource_type} was not '
-                    'found (404).')
-                return
-            elif attempt < max_retries - 1:
-                logger.warning(f'terminate_instances: Failed to delete '
-                               f'{resource_type} {resource_name} (attempt '
-                               f'{attempt + 1}/{max_retries}). Error: {e}. '
-                               f'Retrying in {retry_delay} seconds...')
-                time.sleep(retry_delay)
-            else:
-                raise
-
-
 def _delete_services(name_prefix: str, namespace: str,
                      context: Optional[str]) -> None:
     """Delete services with the given name prefix.
@@ -1061,13 +1019,14 @@ def _delete_services(name_prefix: str, namespace: str,
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
         # pylint: disable=cell-var-from-loop
-        _delete_k8s_resource_with_retry(
-            delete_func=lambda: kubernetes.core_api(
-                context).delete_namespaced_service(
-                    name=service_name, namespace=namespace,
-                    _request_timeout=config_lib.DELETION_TIMEOUT),
-            resource_type='service',
-            resource_name=service_name)
+        kubernetes_utils.delete_k8s_resource_with_retry(
+            delete_func=lambda: kubernetes.core_api(
+                context).delete_namespaced_service(name=service_name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)
@@ -1087,7 +1046,7 @@ def _terminate_node(namespace: str,
     # from within the pod, e.g., for autodown.
     # Note - some misbehaving pods may not terminate gracefully if they have
     # open file descriptors. We force delete pods to avoid this.
-    _delete_k8s_resource_with_retry(
+    kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
             name=pod_name,
             namespace=namespace,
@@ -1105,26 +1064,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,

     # Delete deployment
     deployment_name = _get_deployment_name(cluster_name)
-    _delete_k8s_resource_with_retry(
-        delete_func=lambda: kubernetes.apps_api(
-            context).delete_namespaced_deployment(
-                name=deployment_name, namespace=namespace,
-                _request_timeout=config_lib.DELETION_TIMEOUT),
-        resource_type='deployment',
-        resource_name=deployment_name)
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.apps_api(
+            context).delete_namespaced_deployment(name=deployment_name,
+                                                  namespace=namespace,
+                                                  _request_timeout=config_lib.
+                                                  DELETION_TIMEOUT),
+        resource_type='deployment',
+        resource_name=deployment_name)

     # Delete PVCs
     pvc_name = _get_pvc_name(
         cluster_name,
         kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
     # pylint: disable=cell-var-from-loop
-    _delete_k8s_resource_with_retry(
-        delete_func=lambda: kubernetes.core_api(
-            context).delete_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace,
-                _request_timeout=config_lib.DELETION_TIMEOUT),
-        resource_type='pvc',
-        resource_name=pvc_name)
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)


 def terminate_instances(
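The retry helper that used to be private to this module is now shared via kubernetes_utils (see the utils.py hunks below). A minimal usage sketch with the signature taken from this diff; the context, pod name, and namespace are illustrative values, not from the diff:

    from sky.adaptors import kubernetes
    from sky.provision.kubernetes import config as config_lib
    from sky.provision.kubernetes import utils as kubernetes_utils

    context = None       # hypothetical: use the current kubeconfig context
    pod_name = 'my-pod'  # hypothetical pod name

    kubernetes_utils.delete_k8s_resource_with_retry(
        delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
            name=pod_name,
            namespace='default',
            _request_timeout=config_lib.DELETION_TIMEOUT),
        resource_type='pod',
        resource_name=pod_name)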
sky/provision/kubernetes/utils.py
CHANGED
@@ -10,7 +10,7 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse

 import sky
@@ -2734,6 +2734,21 @@ def get_kubernetes_node_info(
                 node.metadata.labels.get(label_key))
             break

+        # Extract IP address from node addresses (prefer external, fallback to internal)
+        node_ip = None
+        if node.status.addresses:
+            # First try to find external IP
+            for address in node.status.addresses:
+                if address.type == 'ExternalIP':
+                    node_ip = address.address
+                    break
+            # If no external IP, try to find internal IP
+            if node_ip is None:
+                for address in node.status.addresses:
+                    if address.type == 'InternalIP':
+                        node_ip = address.address
+                        break
+
         allocated_qty = 0
         accelerator_count = get_node_accelerator_count(node.status.allocatable)
@@ -2765,7 +2780,8 @@ def get_kubernetes_node_info(
             name=node.metadata.name,
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
-            free={'accelerators_available': int(accelerators_available)})
+            free={'accelerators_available': int(accelerators_available)},
+            ip_address=node_ip)
     hint = ''
     if has_multi_host_tpu:
         hint = ('(Note: Multi-host TPUs are detected and excluded from the '
@@ -3281,3 +3297,37 @@ def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:

         format_kubeconfig_exec_auth(config, path)
     return path
+
+
+def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
+                                   resource_name: str) -> None:
+    """Helper to delete Kubernetes resources with 404 handling and retries.
+
+    Args:
+        delete_func: Function to call to delete the resource
+        resource_type: Type of resource being deleted (e.g. 'service'),
+            used in logging
+        resource_name: Name of the resource being deleted, used in logging
+    """
+    max_retries = 3
+    retry_delay = 5  # seconds
+
+    for attempt in range(max_retries):
+        try:
+            delete_func()
+            return
+        except kubernetes.api_exception() as e:
+            if e.status == 404:
+                logger.warning(
+                    f'terminate_instances: Tried to delete {resource_type} '
+                    f'{resource_name}, but the {resource_type} was not '
+                    'found (404).')
+                return
+            elif attempt < max_retries - 1:
+                logger.warning(f'terminate_instances: Failed to delete '
+                               f'{resource_type} {resource_name} (attempt '
+                               f'{attempt + 1}/{max_retries}). Error: {e}. '
+                               f'Retrying in {retry_delay} seconds...')
+                time.sleep(retry_delay)
+            else:
+                raise
sky/provision/kubernetes/volume.py
ADDED
@@ -0,0 +1,147 @@
+"""Kubernetes pvc provisioning."""
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import models
+from sky import sky_logging
+from sky.adaptors import kubernetes
+from sky.provision.kubernetes import config as config_lib
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.volumes import volume as volume_lib
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _get_context_namespace(config: models.VolumeConfig) -> Tuple[str, str]:
+    """Gets the context and namespace of a volume."""
+    if config.region is None:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        config.region = context
+    else:
+        context = config.region
+    namespace = config.config.get('namespace')
+    if namespace is None:
+        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+        config.config['namespace'] = namespace
+    return context, namespace
+
+
+def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
+                            pod_spec: Dict[str, Any]) -> None:
+    """Checks if the PVC is used by any pod in the namespace."""
+    volumes = pod_spec.get('spec', {}).get('volumes', [])
+    if not volumes:
+        return
+    once_modes = [
+        volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value,
+        volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value
+    ]
+    for volume in volumes:
+        pvc_name = volume.get('persistentVolumeClaim', {}).get('claimName')
+        if not pvc_name:
+            continue
+        pvc = kubernetes.core_api(
+            context).read_namespaced_persistent_volume_claim(
+                name=pvc_name, namespace=namespace)
+        access_mode = pvc.spec.access_modes[0]
+        if access_mode not in once_modes:
+            continue
+        usedby = _get_volume_usedby(context, namespace, pvc_name)
+        if usedby:
+            raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
+                                             f'mode {access_mode} is already '
+                                             f'in use by {usedby}.')
+
+
+def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Creates or registers a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_spec = _get_pvc_spec(namespace, config)
+    create_persistent_volume_claim(namespace, context, pvc_spec)
+    return config
+
+
+def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Deletes a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+    logger.info(f'Deleting PVC {pvc_name}')
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)
+    return config
+
+
+def _get_volume_usedby(context: Optional[str], namespace: str,
+                       pvc_name: str) -> List[str]:
+    """Gets the usedby resources of a volume."""
+    usedby = []
+    # Get all pods in the namespace
+    pods = kubernetes.core_api(context).list_namespaced_pod(namespace=namespace)
+    for pod in pods.items:
+        if pod.spec.volumes is not None:
+            for volume in pod.spec.volumes:
+                if volume.persistent_volume_claim is not None:
+                    if volume.persistent_volume_claim.claim_name == pvc_name:
+                        usedby.append(pod.metadata.name)
+    return usedby
+
+
+def get_volume_usedby(config: models.VolumeConfig) -> List[str]:
+    """Gets the usedby resources of a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+    return _get_volume_usedby(context, namespace, pvc_name)
+
+
+def create_persistent_volume_claim(namespace: str, context: Optional[str],
+                                   pvc_spec: Dict[str, Any]) -> None:
+    """Creates a persistent volume claim for SkyServe controller."""
+    pvc_name = pvc_spec['metadata']['name']
+    try:
+        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
+            name=pvc_name, namespace=namespace)
+        logger.debug(f'PVC {pvc_name} already exists')
+        return
+    except kubernetes.api_exception() as e:
+        if e.status != 404:  # Not found
+            raise
+    logger.info(f'Creating PVC {pvc_name}')
+    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
+        namespace=namespace, body=pvc_spec)
+
+
+def _get_pvc_spec(namespace: str,
+                  config: models.VolumeConfig) -> Dict[str, Any]:
+    """Gets the PVC spec for the given storage config."""
+    access_mode = config.config.get('access_mode')
+    size = config.size
+    # The previous code assumes that the access_mode and size are always set.
+    assert access_mode is not None
+    assert size is not None
+    pvc_spec: Dict[str, Any] = {
+        'metadata': {
+            'name': config.name_on_cloud,
+            'namespace': namespace,
+            'labels': {
+                'parent': 'skypilot',
+                'skypilot-name': config.name,
+            }
+        },
+        'spec': {
+            'accessModes': [access_mode],
+            'resources': {
+                'requests': {
+                    'storage': f'{size}Gi'
+                }
+            },
+        }
+    }
+    storage_class = config.config.get('storage_class_name')
+    if storage_class is not None:
+        pvc_spec['spec']['storageClassName'] = storage_class
+    return pvc_spec
sky/resources.py
CHANGED
@@ -30,6 +30,9 @@ from sky.utils import resources_utils
 from sky.utils import schemas
 from sky.utils import ux_utils

+if typing.TYPE_CHECKING:
+    from sky.volumes import volume as volume_lib
+
 logger = sky_logging.init_logger(__name__)

 _DEFAULT_DISK_SIZE_GB = 256
@@ -289,7 +292,8 @@ class Resources:
         self._job_recovery = job_recovery

         if disk_size is not None:
-            self._disk_size = int(disk_size)
+            self._disk_size = int(
+                resources_utils.parse_memory_resource(disk_size, 'disk_size'))
         else:
             self._disk_size = _DEFAULT_DISK_SIZE_GB
@@ -707,11 +711,11 @@ class Resources:
             self._memory = None
             return

-        memory = parse_memory_resource(str(memory),
-                                       'memory',
-                                       ret_type=float,
-                                       allow_plus=True,
-                                       allow_x=True)
+        memory = resources_utils.parse_memory_resource(str(memory),
+                                                       'memory',
+                                                       ret_type=float,
+                                                       allow_plus=True,
+                                                       allow_x=True)
         self._memory = memory
         if memory.endswith(('+', 'x')):
             # 'x' is used internally for make sure our resources used by
@@ -1465,11 +1469,15 @@ class Resources:
     def get_spot_str(self) -> str:
         return '[Spot]' if self.use_spot else ''

-    def make_deploy_variables(
-            self, cluster_name: resources_utils.ClusterName,
-            region: clouds.Region,
-            zones: Optional[List[clouds.Zone]],
-            num_nodes: int, dryrun: bool) -> Dict[str, Optional[str]]:
+    def make_deploy_variables(
+            self,
+            cluster_name: resources_utils.ClusterName,
+            region: clouds.Region,
+            zones: Optional[List[clouds.Zone]],
+            num_nodes: int,
+            dryrun: bool,
+            volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+    ) -> Dict[str, Optional[str]]:
         """Converts planned sky.Resources to resource variables.

         These variables are divided into two categories: cloud-specific and
@@ -1491,7 +1499,7 @@ class Resources:
         # Cloud specific variables
         assert self.cloud is not None, 'Cloud must be specified'
         cloud_specific_variables = self.cloud.make_deploy_resources_variables(
-            self, cluster_name, region, zones, num_nodes, dryrun)
+            self, cluster_name, region, zones, num_nodes, dryrun, volume_mounts)

         # TODO(andyl): Should we print some warnings if users' envs share
         # same names with the cloud specific variables, but not enabled
@@ -2291,67 +2299,3 @@ def parse_time_minutes(time: str) -> int:
             continue

     raise ValueError(f'Invalid time format: {time}')
-
-
-def parse_memory_resource(resource_qty_str: Union[str, int, float],
-                          field_name: str,
-                          ret_type: type = int,
-                          unit: str = 'g',
-                          allow_plus: bool = False,
-                          allow_x: bool = False,
-                          allow_rounding: bool = False) -> str:
-    """Returns memory size in chosen units given a resource quantity string.
-
-    Args:
-        resource_qty_str: Resource quantity string
-        unit: Unit to convert to
-        allow_plus: Whether to allow '+' prefix
-        allow_x: Whether to allow 'x' suffix
-    """
-    assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'
-
-    error_msg = f'"{field_name}" field should be a <int><b|k|m|g|t|p><+?>,'\
-                f' got {resource_qty_str}'
-
-    resource_str = str(resource_qty_str)
-
-    # Handle plus and x suffixes, x is only used internally for jobs controller
-    plus = ''
-    if resource_str.endswith('+'):
-        if allow_plus:
-            resource_str = resource_str[:-1]
-            plus = '+'
-        else:
-            raise ValueError(error_msg)
-
-    x = ''
-    if resource_str.endswith('x'):
-        if allow_x:
-            resource_str = resource_str[:-1]
-            x = 'x'
-        else:
-            raise ValueError(error_msg)
-
-    try:
-        # We assume it is already in the wanted units to maintain backwards
-        # compatibility
-        ret_type(resource_str)
-        return f'{resource_str}{plus}{x}'
-    except ValueError:
-        pass
-
-    resource_str = resource_str.lower()
-    for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
-        if resource_str.endswith(mem_unit):
-            try:
-                value = ret_type(resource_str[:-len(mem_unit)])
-                converted = (value * multiplier /
-                             constants.MEMORY_SIZE_UNITS[unit])
-                if not allow_rounding and ret_type(converted) != converted:
-                    raise ValueError(error_msg)
-                converted = ret_type(converted)
-                return f'{converted}{plus}{x}'
-            except ValueError:
-                continue
-
-    raise ValueError(error_msg)
sky/serve/client/sdk.py
CHANGED
@@ -5,9 +5,9 @@ from typing import List, Optional, Union

 import click

-from sky.adaptors import common as adaptors_common
 from sky.client import common as client_common
 from sky.server import common as server_common
+from sky.server import rest
 from sky.server.requests import payloads
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -17,12 +17,8 @@ from sky.utils import dag_utils
 if typing.TYPE_CHECKING:
     import io

-    import requests
-
     import sky
     from sky.serve import serve_utils
-else:
-    requests = adaptors_common.LazyImport('requests')


 @context.contextual
@@ -78,7 +74,7 @@ def up(
         task=dag_str,
         service_name=service_name,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/up',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -140,7 +136,7 @@ def update(
         mode=mode,
     )

-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/update',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -182,7 +178,7 @@ def down(
         all=all,
         purge=purge,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/down',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -217,7 +213,7 @@ def terminate_replica(service_name: str, replica_id: int,
         replica_id=replica_id,
         purge=purge,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/terminate-replica',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -290,7 +286,7 @@ def status(
         exceptions.ClusterNotUpError: if the sky serve controller is not up.
     """
     body = payloads.ServeStatusBody(service_names=service_names,)
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/status',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -301,6 +297,7 @@ def status(

 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
+@rest.retry_on_server_unavailable()
 def tail_logs(service_name: str,
               target: Union[str, 'serve_utils.ServiceComponent'],
               replica_id: Optional[int] = None,
@@ -376,7 +373,7 @@ def tail_logs(service_name: str,
         replica_id=replica_id,
         follow=follow,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/logs',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -384,7 +381,10 @@ def tail_logs(service_name: str,
         cookies=server_common.get_api_cookie_jar(),
     )
     request_id = server_common.get_request_id(response)
-    sdk.stream_response(request_id, response, output_stream)
+    return sdk.stream_response(request_id=request_id,
+                               response=response,
+                               output_stream=output_stream,
+                               resumable=True)


 @usage_lib.entrypoint
@@ -436,7 +436,7 @@ def sync_down_logs(service_name: str,
         targets=targets,
         replica_ids=replica_ids,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/sync-down-logs',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
sky/serve/server/core.py
CHANGED
@@ -28,6 +28,7 @@ from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
+from sky.utils import dag_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
@@ -139,10 +140,13 @@ def up(
             f'{constants.CLUSTER_NAME_VALID_REGEX}')

     serve_utils.validate_service_task(task)
+    dag = dag_utils.convert_entrypoint_to_dag(task)
+    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(task)
+    dag, mutated_user_config = admin_policy_utils.apply(dag)
+    dag.pre_mount_volumes()
     task = dag.tasks[0]

     with rich_utils.safe_status(
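Net effect of the up() change above: the entrypoint task is converted to a DAG so volume references can be resolved and validated before the admin policy mutates the DAG, and volumes are pre-mounted only after the policy has been applied. Condensed, per the hunk above:

    serve_utils.validate_service_task(task)
    dag = dag_utils.convert_entrypoint_to_dag(task)
    dag.resolve_and_validate_volumes()        # resolve volumes first
    dag, mutated_user_config = admin_policy_utils.apply(dag)
    dag.pre_mount_volumes()                   # mount after policy mutation
    task = dag.tasks[0]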
|