skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py CHANGED
```diff
@@ -3,7 +3,6 @@ import copy
 import json
 import time
 from typing import Any, Callable, Dict, List, Optional, Union
-import uuid
 
 from sky import exceptions
 from sky import sky_logging
@@ -15,6 +14,7 @@ from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.kubernetes import volume
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import config_utils
@@ -240,7 +240,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     extra_msg,
                     details=event_message))
         raise config_lib.KubernetesError(f'{timeout_err_msg} '
-                                         f'Pod status: {pod_status}'
+                                         f'Pod status: {pod_status} '
                                          f'Details: \'{event_message}\' ')
     raise config_lib.KubernetesError(f'{timeout_err_msg}')
 
@@ -673,21 +673,6 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
         raise e
 
 
-def _create_persistent_volume_claim(namespace: str, context: Optional[str],
-                                    pvc_spec: Dict[str, Any]) -> None:
-    """Creates a persistent volume claim for SkyServe controller."""
-    try:
-        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
-            name=pvc_spec['metadata']['name'], namespace=namespace)
-        return
-    except kubernetes.api_exception() as e:
-        if e.status != 404:  # Not found
-            raise
-
-    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
-        namespace=namespace, body=pvc_spec)
-
-
 @timeline.event
 def _wait_for_deployment_pod(context,
                              namespace,
```
```diff
@@ -832,9 +817,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
                 constants.WORKER_NODE_TAGS)
-            pod_uuid = str(uuid.uuid4())[:6]
-            pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
-            pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
+            pod_name = f'{cluster_name_on_cloud}-worker{i}'
+            if pod_name in running_pods:
+                # If the pod is already running, we skip creating it.
+                return
+            pod_spec_copy['metadata']['name'] = pod_name
+            pod_spec_copy['metadata']['labels']['component'] = pod_name
             # For multi-node support, we put a soft-constraint to schedule
             # worker pods on different nodes than the head pod.
             # This is not set as a hard constraint because if different nodes
```
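The hunk above replaces the random `uuid` suffix with a deterministic `-worker{i}` name, which makes worker pod creation idempotent: a re-run of provisioning can recognize an already-running worker by name and skip it, which the old random names could never do. A toy sketch of the effect (the cluster name and running-pod set are made up):

```python
cluster_name_on_cloud = 'mycluster'
running_pods = {'mycluster-worker1'}  # hypothetical: worker1 survived a retry

for i in range(1, 3):
    pod_name = f'{cluster_name_on_cloud}-worker{i}'
    if pod_name in running_pods:
        continue  # deterministic name lets us detect the existing pod
    print('would create', pod_name)  # prints only: would create mycluster-worker2
```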
```diff
@@ -888,7 +876,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     ]
 
     if to_create_deployment:
-        _create_persistent_volume_claim(namespace, context, pvc_spec)
+        volume.create_persistent_volume_claim(namespace, context, pvc_spec)
 
         # It's safe to directly modify the template spec in the deployment spec
         # because controller pod is singleton, i in [0].
@@ -910,6 +898,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 print('Deployment failed', e)
                 raise e
 
+        # Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
+        # is used by any pod in the namespace.
+        volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
+
         return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
                                                    context)
 
@@ -1012,40 +1004,6 @@ def stop_instances(
     raise NotImplementedError()
 
 
-def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
-                                    resource_name: str) -> None:
-    """Helper to delete Kubernetes resources with 404 handling and retries.
-
-    Args:
-        delete_func: Function to call to delete the resource
-        resource_type: Type of resource being deleted (e.g. 'service'),
-            used in logging
-        resource_name: Name of the resource being deleted, used in logging
-    """
-    max_retries = 3
-    retry_delay = 5  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            delete_func()
-            return
-        except kubernetes.api_exception() as e:
-            if e.status == 404:
-                logger.warning(
-                    f'terminate_instances: Tried to delete {resource_type} '
-                    f'{resource_name}, but the {resource_type} was not '
-                    'found (404).')
-                return
-            elif attempt < max_retries - 1:
-                logger.warning(f'terminate_instances: Failed to delete '
-                               f'{resource_type} {resource_name} (attempt '
-                               f'{attempt + 1}/{max_retries}). Error: {e}. '
-                               f'Retrying in {retry_delay} seconds...')
-                time.sleep(retry_delay)
-            else:
-                raise
-
-
 def _delete_services(name_prefix: str, namespace: str,
                      context: Optional[str]) -> None:
     """Delete services with the given name prefix.
@@ -1061,13 +1019,14 @@ def _delete_services(name_prefix: str, namespace: str,
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
         # pylint: disable=cell-var-from-loop
-        _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
-            context).delete_namespaced_service(name=service_name,
-                                               namespace=namespace,
-                                               _request_timeout=config_lib.
-                                               DELETION_TIMEOUT),
-                                        resource_type='service',
-                                        resource_name=service_name)
+        kubernetes_utils.delete_k8s_resource_with_retry(
+            delete_func=lambda: kubernetes.core_api(
+                context).delete_namespaced_service(name=service_name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)
 
 
 def _terminate_node(namespace: str,
@@ -1087,7 +1046,7 @@ def _terminate_node(namespace: str,
     # from within the pod, e.g., for autodown.
     # Note - some misbehaving pods may not terminate gracefully if they have
    # open file descriptors. We force delete pods to avoid this.
-    _delete_k8s_resource_with_retry(
+    kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
             name=pod_name,
             namespace=namespace,
@@ -1105,26 +1064,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,
 
     # Delete deployment
     deployment_name = _get_deployment_name(cluster_name)
-    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.apps_api(
-        context).delete_namespaced_deployment(name=deployment_name,
-                                              namespace=namespace,
-                                              _request_timeout=config_lib.
-                                              DELETION_TIMEOUT),
-                                    resource_type='deployment',
-                                    resource_name=deployment_name)
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.apps_api(
+            context).delete_namespaced_deployment(name=deployment_name,
+                                                  namespace=namespace,
+                                                  _request_timeout=config_lib.
+                                                  DELETION_TIMEOUT),
+        resource_type='deployment',
+        resource_name=deployment_name)
 
     # Delete PVCs
     pvc_name = _get_pvc_name(
         cluster_name,
         kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
     # pylint: disable=cell-var-from-loop
-    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
-        context).delete_namespaced_persistent_volume_claim(
-            name=pvc_name,
-            namespace=namespace,
-            _request_timeout=config_lib.DELETION_TIMEOUT),
-                                    resource_type='pvc',
-                                    resource_name=pvc_name)
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)
 
 
 def terminate_instances(
```
sky/provision/kubernetes/utils.py CHANGED
```diff
@@ -10,7 +10,7 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse
 
 import sky
@@ -2734,6 +2734,21 @@ def get_kubernetes_node_info(
                     node.metadata.labels.get(label_key))
                 break
 
+        # Extract IP address from node addresses (prefer external, fallback to internal)
+        node_ip = None
+        if node.status.addresses:
+            # First try to find external IP
+            for address in node.status.addresses:
+                if address.type == 'ExternalIP':
+                    node_ip = address.address
+                    break
+            # If no external IP, try to find internal IP
+            if node_ip is None:
+                for address in node.status.addresses:
+                    if address.type == 'InternalIP':
+                        node_ip = address.address
+                        break
+
         allocated_qty = 0
         accelerator_count = get_node_accelerator_count(node.status.allocatable)
 
@@ -2765,7 +2780,8 @@ def get_kubernetes_node_info(
             name=node.metadata.name,
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
-            free={'accelerators_available': int(accelerators_available)})
+            free={'accelerators_available': int(accelerators_available)},
+            ip_address=node_ip)
     hint = ''
     if has_multi_host_tpu:
         hint = ('(Note: Multi-host TPUs are detected and excluded from the '
@@ -3281,3 +3297,37 @@ def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
 
     format_kubeconfig_exec_auth(config, path)
     return path
+
+
+def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
+                                   resource_name: str) -> None:
+    """Helper to delete Kubernetes resources with 404 handling and retries.
+
+    Args:
+        delete_func: Function to call to delete the resource
+        resource_type: Type of resource being deleted (e.g. 'service'),
+            used in logging
+        resource_name: Name of the resource being deleted, used in logging
+    """
+    max_retries = 3
+    retry_delay = 5  # seconds
+
+    for attempt in range(max_retries):
+        try:
+            delete_func()
+            return
+        except kubernetes.api_exception() as e:
+            if e.status == 404:
+                logger.warning(
+                    f'terminate_instances: Tried to delete {resource_type} '
+                    f'{resource_name}, but the {resource_type} was not '
+                    'found (404).')
+                return
+            elif attempt < max_retries - 1:
+                logger.warning(f'terminate_instances: Failed to delete '
+                               f'{resource_type} {resource_name} (attempt '
+                               f'{attempt + 1}/{max_retries}). Error: {e}. '
+                               f'Retrying in {retry_delay} seconds...')
+                time.sleep(retry_delay)
+            else:
+                raise
```
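The helper above was moved verbatim from `instance.py` so it can be shared by both the instance provisioner and the new `volume.py` module. A minimal standalone sketch of the same 404-tolerant retry pattern, using a stand-in exception class (the real code catches `kubernetes.api_exception()` and logs via `logger.warning`):

```python
import time


class ApiException(Exception):
    """Stand-in for the Kubernetes client's ApiException."""

    def __init__(self, status: int):
        super().__init__(f'HTTP {status}')
        self.status = status


def delete_with_retry(delete_func, max_retries: int = 3,
                      retry_delay: int = 5) -> None:
    """Retries a delete call; a 404 means the resource is already gone."""
    for attempt in range(max_retries):
        try:
            delete_func()
            return
        except ApiException as e:
            if e.status == 404:
                return  # Already deleted: treat as success.
            if attempt < max_retries - 1:
                time.sleep(retry_delay)
            else:
                raise


def _already_gone():
    raise ApiException(404)


delete_with_retry(_already_gone)  # Returns silently: 404 is not an error here.
```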
sky/provision/kubernetes/volume.py ADDED
```diff
@@ -0,0 +1,147 @@
+"""Kubernetes pvc provisioning."""
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import models
+from sky import sky_logging
+from sky.adaptors import kubernetes
+from sky.provision.kubernetes import config as config_lib
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.volumes import volume as volume_lib
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _get_context_namespace(config: models.VolumeConfig) -> Tuple[str, str]:
+    """Gets the context and namespace of a volume."""
+    if config.region is None:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        config.region = context
+    else:
+        context = config.region
+    namespace = config.config.get('namespace')
+    if namespace is None:
+        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+        config.config['namespace'] = namespace
+    return context, namespace
+
+
+def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
+                            pod_spec: Dict[str, Any]) -> None:
+    """Checks if the PVC is used by any pod in the namespace."""
+    volumes = pod_spec.get('spec', {}).get('volumes', [])
+    if not volumes:
+        return
+    once_modes = [
+        volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value,
+        volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value
+    ]
+    for volume in volumes:
+        pvc_name = volume.get('persistentVolumeClaim', {}).get('claimName')
+        if not pvc_name:
+            continue
+        pvc = kubernetes.core_api(
+            context).read_namespaced_persistent_volume_claim(
+                name=pvc_name, namespace=namespace)
+        access_mode = pvc.spec.access_modes[0]
+        if access_mode not in once_modes:
+            continue
+        usedby = _get_volume_usedby(context, namespace, pvc_name)
+        if usedby:
+            raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
+                                             f'mode {access_mode} is already '
+                                             f'in use by {usedby}.')
+
+
+def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Creates or registers a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_spec = _get_pvc_spec(namespace, config)
+    create_persistent_volume_claim(namespace, context, pvc_spec)
+    return config
+
+
+def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Deletes a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+    logger.info(f'Deleting PVC {pvc_name}')
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)
+    return config
+
+
+def _get_volume_usedby(context: Optional[str], namespace: str,
+                       pvc_name: str) -> List[str]:
+    """Gets the usedby resources of a volume."""
+    usedby = []
+    # Get all pods in the namespace
+    pods = kubernetes.core_api(context).list_namespaced_pod(namespace=namespace)
+    for pod in pods.items:
+        if pod.spec.volumes is not None:
+            for volume in pod.spec.volumes:
+                if volume.persistent_volume_claim is not None:
+                    if volume.persistent_volume_claim.claim_name == pvc_name:
+                        usedby.append(pod.metadata.name)
+    return usedby
+
+
+def get_volume_usedby(config: models.VolumeConfig) -> List[str]:
+    """Gets the usedby resources of a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+    return _get_volume_usedby(context, namespace, pvc_name)
+
+
+def create_persistent_volume_claim(namespace: str, context: Optional[str],
+                                   pvc_spec: Dict[str, Any]) -> None:
+    """Creates a persistent volume claim for SkyServe controller."""
+    pvc_name = pvc_spec['metadata']['name']
+    try:
+        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
+            name=pvc_name, namespace=namespace)
+        logger.debug(f'PVC {pvc_name} already exists')
+        return
+    except kubernetes.api_exception() as e:
+        if e.status != 404:  # Not found
+            raise
+    logger.info(f'Creating PVC {pvc_name}')
+    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
+        namespace=namespace, body=pvc_spec)
+
+
+def _get_pvc_spec(namespace: str,
+                  config: models.VolumeConfig) -> Dict[str, Any]:
+    """Gets the PVC spec for the given storage config."""
+    access_mode = config.config.get('access_mode')
+    size = config.size
+    # The previous code assumes that the access_mode and size are always set.
+    assert access_mode is not None
+    assert size is not None
+    pvc_spec: Dict[str, Any] = {
+        'metadata': {
+            'name': config.name_on_cloud,
+            'namespace': namespace,
+            'labels': {
+                'parent': 'skypilot',
+                'skypilot-name': config.name,
+            }
+        },
+        'spec': {
+            'accessModes': [access_mode],
+            'resources': {
+                'requests': {
+                    'storage': f'{size}Gi'
+                }
+            },
+        }
+    }
+    storage_class = config.config.get('storage_class_name')
+    if storage_class is not None:
+        pvc_spec['spec']['storageClassName'] = storage_class
+    return pvc_spec
```
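For illustration, here is the shape of the PVC spec that `_get_pvc_spec` produces for a hypothetical 100 GiB `ReadWriteOnce` volume named `data` in namespace `default` (the `name_on_cloud` value is made up; `storageClassName` only appears when configured):

```python
pvc_spec = {
    'metadata': {
        'name': 'data-2ea4',          # config.name_on_cloud (hypothetical)
        'namespace': 'default',
        'labels': {
            'parent': 'skypilot',
            'skypilot-name': 'data',  # config.name
        }
    },
    'spec': {
        'accessModes': ['ReadWriteOnce'],
        'resources': {'requests': {'storage': '100Gi'}},
    }
}
```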
sky/resources.py CHANGED
```diff
@@ -30,6 +30,9 @@ from sky.utils import resources_utils
 from sky.utils import schemas
 from sky.utils import ux_utils
 
+if typing.TYPE_CHECKING:
+    from sky.volumes import volume as volume_lib
+
 logger = sky_logging.init_logger(__name__)
 
 _DEFAULT_DISK_SIZE_GB = 256
@@ -289,7 +292,8 @@ class Resources:
         self._job_recovery = job_recovery
 
         if disk_size is not None:
-            self._disk_size = int(parse_memory_resource(disk_size, 'disk_size'))
+            self._disk_size = int(
+                resources_utils.parse_memory_resource(disk_size, 'disk_size'))
         else:
             self._disk_size = _DEFAULT_DISK_SIZE_GB
 
@@ -707,11 +711,11 @@ class Resources:
             self._memory = None
             return
 
-        memory = parse_memory_resource(str(memory),
-                                       'memory',
-                                       ret_type=float,
-                                       allow_plus=True,
-                                       allow_x=True)
+        memory = resources_utils.parse_memory_resource(str(memory),
+                                                       'memory',
+                                                       ret_type=float,
+                                                       allow_plus=True,
+                                                       allow_x=True)
         self._memory = memory
         if memory.endswith(('+', 'x')):
             # 'x' is used internally for make sure our resources used by
@@ -1465,11 +1469,15 @@ class Resources:
     def get_spot_str(self) -> str:
         return '[Spot]' if self.use_spot else ''
 
-    def make_deploy_variables(self, cluster_name: resources_utils.ClusterName,
-                              region: clouds.Region,
-                              zones: Optional[List[clouds.Zone]],
-                              num_nodes: int,
-                              dryrun: bool) -> Dict[str, Optional[str]]:
+    def make_deploy_variables(
+        self,
+        cluster_name: resources_utils.ClusterName,
+        region: clouds.Region,
+        zones: Optional[List[clouds.Zone]],
+        num_nodes: int,
+        dryrun: bool,
+        volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+    ) -> Dict[str, Optional[str]]:
         """Converts planned sky.Resources to resource variables.
 
         These variables are divided into two categories: cloud-specific and
@@ -1491,7 +1499,7 @@ class Resources:
         # Cloud specific variables
         assert self.cloud is not None, 'Cloud must be specified'
         cloud_specific_variables = self.cloud.make_deploy_resources_variables(
-            self, cluster_name, region, zones, num_nodes, dryrun)
+            self, cluster_name, region, zones, num_nodes, dryrun, volume_mounts)
 
         # TODO(andyl): Should we print some warnings if users' envs share
         # same names with the cloud specific variables, but not enabled
```
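Since `volume_mounts` defaults to `None`, existing call sites keep working unchanged, and only volume-aware callers need to thread it through. A hedged sketch of the extended call (the surrounding objects are assumed to come from the normal provisioning flow, and the `volume_mounts` list here is hypothetical):

```python
deploy_vars = resources.make_deploy_variables(
    cluster_name=cluster_name,    # resources_utils.ClusterName
    region=region,                # clouds.Region
    zones=zones,                  # Optional[List[clouds.Zone]]
    num_nodes=1,
    dryrun=False,
    volume_mounts=volume_mounts,  # new optional parameter in this release
)
```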
```diff
@@ -2291,67 +2299,3 @@ def parse_time_minutes(time: str) -> int:
             continue
 
     raise ValueError(f'Invalid time format: {time}')
-
-
-def parse_memory_resource(resource_qty_str: Union[str, int, float],
-                          field_name: str,
-                          ret_type: type = int,
-                          unit: str = 'g',
-                          allow_plus: bool = False,
-                          allow_x: bool = False,
-                          allow_rounding: bool = False) -> str:
-    """Returns memory size in chosen units given a resource quantity string.
-
-    Args:
-        resource_qty_str: Resource quantity string
-        unit: Unit to convert to
-        allow_plus: Whether to allow '+' prefix
-        allow_x: Whether to allow 'x' suffix
-    """
-    assert unit in constants.MEMORY_SIZE_UNITS, f'Invalid unit: {unit}'
-
-    error_msg = f'"{field_name}" field should be a <int><b|k|m|g|t|p><+?>,'\
-                f' got {resource_qty_str}'
-
-    resource_str = str(resource_qty_str)
-
-    # Handle plus and x suffixes, x is only used internally for jobs controller
-    plus = ''
-    if resource_str.endswith('+'):
-        if allow_plus:
-            resource_str = resource_str[:-1]
-            plus = '+'
-        else:
-            raise ValueError(error_msg)
-
-    x = ''
-    if resource_str.endswith('x'):
-        if allow_x:
-            resource_str = resource_str[:-1]
-            x = 'x'
-        else:
-            raise ValueError(error_msg)
-
-    try:
-        # We assume it is already in the wanted units to maintain backwards
-        # compatibility
-        ret_type(resource_str)
-        return f'{resource_str}{plus}{x}'
-    except ValueError:
-        pass
-
-    resource_str = resource_str.lower()
-    for mem_unit, multiplier in constants.MEMORY_SIZE_UNITS.items():
-        if resource_str.endswith(mem_unit):
-            try:
-                value = ret_type(resource_str[:-len(mem_unit)])
-                converted = (value * multiplier /
-                             constants.MEMORY_SIZE_UNITS[unit])
-                if not allow_rounding and ret_type(converted) != converted:
-                    raise ValueError(error_msg)
-                converted = ret_type(converted)
-                return f'{converted}{plus}{x}'
-            except ValueError:
-                continue
-
-    raise ValueError(error_msg)
```
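`parse_memory_resource` is unchanged by the move from `sky/resources.py` into `sky/utils/resources_utils.py` (compare the removal above with the +66 lines in `sky/utils/resources_utils.py` in the file list). Based on the implementation shown, its behavior looks like this (a sketch, assuming the function is importable from an installed wheel):

```python
from sky.utils import resources_utils

# A bare number is assumed to already be in the target unit (GB by default).
resources_utils.parse_memory_resource('16', 'memory')   # -> '16'
# Unit suffixes are converted to the target unit.
resources_utils.parse_memory_resource('32g', 'memory')  # -> '32'
# '+' (meaning "at least") must be explicitly allowed.
resources_utils.parse_memory_resource('16+', 'memory', allow_plus=True)  # -> '16+'
```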
sky/serve/client/sdk.py CHANGED
```diff
@@ -5,9 +5,9 @@ from typing import List, Optional, Union
 
 import click
 
-from sky.adaptors import common as adaptors_common
 from sky.client import common as client_common
 from sky.server import common as server_common
+from sky.server import rest
 from sky.server.requests import payloads
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -17,12 +17,8 @@ from sky.utils import dag_utils
 if typing.TYPE_CHECKING:
     import io
 
-    import requests
-
     import sky
     from sky.serve import serve_utils
-else:
-    requests = adaptors_common.LazyImport('requests')
 
 
 @context.contextual
@@ -78,7 +74,7 @@ def up(
         task=dag_str,
         service_name=service_name,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/up',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -140,7 +136,7 @@ def update(
         mode=mode,
     )
 
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/update',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -182,7 +178,7 @@ def down(
         all=all,
         purge=purge,
    )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/down',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -217,7 +213,7 @@ def terminate_replica(service_name: str, replica_id: int,
         replica_id=replica_id,
         purge=purge,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/terminate-replica',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -290,7 +286,7 @@ def status(
         exceptions.ClusterNotUpError: if the sky serve controller is not up.
     """
     body = payloads.ServeStatusBody(service_names=service_names,)
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/status',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -301,6 +297,7 @@ def status(
 
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
+@rest.retry_on_server_unavailable()
 def tail_logs(service_name: str,
               target: Union[str, 'serve_utils.ServiceComponent'],
               replica_id: Optional[int] = None,
@@ -376,7 +373,7 @@ def tail_logs(service_name: str,
         replica_id=replica_id,
         follow=follow,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/logs',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -384,7 +381,10 @@ def tail_logs(service_name: str,
         cookies=server_common.get_api_cookie_jar(),
     )
     request_id = server_common.get_request_id(response)
-    sdk.stream_response(request_id, response, output_stream)
+    return sdk.stream_response(request_id=request_id,
+                               response=response,
+                               output_stream=output_stream,
+                               resumable=True)
 
 
 @usage_lib.entrypoint
@@ -436,7 +436,7 @@ def sync_down_logs(service_name: str,
         targets=targets,
         replica_ids=replica_ids,
    )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/serve/sync-down-logs',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
```
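The new `sky/server/rest.py` module (+152 lines) is not shown in this excerpt; the call sites above only reveal that `rest.post` is a drop-in replacement for `requests.post` and that `rest.retry_on_server_unavailable()` is a decorator. A minimal sketch of that pattern, where everything except those two names is an assumption:

```python
import functools
import time

import requests


class ServerUnavailableError(Exception):
    """Hypothetical: raised when the API server is temporarily down."""


def post(url, **kwargs):
    """Hypothetical requests.post wrapper that flags retryable 503s."""
    response = requests.post(url, **kwargs)
    if response.status_code == 503:
        raise ServerUnavailableError(url)
    return response


def retry_on_server_unavailable(max_wait: float = 300.0):
    """Hypothetical decorator: retries the wrapped SDK call while the
    server is unavailable (e.g., during a restart or rolling upgrade)."""

    def decorator(func):

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            deadline, delay = time.time() + max_wait, 1.0
            while True:
                try:
                    return func(*args, **kwargs)
                except ServerUnavailableError:
                    if time.time() >= deadline:
                        raise
                    time.sleep(delay)
                    delay = min(delay * 2, 30.0)

        return wrapper

    return decorator
```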
sky/serve/server/core.py CHANGED
```diff
@@ -28,6 +28,7 @@ from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
+from sky.utils import dag_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
@@ -139,10 +140,13 @@ def up(
             f'{constants.CLUSTER_NAME_VALID_REGEX}')
 
     serve_utils.validate_service_task(task)
+    dag = dag_utils.convert_entrypoint_to_dag(task)
+    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
-    dag, mutated_user_config = admin_policy_utils.apply(task)
+    dag, mutated_user_config = admin_policy_utils.apply(dag)
+    dag.pre_mount_volumes()
     task = dag.tasks[0]
 
     with rich_utils.safe_status(
```