skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +1 -6
  3. sky/backends/backend_utils.py +26 -11
  4. sky/backends/cloud_vm_ray_backend.py +16 -5
  5. sky/client/cli/command.py +232 -9
  6. sky/client/sdk.py +195 -91
  7. sky/clouds/aws.py +10 -7
  8. sky/clouds/azure.py +10 -7
  9. sky/clouds/cloud.py +2 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +10 -7
  12. sky/clouds/fluidstack.py +2 -0
  13. sky/clouds/gcp.py +10 -7
  14. sky/clouds/hyperbolic.py +10 -7
  15. sky/clouds/ibm.py +2 -0
  16. sky/clouds/kubernetes.py +26 -9
  17. sky/clouds/lambda_cloud.py +10 -7
  18. sky/clouds/nebius.py +10 -7
  19. sky/clouds/oci.py +10 -7
  20. sky/clouds/paperspace.py +10 -7
  21. sky/clouds/runpod.py +10 -7
  22. sky/clouds/scp.py +10 -7
  23. sky/clouds/ssh.py +36 -0
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +21 -0
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
  39. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  40. sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
  43. sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
  62. sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
  63. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  64. sky/dashboard/out/clusters/[cluster].html +1 -1
  65. sky/dashboard/out/clusters.html +1 -1
  66. sky/dashboard/out/config.html +1 -1
  67. sky/dashboard/out/index.html +1 -1
  68. sky/dashboard/out/infra/[context].html +1 -1
  69. sky/dashboard/out/infra.html +1 -1
  70. sky/dashboard/out/jobs/[job].html +1 -1
  71. sky/dashboard/out/jobs.html +1 -1
  72. sky/dashboard/out/users.html +1 -1
  73. sky/dashboard/out/volumes.html +1 -0
  74. sky/dashboard/out/workspace/new.html +1 -1
  75. sky/dashboard/out/workspaces/[name].html +1 -1
  76. sky/dashboard/out/workspaces.html +1 -1
  77. sky/data/storage_utils.py +2 -4
  78. sky/exceptions.py +15 -0
  79. sky/execution.py +5 -0
  80. sky/global_user_state.py +129 -0
  81. sky/jobs/client/sdk.py +13 -11
  82. sky/jobs/server/core.py +4 -0
  83. sky/models.py +16 -0
  84. sky/provision/__init__.py +26 -0
  85. sky/provision/kubernetes/__init__.py +3 -0
  86. sky/provision/kubernetes/instance.py +38 -77
  87. sky/provision/kubernetes/utils.py +70 -4
  88. sky/provision/kubernetes/volume.py +147 -0
  89. sky/resources.py +20 -76
  90. sky/serve/client/sdk.py +13 -13
  91. sky/serve/server/core.py +5 -1
  92. sky/server/common.py +40 -5
  93. sky/server/constants.py +5 -1
  94. sky/server/metrics.py +105 -0
  95. sky/server/requests/executor.py +30 -14
  96. sky/server/requests/payloads.py +16 -0
  97. sky/server/requests/requests.py +35 -1
  98. sky/server/rest.py +153 -0
  99. sky/server/server.py +70 -43
  100. sky/server/state.py +20 -0
  101. sky/server/stream_utils.py +8 -3
  102. sky/server/uvicorn.py +153 -13
  103. sky/setup_files/dependencies.py +2 -0
  104. sky/skylet/constants.py +19 -3
  105. sky/skypilot_config.py +3 -0
  106. sky/ssh_node_pools/__init__.py +1 -0
  107. sky/ssh_node_pools/core.py +133 -0
  108. sky/ssh_node_pools/server.py +232 -0
  109. sky/task.py +141 -18
  110. sky/templates/kubernetes-ray.yml.j2 +30 -1
  111. sky/users/permission.py +2 -0
  112. sky/utils/context.py +3 -1
  113. sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
  114. sky/utils/kubernetes/ssh_utils.py +221 -0
  115. sky/utils/resources_utils.py +66 -0
  116. sky/utils/rich_utils.py +6 -0
  117. sky/utils/schemas.py +146 -3
  118. sky/utils/status_lib.py +10 -0
  119. sky/utils/validator.py +11 -1
  120. sky/volumes/__init__.py +0 -0
  121. sky/volumes/client/__init__.py +0 -0
  122. sky/volumes/client/sdk.py +64 -0
  123. sky/volumes/server/__init__.py +0 -0
  124. sky/volumes/server/core.py +199 -0
  125. sky/volumes/server/server.py +85 -0
  126. sky/volumes/utils.py +158 -0
  127. sky/volumes/volume.py +198 -0
  128. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
  130. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  139. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
  141. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  145. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  156. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  157. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  158. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
  159. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  160. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
  161. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
  162. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
  163. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py CHANGED
@@ -7,10 +7,10 @@ import webbrowser
 import click

 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.client import common as client_common
 from sky.client import sdk
 from sky.server import common as server_common
+from sky.server import rest
 from sky.server.requests import payloads
 from sky.skylet import constants
 from sky.usage import usage_lib
@@ -22,11 +22,7 @@ from sky.utils import dag_utils
 if typing.TYPE_CHECKING:
     import io

-    import requests
-
     import sky
-else:
-    requests = adaptors_common.LazyImport('requests')

 logger = sky_logging.init_logger(__name__)

@@ -86,7 +82,7 @@ def launch(
         task=dag_str,
         name=name,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/launch',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -146,7 +142,7 @@ def queue(refresh: bool,
         all_users=all_users,
         job_ids=job_ids,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/queue',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -186,7 +182,7 @@ def cancel(
         all=all,
         all_users=all_users,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/cancel',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -197,6 +193,7 @@ def cancel(

 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
+@rest.retry_on_server_unavailable()
 def tail_logs(name: Optional[str] = None,
               job_id: Optional[int] = None,
               follow: bool = True,
@@ -236,7 +233,7 @@ def tail_logs(name: Optional[str] = None,
         refresh=refresh,
         tail=tail,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/logs',
         json=json.loads(body.model_dump_json()),
         stream=True,
@@ -244,7 +241,12 @@ def tail_logs(name: Optional[str] = None,
         cookies=server_common.get_api_cookie_jar(),
     )
     request_id = server_common.get_request_id(response)
-    return sdk.stream_response(request_id, response, output_stream)
+    # Log request is idempotent when tail is 0, thus can resume previous
+    # streaming point on retry.
+    return sdk.stream_response(request_id=request_id,
+                               response=response,
+                               output_stream=output_stream,
+                               resumable=(tail == 0))


 @usage_lib.entrypoint
@@ -281,7 +283,7 @@ def download_logs(
         controller=controller,
         local_dir=local_dir,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/download_logs',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
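
The client-side change above swaps `requests.post` for `sky.server.rest.post`, which keeps the same call shape (URL, `json=`, `timeout=`, `cookies=`, `stream=`) while adding retry handling for a temporarily unavailable API server (the new `sky/server/rest.py`, +153 lines). A hedged sketch of the new call pattern; the payload fields here are illustrative only, since the real code serializes a pydantic body from `sky.server.requests.payloads`:

```python
from sky.server import common as server_common
from sky.server import rest

# Illustrative request body; the SDK builds this via body.model_dump_json().
payload = {'refresh': False, 'all_users': False, 'job_ids': None}

response = rest.post(
    f'{server_common.get_server_url()}/jobs/queue',
    json=payload,
    timeout=(5, None),  # 5s connect timeout, no read timeout
    cookies=server_common.get_api_cookie_jar(),
)
request_id = server_common.get_request_id(response)
```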
sky/jobs/server/core.py CHANGED
@@ -145,6 +145,7 @@ def launch(
     entrypoint = task
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -154,6 +155,9 @@ def launch(
         raise ValueError('Only single-task or chain DAG is '
                          f'allowed for job_launch. Dag: {dag}')
     dag.validate()
+    # TODO(aylei): use consolidated job controller instead of performing
+    # pre-mount operations when submitting jobs.
+    dag.pre_mount_volumes()

     user_dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)

sky/models.py CHANGED
@@ -6,6 +6,8 @@ import getpass
 import os
 from typing import Any, Dict, Optional

+import pydantic
+
 from sky.skylet import constants
 from sky.utils import common_utils

@@ -48,6 +50,8 @@ class KubernetesNodeInfo:
     # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
     total: Dict[str, int]
     free: Dict[str, int]
+    # IP address of the node (external IP preferred, fallback to internal IP)
+    ip_address: Optional[str] = None


 @dataclasses.dataclass
@@ -76,3 +80,15 @@ class KubernetesNodesInfo:
             },
             hint=data['hint'],
         )
+
+
+class VolumeConfig(pydantic.BaseModel):
+    """Configuration for creating a volume."""
+    name: str
+    type: str
+    cloud: str
+    region: Optional[str]
+    zone: Optional[str]
+    name_on_cloud: str
+    size: Optional[str]
+    config: Dict[str, Any] = {}
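
`VolumeConfig` is a plain pydantic model, so it can be constructed directly. A hedged sketch (field values are illustrative only; `size` is kept as a string because the Kubernetes provisioner later formats it as `f'{size}Gi'`):

```python
from sky import models

vol = models.VolumeConfig(
    name='my-data',                   # user-facing volume name
    type='k8s-pvc',                   # hypothetical type string
    cloud='kubernetes',
    region=None,                      # filled in later with the kube context
    zone=None,
    name_on_cloud='my-data-abc123',   # hypothetical name used for the PVC
    size='100',                       # GiB, as a string
    config={'namespace': 'default', 'access_mode': 'ReadWriteOnce'},
)
print(vol.model_dump_json())
```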
sky/provision/__init__.py CHANGED
@@ -8,6 +8,7 @@ import inspect
 import typing
 from typing import Any, Dict, List, Optional, Type

+from sky import models
 from sky import sky_logging
 # These provision.<cloud> modules should never fail even if underlying cloud SDK
 # dependencies are not installed. This is ensured by using sky.adaptors inside
@@ -103,6 +104,31 @@ def bootstrap_instances(
     raise NotImplementedError


+@_route_to_cloud_impl
+def apply_volume(provider_name: str,
+                 config: models.VolumeConfig) -> models.VolumeConfig:
+    """Create or register a volume.
+
+    This function creates or registers a volume with the provided configuration,
+    and returns a VolumeConfig object with updated configuration.
+    """
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def delete_volume(provider_name: str,
+                  config: models.VolumeConfig) -> models.VolumeConfig:
+    """Delete a volume."""
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def get_volume_usedby(provider_name: str,
+                      config: models.VolumeConfig) -> List[str]:
+    """Get the usedby of a volume."""
+    raise NotImplementedError
+
+
 @_route_to_cloud_impl
 def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
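
Like the other entry points in `sky/provision/__init__.py`, the new volume functions are dispatched by `_route_to_cloud_impl` on `provider_name`, so calling them with `'kubernetes'` should resolve to `sky.provision.kubernetes.volume`. A hedged sketch of the expected call flow (the `VolumeConfig` field values are hypothetical):

```python
from sky import models
from sky import provision

config = models.VolumeConfig(
    name='my-data', type='k8s-pvc', cloud='kubernetes',
    region=None, zone=None, name_on_cloud='my-data-abc123',
    size='100', config={'access_mode': 'ReadWriteOnce'})

# Create (or register) the volume; the cloud implementation returns the
# config with fields such as region/namespace filled in.
config = provision.apply_volume('kubernetes', config)

# List pods currently using the volume, then delete it if unused.
users = provision.get_volume_usedby('kubernetes', config)
if not users:
    provision.delete_volume('kubernetes', config)
```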
sky/provision/kubernetes/__init__.py CHANGED
@@ -11,3 +11,6 @@ from sky.provision.kubernetes.instance import wait_instances
 from sky.provision.kubernetes.network import cleanup_ports
 from sky.provision.kubernetes.network import open_ports
 from sky.provision.kubernetes.network import query_ports
+from sky.provision.kubernetes.volume import apply_volume
+from sky.provision.kubernetes.volume import delete_volume
+from sky.provision.kubernetes.volume import get_volume_usedby
sky/provision/kubernetes/instance.py CHANGED
@@ -3,7 +3,6 @@ import copy
 import json
 import time
 from typing import Any, Callable, Dict, List, Optional, Union
-import uuid

 from sky import exceptions
 from sky import sky_logging
@@ -15,6 +14,7 @@ from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.kubernetes import volume
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import config_utils
@@ -240,7 +240,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     extra_msg,
                     details=event_message))
         raise config_lib.KubernetesError(f'{timeout_err_msg} '
-                                         f'Pod status: {pod_status}'
+                                         f'Pod status: {pod_status} '
                                          f'Details: \'{event_message}\' ')
     raise config_lib.KubernetesError(f'{timeout_err_msg}')

@@ -673,21 +673,6 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
             raise e


-def _create_persistent_volume_claim(namespace: str, context: Optional[str],
-                                    pvc_spec: Dict[str, Any]) -> None:
-    """Creates a persistent volume claim for SkyServe controller."""
-    try:
-        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
-            name=pvc_spec['metadata']['name'], namespace=namespace)
-        return
-    except kubernetes.api_exception() as e:
-        if e.status != 404:  # Not found
-            raise
-
-    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
-        namespace=namespace, body=pvc_spec)
-
-
 @timeline.event
 def _wait_for_deployment_pod(context,
                              namespace,
@@ -832,9 +817,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
                 constants.WORKER_NODE_TAGS)
-            pod_uuid = str(uuid.uuid4())[:6]
-            pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
-            pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
+            pod_name = f'{cluster_name_on_cloud}-worker{i}'
+            if pod_name in running_pods:
+                # If the pod is already running, we skip creating it.
+                return
+            pod_spec_copy['metadata']['name'] = pod_name
+            pod_spec_copy['metadata']['labels']['component'] = pod_name
             # For multi-node support, we put a soft-constraint to schedule
             # worker pods on different nodes than the head pod.
             # This is not set as a hard constraint because if different nodes
@@ -888,7 +876,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     ]

     if to_create_deployment:
-        _create_persistent_volume_claim(namespace, context, pvc_spec)
+        volume.create_persistent_volume_claim(namespace, context, pvc_spec)

         # It's safe to directly modify the template spec in the deployment spec
         # because controller pod is singleton, i in [0].
@@ -910,6 +898,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             print('Deployment failed', e)
             raise e

+    # Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
+    # is used by any pod in the namespace.
+    volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
+
     return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
                                                context)

@@ -1012,40 +1004,6 @@ def stop_instances(
     raise NotImplementedError()


-def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
-                                    resource_name: str) -> None:
-    """Helper to delete Kubernetes resources with 404 handling and retries.
-
-    Args:
-        delete_func: Function to call to delete the resource
-        resource_type: Type of resource being deleted (e.g. 'service'),
-            used in logging
-        resource_name: Name of the resource being deleted, used in logging
-    """
-    max_retries = 3
-    retry_delay = 5  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            delete_func()
-            return
-        except kubernetes.api_exception() as e:
-            if e.status == 404:
-                logger.warning(
-                    f'terminate_instances: Tried to delete {resource_type} '
-                    f'{resource_name}, but the {resource_type} was not '
-                    'found (404).')
-                return
-            elif attempt < max_retries - 1:
-                logger.warning(f'terminate_instances: Failed to delete '
-                               f'{resource_type} {resource_name} (attempt '
-                               f'{attempt + 1}/{max_retries}). Error: {e}. '
-                               f'Retrying in {retry_delay} seconds...')
-                time.sleep(retry_delay)
-            else:
-                raise
-
-
 def _delete_services(name_prefix: str, namespace: str,
                      context: Optional[str]) -> None:
     """Delete services with the given name prefix.
@@ -1061,13 +1019,14 @@ def _delete_services(name_prefix: str, namespace: str,
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
         # pylint: disable=cell-var-from-loop
-        _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
-            context).delete_namespaced_service(name=service_name,
-                                               namespace=namespace,
-                                               _request_timeout=config_lib.
-                                               DELETION_TIMEOUT),
-                                        resource_type='service',
-                                        resource_name=service_name)
+        kubernetes_utils.delete_k8s_resource_with_retry(
+            delete_func=lambda: kubernetes.core_api(
+                context).delete_namespaced_service(name=service_name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)


 def _terminate_node(namespace: str,
@@ -1087,7 +1046,7 @@ def _terminate_node(namespace: str,
     # from within the pod, e.g., for autodown.
     # Note - some misbehaving pods may not terminate gracefully if they have
     # open file descriptors. We force delete pods to avoid this.
-    _delete_k8s_resource_with_retry(
+    kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
             name=pod_name,
             namespace=namespace,
@@ -1105,26 +1064,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,

     # Delete deployment
     deployment_name = _get_deployment_name(cluster_name)
-    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.apps_api(
-        context).delete_namespaced_deployment(name=deployment_name,
-                                              namespace=namespace,
-                                              _request_timeout=config_lib.
-                                              DELETION_TIMEOUT),
-                                    resource_type='deployment',
-                                    resource_name=deployment_name)
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.apps_api(
+            context).delete_namespaced_deployment(name=deployment_name,
+                                                  namespace=namespace,
+                                                  _request_timeout=config_lib.
+                                                  DELETION_TIMEOUT),
+        resource_type='deployment',
+        resource_name=deployment_name)

     # Delete PVCs
     pvc_name = _get_pvc_name(
         cluster_name,
         kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
     # pylint: disable=cell-var-from-loop
-    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
-        context).delete_namespaced_persistent_volume_claim(
-            name=pvc_name,
-            namespace=namespace,
-            _request_timeout=config_lib.DELETION_TIMEOUT),
-                                    resource_type='pvc',
-                                    resource_name=pvc_name)
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)


 def terminate_instances(
sky/provision/kubernetes/utils.py CHANGED
@@ -10,7 +10,7 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse

 import sky
@@ -1625,9 +1625,15 @@ def _get_kubeconfig_text_for_context(context: Optional[str] = None) -> str:
     command = 'kubectl config view --minify'
     if context is not None:
         command += f' --context={context}'
+
+    # Ensure subprocess inherits the current environment properly
+    # This fixes the issue where kubectl can't find ~/.kube/config in API server context
+    env = os.environ.copy()
+
     proc = subprocess.run(command,
                           shell=True,
                           check=False,
+                          env=env,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
     if proc.returncode != 0:
@@ -2734,6 +2740,21 @@ def get_kubernetes_node_info(
                     node.metadata.labels.get(label_key))
                 break

+        # Extract IP address from node addresses (prefer external, fallback to internal)
+        node_ip = None
+        if node.status.addresses:
+            # First try to find external IP
+            for address in node.status.addresses:
+                if address.type == 'ExternalIP':
+                    node_ip = address.address
+                    break
+            # If no external IP, try to find internal IP
+            if node_ip is None:
+                for address in node.status.addresses:
+                    if address.type == 'InternalIP':
+                        node_ip = address.address
+                        break
+
         allocated_qty = 0
         accelerator_count = get_node_accelerator_count(node.status.allocatable)

@@ -2765,7 +2786,8 @@ def get_kubernetes_node_info(
             name=node.metadata.name,
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
-            free={'accelerators_available': int(accelerators_available)})
+            free={'accelerators_available': int(accelerators_available)},
+            ip_address=node_ip)
         hint = ''
         if has_multi_host_tpu:
             hint = ('(Note: Multi-host TPUs are detected and excluded from the '
@@ -3279,5 +3301,49 @@ def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
     if os.path.isfile(path):
         return path

-    format_kubeconfig_exec_auth(config, path)
-    return path
+    try:
+        format_kubeconfig_exec_auth(config, path)
+        return path
+    except Exception as e:  # pylint: disable=broad-except
+        # There may be problems with kubeconfig, but the user is not actually
+        # using Kubernetes (or SSH Node Pools)
+        logger.warning(
+            f'Failed to format kubeconfig at {kubeconfig_path}. '
+            'Please check if the kubeconfig is valid. This may cause '
+            'problems when Kubernetes infra is used. '
+            f'Reason: {common_utils.format_exception(e)}')
+        return kubeconfig_path
+
+
+def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
+                                   resource_name: str) -> None:
+    """Helper to delete Kubernetes resources with 404 handling and retries.
+
+    Args:
+        delete_func: Function to call to delete the resource
+        resource_type: Type of resource being deleted (e.g. 'service'),
+            used in logging
+        resource_name: Name of the resource being deleted, used in logging
+    """
+    max_retries = 3
+    retry_delay = 5  # seconds
+
+    for attempt in range(max_retries):
+        try:
+            delete_func()
+            return
+        except kubernetes.api_exception() as e:
+            if e.status == 404:
+                logger.warning(
+                    f'terminate_instances: Tried to delete {resource_type} '
+                    f'{resource_name}, but the {resource_type} was not '
+                    'found (404).')
+                return
+            elif attempt < max_retries - 1:
+                logger.warning(f'terminate_instances: Failed to delete '
+                               f'{resource_type} {resource_name} (attempt '
+                               f'{attempt + 1}/{max_retries}). Error: {e}. '
+                               f'Retrying in {retry_delay} seconds...')
+                time.sleep(retry_delay)
+            else:
+                raise
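
Since `delete_k8s_resource_with_retry` is now public in `kubernetes_utils`, other modules (instance.py above and volume.py below) call it in the same way. A hedged usage sketch with hypothetical resource names:

```python
from sky.adaptors import kubernetes
from sky.provision.kubernetes import utils as kubernetes_utils

context = None               # assumption: use the current kube context
namespace = 'default'        # hypothetical namespace
service_name = 'my-service'  # hypothetical service name

# Deletes the service, treating 404 as success and retrying transient API
# errors up to 3 times with a 5-second delay between attempts.
kubernetes_utils.delete_k8s_resource_with_retry(
    delete_func=lambda: kubernetes.core_api(context).delete_namespaced_service(
        name=service_name, namespace=namespace),
    resource_type='service',
    resource_name=service_name)
```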
sky/provision/kubernetes/volume.py ADDED
@@ -0,0 +1,147 @@
+"""Kubernetes pvc provisioning."""
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import models
+from sky import sky_logging
+from sky.adaptors import kubernetes
+from sky.provision.kubernetes import config as config_lib
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.volumes import volume as volume_lib
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _get_context_namespace(config: models.VolumeConfig) -> Tuple[str, str]:
+    """Gets the context and namespace of a volume."""
+    if config.region is None:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        config.region = context
+    else:
+        context = config.region
+    namespace = config.config.get('namespace')
+    if namespace is None:
+        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+        config.config['namespace'] = namespace
+    return context, namespace
+
+
+def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
+                            pod_spec: Dict[str, Any]) -> None:
+    """Checks if the PVC is used by any pod in the namespace."""
+    volumes = pod_spec.get('spec', {}).get('volumes', [])
+    if not volumes:
+        return
+    once_modes = [
+        volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value,
+        volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value
+    ]
+    for volume in volumes:
+        pvc_name = volume.get('persistentVolumeClaim', {}).get('claimName')
+        if not pvc_name:
+            continue
+        pvc = kubernetes.core_api(
+            context).read_namespaced_persistent_volume_claim(
+                name=pvc_name, namespace=namespace)
+        access_mode = pvc.spec.access_modes[0]
+        if access_mode not in once_modes:
+            continue
+        usedby = _get_volume_usedby(context, namespace, pvc_name)
+        if usedby:
+            raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
+                                             f'mode {access_mode} is already '
+                                             f'in use by {usedby}.')
+
+
+def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Creates or registers a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_spec = _get_pvc_spec(namespace, config)
+    create_persistent_volume_claim(namespace, context, pvc_spec)
+    return config
+
+
+def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Deletes a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+    logger.info(f'Deleting PVC {pvc_name}')
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)
+    return config
+
+
+def _get_volume_usedby(context: Optional[str], namespace: str,
+                       pvc_name: str) -> List[str]:
+    """Gets the usedby resources of a volume."""
+    usedby = []
+    # Get all pods in the namespace
+    pods = kubernetes.core_api(context).list_namespaced_pod(namespace=namespace)
+    for pod in pods.items:
+        if pod.spec.volumes is not None:
+            for volume in pod.spec.volumes:
+                if volume.persistent_volume_claim is not None:
+                    if volume.persistent_volume_claim.claim_name == pvc_name:
+                        usedby.append(pod.metadata.name)
+    return usedby
+
+
+def get_volume_usedby(config: models.VolumeConfig) -> List[str]:
+    """Gets the usedby resources of a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+    return _get_volume_usedby(context, namespace, pvc_name)
+
+
+def create_persistent_volume_claim(namespace: str, context: Optional[str],
+                                   pvc_spec: Dict[str, Any]) -> None:
+    """Creates a persistent volume claim for SkyServe controller."""
+    pvc_name = pvc_spec['metadata']['name']
+    try:
+        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
+            name=pvc_name, namespace=namespace)
+        logger.debug(f'PVC {pvc_name} already exists')
+        return
+    except kubernetes.api_exception() as e:
+        if e.status != 404:  # Not found
+            raise
+    logger.info(f'Creating PVC {pvc_name}')
+    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
+        namespace=namespace, body=pvc_spec)
+
+
+def _get_pvc_spec(namespace: str,
+                  config: models.VolumeConfig) -> Dict[str, Any]:
+    """Gets the PVC spec for the given storage config."""
+    access_mode = config.config.get('access_mode')
+    size = config.size
+    # The previous code assumes that the access_mode and size are always set.
+    assert access_mode is not None
+    assert size is not None
+    pvc_spec: Dict[str, Any] = {
+        'metadata': {
+            'name': config.name_on_cloud,
+            'namespace': namespace,
+            'labels': {
+                'parent': 'skypilot',
+                'skypilot-name': config.name,
+            }
+        },
+        'spec': {
+            'accessModes': [access_mode],
+            'resources': {
+                'requests': {
+                    'storage': f'{size}Gi'
+                }
+            },
+        }
+    }
+    storage_class = config.config.get('storage_class_name')
+    if storage_class is not None:
+        pvc_spec['spec']['storageClassName'] = storage_class
+    return pvc_spec
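
For reference, given a hypothetical `VolumeConfig` with `name='my-data'`, `name_on_cloud='my-data-abc123'`, `size='100'`, and `config={'access_mode': 'ReadWriteOnce', 'storage_class_name': 'standard'}`, the `_get_pvc_spec('default', config)` helper above would produce roughly:

```python
pvc_spec = {
    'metadata': {
        'name': 'my-data-abc123',
        'namespace': 'default',
        'labels': {
            'parent': 'skypilot',
            'skypilot-name': 'my-data',
        }
    },
    'spec': {
        'accessModes': ['ReadWriteOnce'],
        'resources': {
            'requests': {
                'storage': '100Gi'
            }
        },
        'storageClassName': 'standard',
    }
}
```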