skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +1 -6
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +232 -9
- sky/client/sdk.py +195 -91
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/ssh.py +36 -0
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +21 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
- sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
- sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
- sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
- sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
- sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
- sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
- sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +70 -4
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +153 -0
- sky/server/server.py +70 -43
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -3
- sky/skypilot_config.py +3 -0
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +133 -0
- sky/ssh_node_pools/server.py +232 -0
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk.py
CHANGED
@@ -7,10 +7,10 @@ import webbrowser
 import click
 
 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.client import common as client_common
 from sky.client import sdk
 from sky.server import common as server_common
+from sky.server import rest
 from sky.server.requests import payloads
 from sky.skylet import constants
 from sky.usage import usage_lib
@@ -22,11 +22,7 @@ from sky.utils import dag_utils
 if typing.TYPE_CHECKING:
     import io
 
-    import requests
-
     import sky
-else:
-    requests = adaptors_common.LazyImport('requests')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -86,7 +82,7 @@ def launch(
         task=dag_str,
         name=name,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/launch',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -146,7 +142,7 @@ def queue(refresh: bool,
         all_users=all_users,
         job_ids=job_ids,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/queue',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -186,7 +182,7 @@ def cancel(
         all=all,
         all_users=all_users,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/cancel',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
@@ -197,6 +193,7 @@ def cancel(
 
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
+@rest.retry_on_server_unavailable()
 def tail_logs(name: Optional[str] = None,
               job_id: Optional[int] = None,
               follow: bool = True,
@@ -236,7 +233,7 @@ def tail_logs(name: Optional[str] = None,
         refresh=refresh,
         tail=tail,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/logs',
         json=json.loads(body.model_dump_json()),
         stream=True,
@@ -244,7 +241,12 @@ def tail_logs(name: Optional[str] = None,
         cookies=server_common.get_api_cookie_jar(),
     )
     request_id = server_common.get_request_id(response)
-    return sdk.stream_and_get(request_id, output_stream=output_stream)
+    # Log request is idempotent when tail is 0, thus can resume previous
+    # streaming point on retry.
+    return sdk.stream_response(request_id=request_id,
+                               response=response,
+                               output_stream=output_stream,
+                               resumable=(tail == 0))
 
 
 @usage_lib.entrypoint
@@ -281,7 +283,7 @@ def download_logs(
         controller=controller,
         local_dir=local_dir,
     )
-    response = requests.post(
+    response = rest.post(
         f'{server_common.get_server_url()}/jobs/download_logs',
         json=json.loads(body.model_dump_json()),
         timeout=(5, None),
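Note: the change above swaps raw `requests.post` calls for `rest.post` and adds `@rest.retry_on_server_unavailable()`, so client calls can survive a transient API-server restart, and log tailing can resume when `tail == 0` (the request is idempotent then). Below is a minimal sketch of the retry idea, assuming a plain `requests` client; `post_with_retry` and its parameters are illustrative, not SkyPilot's actual `sky.server.rest` API:

```python
import time
from typing import Any, Optional

import requests


def post_with_retry(url: str,
                    max_attempts: int = 3,
                    backoff: float = 2.0,
                    **kwargs: Any) -> requests.Response:
    """POST that retries while the server is unavailable (503 or down)."""
    last_exc: Optional[Exception] = None
    for attempt in range(max_attempts):
        try:
            response = requests.post(url, **kwargs)
            if response.status_code != 503:
                return response
        except requests.ConnectionError as e:
            # Server may be restarting; remember the error and retry.
            last_exc = e
        # Exponential backoff before the next attempt.
        time.sleep(backoff * (2**attempt))
    if last_exc is not None:
        raise last_exc
    raise RuntimeError(f'server unavailable after {max_attempts} attempts')
```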
sky/jobs/server/core.py
CHANGED
@@ -145,6 +145,7 @@ def launch(
     entrypoint = task
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
@@ -154,6 +155,9 @@ def launch(
         raise ValueError('Only single-task or chain DAG is '
                          f'allowed for job_launch. Dag: {dag}')
     dag.validate()
+    # TODO(aylei): use consolidated job controller instead of performing
+    # pre-mount operations when submitting jobs.
+    dag.pre_mount_volumes()
 
     user_dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
 
sky/models.py
CHANGED
@@ -6,6 +6,8 @@ import getpass
 import os
 from typing import Any, Dict, Optional
 
+import pydantic
+
 from sky.skylet import constants
 from sky.utils import common_utils
 
@@ -48,6 +50,8 @@ class KubernetesNodeInfo:
     # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
     total: Dict[str, int]
     free: Dict[str, int]
+    # IP address of the node (external IP preferred, fallback to internal IP)
+    ip_address: Optional[str] = None
 
 
 @dataclasses.dataclass
@@ -76,3 +80,15 @@ class KubernetesNodesInfo:
             },
             hint=data['hint'],
         )
+
+
+class VolumeConfig(pydantic.BaseModel):
+    """Configuration for creating a volume."""
+    name: str
+    type: str
+    cloud: str
+    region: Optional[str]
+    zone: Optional[str]
+    name_on_cloud: str
+    size: Optional[str]
+    config: Dict[str, Any] = {}
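Note: `VolumeConfig` above is a plain pydantic model, so it can be constructed and serialized directly. A small usage sketch with illustrative values (the `type` and `config` values shown here are assumptions, not taken from this diff):

```python
from typing import Any, Dict, Optional

import pydantic


class VolumeConfig(pydantic.BaseModel):
    """Local mirror of sky.models.VolumeConfig; fields as in the diff above."""
    name: str
    type: str
    cloud: str
    region: Optional[str]
    zone: Optional[str]
    name_on_cloud: str
    size: Optional[str]
    config: Dict[str, Any] = {}


vol = VolumeConfig(
    name='data-vol',
    type='k8s-pvc',  # illustrative type string
    cloud='kubernetes',
    region=None,  # None: resolved to the current kube context later
    zone=None,
    name_on_cloud='data-vol-abc123',
    size='100',  # interpreted as GiB by the PVC spec builder
    config={'access_mode': 'ReadWriteOnce'},
)
print(vol.model_dump_json())
```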
sky/provision/__init__.py
CHANGED
@@ -8,6 +8,7 @@ import inspect
 import typing
 from typing import Any, Dict, List, Optional, Type
 
+from sky import models
 from sky import sky_logging
 # These provision.<cloud> modules should never fail even if underlying cloud SDK
 # dependencies are not installed. This is ensured by using sky.adaptors inside
@@ -103,6 +104,31 @@ def bootstrap_instances(
     raise NotImplementedError
 
 
+@_route_to_cloud_impl
+def apply_volume(provider_name: str,
+                 config: models.VolumeConfig) -> models.VolumeConfig:
+    """Create or register a volume.
+
+    This function creates or registers a volume with the provided configuration,
+    and returns a VolumeConfig object with updated configuration.
+    """
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def delete_volume(provider_name: str,
+                  config: models.VolumeConfig) -> models.VolumeConfig:
+    """Delete a volume."""
+    raise NotImplementedError
+
+
+@_route_to_cloud_impl
+def get_volume_usedby(provider_name: str,
+                      config: models.VolumeConfig) -> List[str]:
+    """Get the usedby of a volume."""
+    raise NotImplementedError
+
+
 @_route_to_cloud_impl
 def run_instances(provider_name: str, region: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
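Note: the three new volume functions are dispatched through `@_route_to_cloud_impl`, which routes a call like `apply_volume('kubernetes', cfg)` to the same-named function in `sky.provision.kubernetes` (re-exported in the next diff). A simplified sketch of that dispatch pattern, under the assumption that it resolves implementations by module and function name:

```python
import importlib
from functools import wraps
from typing import Any, Callable


def route_to_cloud_impl(func: Callable[..., Any]) -> Callable[..., Any]:
    """Dispatch func(provider_name, ...) to sky.provision.<provider>.<name>."""

    @wraps(func)
    def wrapper(provider_name: str, *args: Any, **kwargs: Any) -> Any:
        module = importlib.import_module(
            f'sky.provision.{provider_name.lower()}')
        impl = getattr(module, func.__name__, None)
        if impl is None:
            # No provider implementation: fall back to the abstract stub,
            # which raises NotImplementedError.
            return func(provider_name, *args, **kwargs)
        return impl(*args, **kwargs)

    return wrapper
```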
sky/provision/kubernetes/__init__.py
CHANGED
@@ -11,3 +11,6 @@ from sky.provision.kubernetes.instance import wait_instances
 from sky.provision.kubernetes.network import cleanup_ports
 from sky.provision.kubernetes.network import open_ports
 from sky.provision.kubernetes.network import query_ports
+from sky.provision.kubernetes.volume import apply_volume
+from sky.provision.kubernetes.volume import delete_volume
+from sky.provision.kubernetes.volume import get_volume_usedby
sky/provision/kubernetes/instance.py
CHANGED
@@ -3,7 +3,6 @@ import copy
 import json
 import time
 from typing import Any, Callable, Dict, List, Optional, Union
-import uuid
 
 from sky import exceptions
 from sky import sky_logging
@@ -15,6 +14,7 @@ from sky.provision import docker_utils
 from sky.provision.kubernetes import config as config_lib
 from sky.provision.kubernetes import network_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.provision.kubernetes import volume
 from sky.utils import command_runner
 from sky.utils import common_utils
 from sky.utils import config_utils
@@ -240,7 +240,7 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
                     extra_msg,
                     details=event_message))
     raise config_lib.KubernetesError(f'{timeout_err_msg} '
-                                     f'Pod status: {pod_status}'
+                                     f'Pod status: {pod_status} '
                                      f'Details: \'{event_message}\' ')
     raise config_lib.KubernetesError(f'{timeout_err_msg}')
 
@@ -673,21 +673,6 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
     raise e
 
 
-def _create_persistent_volume_claim(namespace: str, context: Optional[str],
-                                    pvc_spec: Dict[str, Any]) -> None:
-    """Creates a persistent volume claim for SkyServe controller."""
-    try:
-        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
-            name=pvc_spec['metadata']['name'], namespace=namespace)
-        return
-    except kubernetes.api_exception() as e:
-        if e.status != 404:  # Not found
-            raise
-
-    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
-        namespace=namespace, body=pvc_spec)
-
-
 @timeline.event
 def _wait_for_deployment_pod(context,
                              namespace,
@@ -832,9 +817,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
                 constants.WORKER_NODE_TAGS)
-            pod_uuid = str(uuid.uuid4())[:6]
-            pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
-            pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
+            pod_name = f'{cluster_name_on_cloud}-worker{i}'
+            if pod_name in running_pods:
+                # If the pod is already running, we skip creating it.
+                return
+            pod_spec_copy['metadata']['name'] = pod_name
+            pod_spec_copy['metadata']['labels']['component'] = pod_name
             # For multi-node support, we put a soft-constraint to schedule
             # worker pods on different nodes than the head pod.
             # This is not set as a hard constraint because if different nodes
@@ -888,7 +876,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         ]
 
         if to_create_deployment:
-            _create_persistent_volume_claim(namespace, context, pvc_spec)
+            volume.create_persistent_volume_claim(namespace, context, pvc_spec)
 
             # It's safe to directly modify the template spec in the deployment spec
             # because controller pod is singleton, i in [0].
@@ -910,6 +898,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 print('Deployment failed', e)
                 raise e
 
+        # Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
+        # is used by any pod in the namespace.
+        volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
+
         return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
                                                    context)
 
@@ -1012,40 +1004,6 @@ def stop_instances(
     raise NotImplementedError()
 
 
-def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
-                                    resource_name: str) -> None:
-    """Helper to delete Kubernetes resources with 404 handling and retries.
-
-    Args:
-        delete_func: Function to call to delete the resource
-        resource_type: Type of resource being deleted (e.g. 'service'),
-            used in logging
-        resource_name: Name of the resource being deleted, used in logging
-    """
-    max_retries = 3
-    retry_delay = 5  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            delete_func()
-            return
-        except kubernetes.api_exception() as e:
-            if e.status == 404:
-                logger.warning(
-                    f'terminate_instances: Tried to delete {resource_type} '
-                    f'{resource_name}, but the {resource_type} was not '
-                    'found (404).')
-                return
-            elif attempt < max_retries - 1:
-                logger.warning(f'terminate_instances: Failed to delete '
-                               f'{resource_type} {resource_name} (attempt '
-                               f'{attempt + 1}/{max_retries}). Error: {e}. '
-                               f'Retrying in {retry_delay} seconds...')
-                time.sleep(retry_delay)
-            else:
-                raise
-
-
 def _delete_services(name_prefix: str, namespace: str,
                      context: Optional[str]) -> None:
     """Delete services with the given name prefix.
@@ -1061,13 +1019,14 @@ def _delete_services(name_prefix: str, namespace: str,
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
         # pylint: disable=cell-var-from-loop
-        _delete_k8s_resource_with_retry(
-            delete_func=lambda: kubernetes.core_api(
-                context).delete_namespaced_service(
-                    name=service_name, namespace=namespace,
-                    _request_timeout=config_lib.DELETION_TIMEOUT),
-            resource_type='service',
-            resource_name=service_name)
+        kubernetes_utils.delete_k8s_resource_with_retry(
+            delete_func=lambda: kubernetes.core_api(
+                context).delete_namespaced_service(name=service_name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)
 
 
 def _terminate_node(namespace: str,
@@ -1087,7 +1046,7 @@ def _terminate_node(namespace: str,
     # from within the pod, e.g., for autodown.
     # Note - some misbehaving pods may not terminate gracefully if they have
    # open file descriptors. We force delete pods to avoid this.
-    _delete_k8s_resource_with_retry(
+    kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
             name=pod_name,
             namespace=namespace,
@@ -1105,26 +1064,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,
 
     # Delete deployment
     deployment_name = _get_deployment_name(cluster_name)
-    _delete_k8s_resource_with_retry(
-        delete_func=lambda: kubernetes.apps_api(
-            context).delete_namespaced_deployment(
-                name=deployment_name, namespace=namespace,
-                _request_timeout=config_lib.DELETION_TIMEOUT),
-        resource_type='deployment',
-        resource_name=deployment_name)
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.apps_api(
+            context).delete_namespaced_deployment(name=deployment_name,
+                                                  namespace=namespace,
+                                                  _request_timeout=config_lib.
+                                                  DELETION_TIMEOUT),
+        resource_type='deployment',
+        resource_name=deployment_name)
 
     # Delete PVCs
     pvc_name = _get_pvc_name(
         cluster_name,
         kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
     # pylint: disable=cell-var-from-loop
-    _delete_k8s_resource_with_retry(
-        delete_func=lambda: kubernetes.core_api(
-            context).delete_namespaced_persistent_volume_claim(
-                name=pvc_name, namespace=namespace,
-                _request_timeout=config_lib.DELETION_TIMEOUT),
-        resource_type='pvc',
-        resource_name=pvc_name)
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)
 
 
 def terminate_instances(
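Note: worker pods are now named deterministically (`<cluster>-worker<i>`) instead of with a random uuid suffix, which makes pod creation idempotent: a retried provisioning pass can skip pods that already exist. A toy illustration of that check (`running_pods` here is a stand-in for the set of live pod names):

```python
def worker_pod_name(cluster_name_on_cloud: str, i: int) -> str:
    # Deterministic name, as in the diff above.
    return f'{cluster_name_on_cloud}-worker{i}'


running_pods = {'my-cluster-worker1'}  # illustrative: pods already running
for i in range(1, 4):
    name = worker_pod_name('my-cluster', i)
    if name in running_pods:
        continue  # already running: skip creation on retry
    print(f'would create pod {name}')
```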
sky/provision/kubernetes/utils.py
CHANGED
@@ -10,7 +10,7 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse
 
 import sky
@@ -1625,9 +1625,15 @@ def _get_kubeconfig_text_for_context(context: Optional[str] = None) -> str:
     command = 'kubectl config view --minify'
     if context is not None:
         command += f' --context={context}'
+
+    # Ensure subprocess inherits the current environment properly.
+    # This fixes the issue where kubectl can't find ~/.kube/config in the
+    # API server context.
+    env = os.environ.copy()
+
     proc = subprocess.run(command,
                           shell=True,
                           check=False,
+                          env=env,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
     if proc.returncode != 0:
@@ -2734,6 +2740,21 @@ def get_kubernetes_node_info(
                     node.metadata.labels.get(label_key))
                 break
 
+        # Extract IP address from node addresses (prefer external, fallback
+        # to internal).
+        node_ip = None
+        if node.status.addresses:
+            # First try to find external IP
+            for address in node.status.addresses:
+                if address.type == 'ExternalIP':
+                    node_ip = address.address
+                    break
+            # If no external IP, try to find internal IP
+            if node_ip is None:
+                for address in node.status.addresses:
+                    if address.type == 'InternalIP':
+                        node_ip = address.address
+                        break
+
         allocated_qty = 0
         accelerator_count = get_node_accelerator_count(node.status.allocatable)
 
@@ -2765,7 +2786,8 @@ def get_kubernetes_node_info(
             name=node.metadata.name,
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
-            free={'accelerators_available': int(accelerators_available)})
+            free={'accelerators_available': int(accelerators_available)},
+            ip_address=node_ip)
     hint = ''
     if has_multi_host_tpu:
         hint = ('(Note: Multi-host TPUs are detected and excluded from the '
@@ -3279,5 +3301,49 @@ def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
     if os.path.isfile(path):
         return path
 
-    format_kubeconfig_exec_auth(config, path)
-    return path
+    try:
+        format_kubeconfig_exec_auth(config, path)
+        return path
+    except Exception as e:  # pylint: disable=broad-except
+        # There may be problems with kubeconfig, but the user is not actually
+        # using Kubernetes (or SSH Node Pools)
+        logger.warning(
+            f'Failed to format kubeconfig at {kubeconfig_path}. '
+            'Please check if the kubeconfig is valid. This may cause '
+            'problems when Kubernetes infra is used. '
+            f'Reason: {common_utils.format_exception(e)}')
+        return kubeconfig_path
+
+
+def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
+                                   resource_name: str) -> None:
+    """Helper to delete Kubernetes resources with 404 handling and retries.
+
+    Args:
+        delete_func: Function to call to delete the resource
+        resource_type: Type of resource being deleted (e.g. 'service'),
+            used in logging
+        resource_name: Name of the resource being deleted, used in logging
+    """
+    max_retries = 3
+    retry_delay = 5  # seconds
+
+    for attempt in range(max_retries):
+        try:
+            delete_func()
+            return
+        except kubernetes.api_exception() as e:
+            if e.status == 404:
+                logger.warning(
+                    f'terminate_instances: Tried to delete {resource_type} '
+                    f'{resource_name}, but the {resource_type} was not '
+                    'found (404).')
+                return
+            elif attempt < max_retries - 1:
+                logger.warning(f'terminate_instances: Failed to delete '
+                               f'{resource_type} {resource_name} (attempt '
+                               f'{attempt + 1}/{max_retries}). Error: {e}. '
+                               f'Retrying in {retry_delay} seconds...')
+                time.sleep(retry_delay)
+            else:
+                raise
sky/provision/kubernetes/volume.py
ADDED
@@ -0,0 +1,147 @@
+"""Kubernetes pvc provisioning."""
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import models
+from sky import sky_logging
+from sky.adaptors import kubernetes
+from sky.provision.kubernetes import config as config_lib
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.volumes import volume as volume_lib
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _get_context_namespace(config: models.VolumeConfig) -> Tuple[str, str]:
+    """Gets the context and namespace of a volume."""
+    if config.region is None:
+        context = kubernetes_utils.get_current_kube_config_context_name()
+        config.region = context
+    else:
+        context = config.region
+    namespace = config.config.get('namespace')
+    if namespace is None:
+        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
+        config.config['namespace'] = namespace
+    return context, namespace
+
+
+def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
+                            pod_spec: Dict[str, Any]) -> None:
+    """Checks if the PVC is used by any pod in the namespace."""
+    volumes = pod_spec.get('spec', {}).get('volumes', [])
+    if not volumes:
+        return
+    once_modes = [
+        volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value,
+        volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value
+    ]
+    for volume in volumes:
+        pvc_name = volume.get('persistentVolumeClaim', {}).get('claimName')
+        if not pvc_name:
+            continue
+        pvc = kubernetes.core_api(
+            context).read_namespaced_persistent_volume_claim(
+                name=pvc_name, namespace=namespace)
+        access_mode = pvc.spec.access_modes[0]
+        if access_mode not in once_modes:
+            continue
+        usedby = _get_volume_usedby(context, namespace, pvc_name)
+        if usedby:
+            raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
+                                             f'mode {access_mode} is already '
+                                             f'in use by {usedby}.')
+
+
+def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Creates or registers a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_spec = _get_pvc_spec(namespace, config)
+    create_persistent_volume_claim(namespace, context, pvc_spec)
+    return config
+
+
+def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Deletes a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+    logger.info(f'Deleting PVC {pvc_name}')
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)
+    return config
+
+
+def _get_volume_usedby(context: Optional[str], namespace: str,
+                       pvc_name: str) -> List[str]:
+    """Gets the usedby resources of a volume."""
+    usedby = []
+    # Get all pods in the namespace
+    pods = kubernetes.core_api(context).list_namespaced_pod(namespace=namespace)
+    for pod in pods.items:
+        if pod.spec.volumes is not None:
+            for volume in pod.spec.volumes:
+                if volume.persistent_volume_claim is not None:
+                    if volume.persistent_volume_claim.claim_name == pvc_name:
+                        usedby.append(pod.metadata.name)
+    return usedby
+
+
+def get_volume_usedby(config: models.VolumeConfig) -> List[str]:
+    """Gets the usedby resources of a volume."""
+    context, namespace = _get_context_namespace(config)
+    pvc_name = config.name_on_cloud
+    return _get_volume_usedby(context, namespace, pvc_name)
+
+
+def create_persistent_volume_claim(namespace: str, context: Optional[str],
+                                   pvc_spec: Dict[str, Any]) -> None:
+    """Creates a persistent volume claim for SkyServe controller."""
+    pvc_name = pvc_spec['metadata']['name']
+    try:
+        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
+            name=pvc_name, namespace=namespace)
+        logger.debug(f'PVC {pvc_name} already exists')
+        return
+    except kubernetes.api_exception() as e:
+        if e.status != 404:  # Not found
+            raise
+    logger.info(f'Creating PVC {pvc_name}')
+    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
+        namespace=namespace, body=pvc_spec)
+
+
+def _get_pvc_spec(namespace: str,
+                  config: models.VolumeConfig) -> Dict[str, Any]:
+    """Gets the PVC spec for the given storage config."""
+    access_mode = config.config.get('access_mode')
+    size = config.size
+    # The previous code assumes that the access_mode and size are always set.
+    assert access_mode is not None
+    assert size is not None
+    pvc_spec: Dict[str, Any] = {
+        'metadata': {
+            'name': config.name_on_cloud,
+            'namespace': namespace,
+            'labels': {
+                'parent': 'skypilot',
+                'skypilot-name': config.name,
+            }
+        },
+        'spec': {
+            'accessModes': [access_mode],
+            'resources': {
+                'requests': {
+                    'storage': f'{size}Gi'
+                }
+            },
+        }
+    }
+    storage_class = config.config.get('storage_class_name')
+    if storage_class is not None:
+        pvc_spec['spec']['storageClassName'] = storage_class
+    return pvc_spec
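Note: for reference, `_get_pvc_spec` above produces a dict equivalent to the following for a volume named `data-vol` requesting 100 GiB with `ReadWriteOnce` access (the concrete names are illustrative):

```python
pvc_spec = {
    'metadata': {
        'name': 'data-vol-abc123',  # config.name_on_cloud
        'namespace': 'default',
        'labels': {
            'parent': 'skypilot',
            'skypilot-name': 'data-vol',  # config.name
        }
    },
    'spec': {
        'accessModes': ['ReadWriteOnce'],
        'resources': {
            'requests': {
                'storage': '100Gi'  # f'{size}Gi'
            }
        },
        # 'storageClassName' is added only when config.config provides one.
    }
}
```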