skypilot-nightly 1.0.0.dev20250628__py3-none-any.whl → 1.0.0.dev20250701__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +3 -3
- sky/client/cli/command.py +1 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Md3rlE87jmL5uv7gSo8mR/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1043-1b39779691bb4030.js +1 -0
- sky/dashboard/out/_next/static/chunks/{141-fa5a20cbf401b351.js → 1141-726e5a3f00b67185.js} +2 -2
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +1 -0
- sky/dashboard/out/_next/static/chunks/{691.fd9292250ab089af.js → 1691.44e378727a41f3b5.js} +2 -2
- sky/dashboard/out/_next/static/chunks/{871-e547295e7e21399c.js → 1871-80dea41717729fa5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +1 -0
- sky/dashboard/out/_next/static/chunks/{875.52c962183328b3f2.js → 2875.c24c6d57dc82e436.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3256.7257acd01b481bed.js +11 -0
- sky/dashboard/out/_next/static/chunks/3698-52ad1ca228faa776.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.b3cc2bc1d49d2c3c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +1 -0
- sky/dashboard/out/_next/static/chunks/{947-6620842ef80ae879.js → 3947-b059261d6fa88a1f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{697.6460bf72e760addd.js → 4697.f5421144224da9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.4c849b1e05c8e9ad.js +1 -0
- sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{491.b3d264269613fe09.js → 5491.918ffed0ba7a5294.js} +1 -1
- sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +8 -0
- sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +39 -0
- sky/dashboard/out/_next/static/chunks/6601-fcfad0ddf92ec7ab.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-6ff4e45dfb49d11d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-d0dc765474fa0eca.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-909d53833da080cb.js +1 -0
- sky/dashboard/out/_next/static/chunks/8982.a2e214068f30a857.js +1 -0
- sky/dashboard/out/_next/static/chunks/{25.76c246239df93d50.js → 9025.a7c44babfe56ce09.js} +2 -2
- sky/dashboard/out/_next/static/chunks/{938-0a770415b5ce4649.js → 938-044ad21de8b4626b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/9470-21d059a1dfa03f61.js +1 -0
- sky/dashboard/out/_next/static/chunks/9984.739ae958a066298d.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{framework-87d061ee6ed71b28.js → framework-efc06c2733009cd3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +1 -0
- sky/dashboard/out/_next/static/chunks/{main-e0e2335212e72357.js → main-c0a4f1ea606d48d2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-050a9e637b057b24.js → _app-a37b06ddb64521fd.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8135aba0712bda37.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-77d4816945b04793.js → [cluster]-b8e1114e6d38218c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-9744c271a1642f76.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-c4d5cfac7fbc0668.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-cd43fb3c122eedde.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-4ebf6484f7216387.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-06bde99155fa6292.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-d427db53e54de9ce.js +1 -0
- sky/dashboard/out/_next/static/css/0da6afe66176678a.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +4 -0
- sky/jobs/server/core.py +5 -9
- sky/jobs/state.py +820 -670
- sky/jobs/utils.py +7 -15
- sky/optimizer.py +46 -0
- sky/provision/__init__.py +14 -6
- sky/provision/kubernetes/constants.py +9 -0
- sky/provision/kubernetes/instance.py +17 -14
- sky/provision/kubernetes/volume.py +77 -15
- sky/server/common.py +1 -0
- sky/server/server.py +37 -15
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +1 -0
- sky/task.py +13 -1
- sky/utils/dag_utils.py +4 -2
- sky/utils/log_utils.py +68 -0
- sky/volumes/server/core.py +103 -78
- sky/volumes/utils.py +22 -5
- {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/RECORD +96 -94
- sky/dashboard/out/_next/static/ZYLkkWSYZjJhLVsObh20y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +0 -1
- sky/dashboard/out/_next/static/chunks/43-f38a531f6692f281.js +0 -1
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +0 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/601-111d06d9ded11d00.js +0 -1
- sky/dashboard/out/_next/static/chunks/616-50a620ac4a23deb4.js +0 -39
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +0 -1
- sky/dashboard/out/_next/static/chunks/664-047bc03493fda379.js +0 -1
- sky/dashboard/out/_next/static/chunks/785.3446c12ffdf3d188.js +0 -1
- sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +0 -1
- sky/dashboard/out/_next/static/chunks/799-3625946b2ec2eb30.js +0 -8
- sky/dashboard/out/_next/static/chunks/937.72796f7afe54075b.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +0 -1
- sky/dashboard/out/_next/static/chunks/982.d7bd80ed18cad4cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +0 -1
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +0 -1
- sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-21080826c6095f21.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-65b2c90320b8afb8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-6b0d9e5031b70c58.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-64bdc0b2d3a44709.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-df7407b5e37d3750.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-d7684eaa04c4f58f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-5b59bce9eb208d84.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-04e1b3ad4207b1e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-c470366a6179f16e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-75a3310ef922a299.js +0 -1
- sky/dashboard/out/_next/static/css/605ac87514049058.css +0 -3
- /sky/dashboard/out/_next/static/{ZYLkkWSYZjJhLVsObh20y → Md3rlE87jmL5uv7gSo8mR}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{804-4c9fc53aa74bc191.js → 804-9f5e98ce84d46bdd.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250628.dist-info → skypilot_nightly-1.0.0.dev20250701.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -180,12 +180,6 @@ def is_consolidation_mode() -> bool:
|
|
180
180
|
return consolidation_mode
|
181
181
|
|
182
182
|
|
183
|
-
def get_ha_dump_script_path(job_id: int) -> pathlib.Path:
|
184
|
-
"""Get the path to the HA dump script for a job."""
|
185
|
-
return pathlib.Path(constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser(
|
186
|
-
).resolve() / f'sky_job_{job_id}'
|
187
|
-
|
188
|
-
|
189
183
|
def ha_recovery_for_consolidation_mode():
|
190
184
|
"""Recovery logic for HA mode."""
|
191
185
|
# No setup recovery is needed in consolidation mode, as the API server
|
@@ -221,17 +215,15 @@ def ha_recovery_for_consolidation_mode():
|
|
221
215
|
managed_job_state.ManagedJobScheduleState.DONE,
|
222
216
|
managed_job_state.ManagedJobScheduleState.WAITING
|
223
217
|
]:
|
224
|
-
|
225
|
-
if
|
226
|
-
f.write(f'Job {job_id}\'s recovery
|
227
|
-
'
|
228
|
-
f'
|
218
|
+
script = managed_job_state.get_ha_recovery_script(job_id)
|
219
|
+
if script is None:
|
220
|
+
f.write(f'Job {job_id}\'s recovery script does not exist. '
|
221
|
+
'Skipping recovery. Job schedule state: '
|
222
|
+
f'{job["schedule_state"]}\n')
|
229
223
|
continue
|
230
|
-
with open(dump_script_path, 'r', encoding='utf-8') as script_f:
|
231
|
-
script = script_f.read()
|
232
224
|
runner.run(script)
|
233
|
-
f.write(f'Job {job_id}
|
234
|
-
f'
|
225
|
+
f.write(f'Job {job_id} completed recovery at '
|
226
|
+
f'{datetime.datetime.now()}\n')
|
235
227
|
f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
|
236
228
|
f.write(f'Total recovery time: {time.time() - start} seconds\n')
|
237
229
|
|
sky/optimizer.py
CHANGED
@@ -1252,6 +1252,52 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
|
|
1252
1252
|
logger.warning(
|
1253
1253
|
f'{colorama.Fore.YELLOW}{msg}{colorama.Style.RESET_ALL}')
|
1254
1254
|
|
1255
|
+
_check_specified_regions(task)
|
1256
|
+
|
1257
|
+
|
1258
|
+
def _check_specified_regions(task: task_lib.Task) -> None:
    """Check if specified regions (Kubernetes contexts) are enabled.

    The check only runs when every resource of the task targets Kubernetes.
    For Kubernetes, a resource's ``region`` (when set) is a context name and
    must be one of the existing allowed contexts.

    Args:
        task: The task to check.

    Raises:
        exceptions.ResourcesUnavailableError: if a resource pins a context
            that is not among the existing allowed Kubernetes contexts.
    """
    # Only check for Kubernetes now
    if not all(
            isinstance(resources.cloud, clouds.Kubernetes)
            for resources in task.resources):
        return

    # Looked up lazily and at most once: the answer does not depend on the
    # resource being inspected, and tasks that pin no region never need it.
    existing_contexts = None
    for resources in task.resources:
        # Kubernetes region is a context if set
        region = resources.region
        if region is None:
            continue
        if existing_contexts is None:
            existing_contexts = clouds.Kubernetes.existing_allowed_contexts()
        if region in existing_contexts:
            continue

        infra_str = f'Kubernetes/{region}'
        logger.warning(f'{infra_str} is not enabled.')

        # Describe the requirement through the mounted volumes when there are
        # any, otherwise through the infra itself.
        if task.volume_mounts:
            noun = 'volumes' if len(task.volume_mounts) > 1 else 'volume'
            names = ', '.join(f'{v.volume_name}' for v in task.volume_mounts)
            requirement = f'{noun} {names} with infra {infra_str}'
        else:
            requirement = f'infra {infra_str}'

        task_name = f' {task.name!r}' if task.name is not None else ''
        msg = f'Task{task_name} requires '
        msg += requirement
        msg += (
            f' which is not enabled. To enable access, change '
            f'the task infra requirement or run: {colorama.Style.BRIGHT}'
            f'sky check {colorama.Style.RESET_ALL}'
            f'to ensure the infra is enabled.')
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ResourcesUnavailableError(msg)
|
1300
|
+
|
1255
1301
|
|
1256
1302
|
def _fill_in_launchable_resources(
|
1257
1303
|
task: task_lib.Task,
|
sky/provision/__init__.py
CHANGED
@@ -6,7 +6,7 @@ providers supported by SkyPilot need to follow.
|
|
6
6
|
import functools
|
7
7
|
import inspect
|
8
8
|
import typing
|
9
|
-
from typing import Any, Dict, List, Optional, Type
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Type
|
10
10
|
|
11
11
|
from sky import models
|
12
12
|
from sky import sky_logging
|
@@ -106,7 +106,7 @@ def bootstrap_instances(
|
|
106
106
|
|
107
107
|
@_route_to_cloud_impl
|
108
108
|
def apply_volume(provider_name: str,
|
109
|
-
|
109
|
+
volume_config: models.VolumeConfig) -> models.VolumeConfig:
|
110
110
|
"""Create or register a volume.
|
111
111
|
|
112
112
|
This function creates or registers a volume with the provided configuration,
|
@@ -117,15 +117,23 @@ def apply_volume(provider_name: str,
|
|
117
117
|
|
118
118
|
@_route_to_cloud_impl
def delete_volume(provider_name: str,
                  volume_config: models.VolumeConfig) -> models.VolumeConfig:
    """Delete a volume.

    Args:
        provider_name: Name of the provider the call is made for.
        volume_config: Configuration of the volume to delete.

    Returns:
        A volume configuration object.
    """
    # NOTE(review): the body is only a placeholder. Presumably
    # `_route_to_cloud_impl` forwards the call to the provider-specific
    # implementation selected by `provider_name` -- confirm against the
    # decorator.
    raise NotImplementedError
|
123
123
|
|
124
124
|
|
125
125
|
@_route_to_cloud_impl
def get_volume_usedby(
    provider_name: str,
    volume_config: models.VolumeConfig,
) -> Tuple[List[str], List[str]]:
    """Get the usedby of a volume.

    Args:
        provider_name: Name of the provider the call is made for.
        volume_config: Configuration of the volume to inspect.

    Returns:
        usedby_pods: List of pods using the volume. These may include pods
          not created by SkyPilot.
        usedby_clusters: List of clusters using the volume.
    """
    # NOTE(review): the body is only a placeholder. Presumably
    # `_route_to_cloud_impl` forwards the call to the provider-specific
    # implementation selected by `provider_name` -- confirm against the
    # decorator.
    raise NotImplementedError
|
130
138
|
|
131
139
|
|
@@ -15,3 +15,12 @@ SKY_K8S_EXEC_AUTH_PATH = '$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk/bin:
|
|
15
15
|
|
16
16
|
# cache directory for kubeconfig with modified exec auth
|
17
17
|
SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE = '~/.sky/generated/kubeconfigs'
|
18
|
+
|
19
|
+
# Labels for the Pods created by SkyPilot
|
20
|
+
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
|
21
|
+
TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
|
22
|
+
TAG_POD_INITIALIZED = 'skypilot-initialized'
|
23
|
+
TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
|
24
|
+
|
25
|
+
# Pod phases that are not holding PVCs
|
26
|
+
PVC_NOT_HOLD_POD_PHASES = ['Succeeded', 'Failed']
|
@@ -12,6 +12,7 @@ from sky.provision import common
|
|
12
12
|
from sky.provision import constants
|
13
13
|
from sky.provision import docker_utils
|
14
14
|
from sky.provision.kubernetes import config as config_lib
|
15
|
+
from sky.provision.kubernetes import constants as k8s_constants
|
15
16
|
from sky.provision.kubernetes import network_utils
|
16
17
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
17
18
|
from sky.provision.kubernetes import volume
|
@@ -30,14 +31,10 @@ _MAX_RETRIES = 3
|
|
30
31
|
_NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
|
31
32
|
|
32
33
|
logger = sky_logging.init_logger(__name__)
|
33
|
-
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
|
34
|
-
TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
|
35
|
-
TAG_POD_INITIALIZED = 'skypilot-initialized'
|
36
|
-
TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
|
37
34
|
|
38
35
|
|
39
36
|
def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
    """Build the tag filter that selects a Ray cluster by its name.

    Args:
        cluster_name: Value to match for the Ray cluster-name tag.

    Returns:
        A single-entry dict mapping the Ray cluster-name tag key to
        ``cluster_name``.
    """
    tag_filter: Dict[str, str] = {}
    tag_filter[k8s_constants.TAG_RAY_CLUSTER_NAME] = cluster_name
    return tag_filter
|
41
38
|
|
42
39
|
|
43
40
|
def _is_head(pod) -> bool:
|
@@ -75,7 +72,8 @@ def is_high_availability_cluster_by_kubectl(
|
|
75
72
|
deployment_list = kubernetes.apps_api(
|
76
73
|
context).list_namespaced_deployment(
|
77
74
|
namespace,
|
78
|
-
label_selector=
|
75
|
+
label_selector=
|
76
|
+
f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
|
79
77
|
except kubernetes.api_exception():
|
80
78
|
return False
|
81
79
|
# It is a high availability cluster if there is at least one deployment
|
@@ -280,10 +278,12 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
280
278
|
while _evaluate_timeout():
|
281
279
|
# Get all pods in a single API call using the cluster name label
|
282
280
|
# which all pods in new_nodes should share
|
283
|
-
cluster_name = new_nodes[0].metadata.labels[
|
281
|
+
cluster_name = new_nodes[0].metadata.labels[
|
282
|
+
k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
|
284
283
|
pods = kubernetes.core_api(context).list_namespaced_pod(
|
285
284
|
namespace,
|
286
|
-
label_selector=
|
285
|
+
label_selector=
|
286
|
+
f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
|
287
287
|
|
288
288
|
# Get the set of found pod names and check if we have all expected pods
|
289
289
|
found_pod_names = {pod.metadata.name for pod in pods}
|
@@ -361,10 +361,12 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
|
|
361
361
|
|
362
362
|
while True:
|
363
363
|
# Get all pods in a single API call
|
364
|
-
cluster_name = new_nodes[0].metadata.labels[
|
364
|
+
cluster_name = new_nodes[0].metadata.labels[
|
365
|
+
k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
|
365
366
|
all_pods = kubernetes.core_api(context).list_namespaced_pod(
|
366
367
|
namespace,
|
367
|
-
label_selector=
|
368
|
+
label_selector=
|
369
|
+
f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
|
368
370
|
|
369
371
|
# Get the set of found pod names and check if we have all expected pods
|
370
372
|
found_pod_names = {pod.metadata.name for pod in all_pods}
|
@@ -732,7 +734,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
732
734
|
else:
|
733
735
|
pod_spec['metadata']['labels'] = tags
|
734
736
|
pod_spec['metadata']['labels'].update(
|
735
|
-
{TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
737
|
+
{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
736
738
|
|
737
739
|
terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
738
740
|
['Terminating'])
|
@@ -841,7 +843,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
841
843
|
'podAffinityTerm': {
|
842
844
|
'labelSelector': {
|
843
845
|
'matchExpressions': [{
|
844
|
-
'key': TAG_SKYPILOT_CLUSTER_NAME,
|
846
|
+
'key': k8s_constants.TAG_SKYPILOT_CLUSTER_NAME,
|
845
847
|
'operator': 'In',
|
846
848
|
'values': [cluster_name_on_cloud]
|
847
849
|
}]
|
@@ -884,7 +886,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
884
886
|
# Add the deployment name as a label to the pod spec
|
885
887
|
deployment_name = deployment_spec['metadata']['name']
|
886
888
|
pod_spec_copy['metadata']['labels'][
|
887
|
-
TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
|
889
|
+
k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
|
888
890
|
template_pod_spec['metadata'] = pod_spec_copy['metadata']
|
889
891
|
template_pod_spec['spec'].update(pod_spec_copy['spec'])
|
890
892
|
# Propagate the labels to the deployment for identification.
|
@@ -1289,7 +1291,8 @@ def get_command_runners(
|
|
1289
1291
|
|
1290
1292
|
# Try to get deployment name from label first
|
1291
1293
|
head_instance_info = instances[pod_name][0]
|
1292
|
-
deployment = head_instance_info.tags.get(
|
1294
|
+
deployment = head_instance_info.tags.get(
|
1295
|
+
k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME)
|
1293
1296
|
|
1294
1297
|
node_list = [((namespace, context), pod_name)]
|
1295
1298
|
head_runner = command_runner.KubernetesCommandRunner(
|
@@ -1,10 +1,12 @@
|
|
1
1
|
"""Kubernetes pvc provisioning."""
|
2
2
|
from typing import Any, Dict, List, Optional, Tuple
|
3
3
|
|
4
|
+
from sky import global_user_state
|
4
5
|
from sky import models
|
5
6
|
from sky import sky_logging
|
6
7
|
from sky.adaptors import kubernetes
|
7
8
|
from sky.provision.kubernetes import config as config_lib
|
9
|
+
from sky.provision.kubernetes import constants as k8s_constants
|
8
10
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
9
11
|
from sky.volumes import volume as volume_lib
|
10
12
|
|
@@ -45,17 +47,26 @@ def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
|
|
45
47
|
access_mode = pvc.spec.access_modes[0]
|
46
48
|
if access_mode not in once_modes:
|
47
49
|
continue
|
48
|
-
|
49
|
-
if
|
50
|
+
usedby_pods, _ = _get_volume_usedby(context, namespace, pvc_name)
|
51
|
+
if usedby_pods:
|
50
52
|
raise config_lib.KubernetesError(f'Volume {pvc_name} with access '
|
51
53
|
f'mode {access_mode} is already '
|
52
|
-
f'in use by {
|
54
|
+
f'in use by Pods {usedby_pods}.')
|
53
55
|
|
54
56
|
|
55
57
|
def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
    """Creates or registers a volume.

    Builds the PVC spec for ``config``, verifies that the storage class named
    in the spec (if any) can be read from the cluster, and then creates the
    persistent volume claim.

    Args:
        config: The volume configuration to apply.

    Returns:
        The same ``config`` object that was passed in.

    Raises:
        config_lib.KubernetesError: if reading the requested storage class
            fails with a Kubernetes API exception.
    """
    context, namespace = _get_context_namespace(config)
    pvc_spec = _get_pvc_spec(namespace, config)
    # Check if the storage class exists before attempting to create the PVC.
    storage_class_name = pvc_spec['spec'].get('storageClassName')
    if storage_class_name is not None:
        try:
            kubernetes.storage_api(context).read_storage_class(
                name=storage_class_name)
        except kubernetes.api_exception() as e:
            # Chain explicitly so the API error stays attached as the cause.
            raise config_lib.KubernetesError(
                f'Check storage class {storage_class_name} error: {e}') from e
    create_persistent_volume_claim(namespace, context, pvc_spec)
    return config
|
61
72
|
|
@@ -76,22 +87,73 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
|
|
76
87
|
return config
|
77
88
|
|
78
89
|
|
79
|
-
def _get_volume_usedby(
    context: Optional[str],
    namespace: str,
    pvc_name: str,
) -> Tuple[List[str], List[str]]:
    """Gets the usedby resources of a volume.

    This function returns the pods and clusters that are using the volume.
    The usedby_pods is accurate, which also includes the Pods that are not
    managed by SkyPilot. Pods in a phase listed in
    ``k8s_constants.PVC_NOT_HOLD_POD_PHASES`` are excluded from the query.

    Args:
        context: Kubernetes context
        namespace: Kubernetes namespace
        pvc_name: PVC name

    Returns:
        usedby_pods: List of pods using the volume, one entry per pod. These
          may include pods not created by SkyPilot.
        usedby_clusters: List of clusters using the volume, without
          duplicates, in the order they were first seen.
    """
    usedby_pods: List[str] = []
    usedby_clusters: List[str] = []
    field_selector = ','.join(
        f'status.phase!={phase}'
        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES)
    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
    # Get all pods in the namespace
    pods = kubernetes.core_api(context).list_namespaced_pod(
        namespace=namespace, field_selector=field_selector)
    for pod in pods.items:
        # `any` records a pod once even if it references the PVC in several
        # volume entries.
        mounts_pvc = any(
            volume.persistent_volume_claim is not None and
            volume.persistent_volume_claim.claim_name == pvc_name
            for volume in pod.spec.volumes or [])
        if not mounts_pvc:
            continue
        usedby_pods.append(pod.metadata.name)
        # Get the real cluster name. Pods without any label (e.g. pods not
        # created by SkyPilot) may expose `labels` as None.
        labels = pod.metadata.labels or {}
        cluster_name_on_cloud = labels.get(
            k8s_constants.TAG_SKYPILOT_CLUSTER_NAME)
        if cluster_name_on_cloud is None:
            continue
        cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
        # Several pods can belong to one cluster; report the cluster once.
        if cluster_name is not None and cluster_name not in usedby_clusters:
            usedby_clusters.append(cluster_name)
    if usedby_pods:
        logger.debug(f'Volume {pvc_name} is used by Pods {usedby_pods}'
                     f' and clusters {usedby_clusters}')
    return usedby_pods, usedby_clusters
|
141
|
+
|
142
|
+
|
143
|
+
def _get_cluster_name_on_cloud_to_cluster_name_map() -> Dict[str, str]:
    """Gets the map from cluster name on cloud to cluster name.

    Cluster records whose ``handle`` is None are left out of the map.

    Returns:
        A dict keyed by each handle's ``cluster_name_on_cloud`` whose value
        is the record's ``name``.
    """
    return {
        record['handle'].cluster_name_on_cloud: record['name']
        for record in global_user_state.get_clusters()
        if record['handle'] is not None
    }
|
92
153
|
|
93
154
|
|
94
|
-
def get_volume_usedby(
|
155
|
+
def get_volume_usedby(
|
156
|
+
config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
|
95
157
|
"""Gets the usedby resources of a volume."""
|
96
158
|
context, namespace = _get_context_namespace(config)
|
97
159
|
pvc_name = config.name_on_cloud
|
sky/server/common.py
CHANGED
@@ -293,6 +293,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
293
293
|
try:
|
294
294
|
response = make_authenticated_request('GET',
|
295
295
|
'/api/health',
|
296
|
+
server_url=server_url,
|
296
297
|
timeout=2.5)
|
297
298
|
except requests.exceptions.Timeout:
|
298
299
|
if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
|
sky/server/server.py
CHANGED
@@ -277,29 +277,51 @@ class BearerTokenMiddleware(starlette.middleware.base.BaseHTTPMiddleware):
|
|
277
277
|
"""Middleware to handle Bearer Token Auth (Service Accounts)."""
|
278
278
|
|
279
279
|
async def dispatch(self, request: fastapi.Request, call_next):
    """Make sure correct bearer token auth is present.

    1. If the request has the X-Skypilot-Auth-Mode: token header, it must
       have a valid bearer token.
    2. For backwards compatibility, if the request has a Bearer token
       beginning with "sky_" (even if X-Skypilot-Auth-Mode is not present),
       it must be a valid token.
    3. If X-Skypilot-Auth-Mode is not set to "token", and there is no Bearer
       token beginning with "sky_", allow the request to continue.

    In conjunction with an auth proxy, the idea is to make the auth proxy
    bypass requests with bearer tokens, instead setting the
    X-Skypilot-Auth-Mode header. The auth proxy should either validate the
    auth or set the header X-Skypilot-Auth-Mode: token.

    Args:
        request: The incoming request.
        call_next: Callable that passes the request on to the next handler.

    Returns:
        The downstream response, or a 401 JSON response when the request
        must be validated but carries no usable bearer credentials.
    """
    has_skypilot_auth_header = (
        request.headers.get('X-Skypilot-Auth-Mode') == 'token')
    auth_header = request.headers.get('authorization')
    has_bearer_token_starting_with_sky = (
        auth_header and auth_header.lower().startswith('bearer ') and
        auth_header.split(' ', 1)[1].startswith('sky_'))

    if (not has_skypilot_auth_header and
            not has_bearer_token_starting_with_sky):
        # This is case #3 above. We do not need to validate the request.
        # No Bearer token, continue with normal processing (OAuth2 cookies,
        # etc.)
        return await call_next(request)
    # After this point, all requests must be validated.

    if auth_header is None:
        return fastapi.responses.JSONResponse(
            status_code=401, content={'detail': 'Authentication required'})

    # Extract token. `partition` never raises, unlike indexing the result of
    # `split`, when the header has a scheme but no token part.
    scheme, separator, sa_token = auth_header.partition(' ')
    if scheme.lower() != 'bearer':
        return fastapi.responses.JSONResponse(
            status_code=401,
            content={'detail': 'Invalid authentication method'})
    if not separator:
        # The header is exactly "Bearer" with no token; reject it instead of
        # failing with an IndexError (which would surface as a 500).
        return fastapi.responses.JSONResponse(
            status_code=401, content={'detail': 'Authentication required'})

    # Handle SkyPilot service account tokens
    return await self._handle_service_account_token(request, sa_token,
                                                    call_next)
|
303
325
|
|
304
326
|
async def _handle_service_account_token(self, request: fastapi.Request,
|
305
327
|
sa_token: str, call_next):
|
sky/setup_files/dependencies.py
CHANGED
sky/skylet/constants.py
CHANGED
sky/task.py
CHANGED
@@ -884,6 +884,18 @@ class Task:
|
|
884
884
|
def volumes(self) -> Dict[str, str]:
|
885
885
|
return self._volumes
|
886
886
|
|
887
|
+
def set_volumes(self, volumes: Dict[str, str]) -> None:
    """Sets the volumes for this task.

    The mapping is copied, so later changes to the task's volumes (e.g. via
    ``update_volumes``) do not modify the dict passed in by the caller, and
    later changes to the caller's dict do not leak into the task.

    Args:
      volumes: a dict of ``{mount_path: volume_name}``.
    """
    self._volumes = dict(volumes)
|
894
|
+
|
895
|
+
def update_volumes(self, volumes: Dict[str, str]) -> None:
    """Merges ``volumes`` into the volumes already set on this task.

    Entries whose mount path already exists are overwritten.

    Args:
      volumes: a dict of ``{mount_path: volume_name}``.
    """
    for mount_path, volume_name in volumes.items():
        self._volumes[mount_path] = volume_name
|
898
|
+
|
887
899
|
def update_envs(
|
888
900
|
self, envs: Union[None, List[Tuple[str, str]],
|
889
901
|
Dict[str, str]]) -> 'Task':
|
@@ -1500,7 +1512,7 @@ class Task:
|
|
1500
1512
|
d[k] = v
|
1501
1513
|
return d
|
1502
1514
|
|
1503
|
-
def to_yaml_config(self, redact_secrets: bool =
|
1515
|
+
def to_yaml_config(self, redact_secrets: bool = False) -> Dict[str, Any]:
|
1504
1516
|
"""Returns a yaml-style dict representation of the task.
|
1505
1517
|
|
1506
1518
|
INTERNAL: this method is internal-facing.
|
sky/utils/dag_utils.py
CHANGED
@@ -147,11 +147,13 @@ def load_chain_dag_from_yaml_str(
|
|
147
147
|
return _load_chain_dag(configs, env_overrides, secrets_overrides)
|
148
148
|
|
149
149
|
|
150
|
-
def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag
|
150
|
+
def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag,
|
151
|
+
redact_secrets: bool = False) -> str:
|
151
152
|
"""Dumps a chain DAG to a YAML string.
|
152
153
|
|
153
154
|
Args:
|
154
155
|
dag: the DAG to dump.
|
156
|
+
redact_secrets: whether to redact secrets in the YAML string.
|
155
157
|
|
156
158
|
Returns:
|
157
159
|
The YAML string.
|
@@ -159,7 +161,7 @@ def dump_chain_dag_to_yaml_str(dag: dag_lib.Dag) -> str:
|
|
159
161
|
assert dag.is_chain(), dag
|
160
162
|
configs = [{'name': dag.name}]
|
161
163
|
for task in dag.tasks:
|
162
|
-
configs.append(task.to_yaml_config())
|
164
|
+
configs.append(task.to_yaml_config(redact_secrets=redact_secrets))
|
163
165
|
return common_utils.dump_yaml_str(configs)
|
164
166
|
|
165
167
|
|
sky/utils/log_utils.py
CHANGED
@@ -573,6 +573,74 @@ def readable_time_duration(start: Optional[float],
|
|
573
573
|
return diff
|
574
574
|
|
575
575
|
|
576
|
+
def human_duration(start: int, end: Optional[int] = None) -> str:
    """Calculates the time elapsed between two timestamps and returns
    it as a human-readable string, similar to Kubernetes' duration format.

    The elapsed time is truncated to whole seconds before formatting, so
    float timestamps (e.g. from ``time.time()``) are accepted as well.

    Args:
        start: The start time as a Unix timestamp (seconds since epoch).
        end: The end time as a Unix timestamp (seconds since epoch).
            If None, current time is used.

    Returns:
        A string representing the duration, e.g., "2d3h", "15m", "30s".
        Returns "0s" for zero, negative durations, or if the timestamp
        is invalid.
    """
    if not start or start <= 0:
        return '0s'

    if end is None:
        end = int(time.time())
    # Truncate to whole seconds so fractional inputs never leak into the
    # output (e.g. '30.7s').
    seconds = int(end - start)

    if seconds <= 0:
        return '0s'
    if seconds < 60 * 2:
        return f'{seconds}s'

    # The larger the duration, the coarser the precision that is shown.
    minutes = seconds // 60
    if minutes < 10:
        s = seconds % 60
        return f'{minutes}m{s}s' if s else f'{minutes}m'
    if minutes < 60 * 3:
        return f'{minutes}m'

    hours = seconds // (60 * 60)
    if hours < 8:
        m = minutes % 60
        return f'{hours}h{m}m' if m else f'{hours}h'
    if hours < 48:
        return f'{hours}h'

    days = hours // 24
    if hours < 24 * 8:
        h = hours % 24
        return f'{days}d{h}h' if h else f'{days}d'
    if hours < 24 * 365 * 2:
        return f'{days}d'

    years = days // 365
    if hours < 24 * 365 * 8:
        dy = days % 365
        return f'{years}y{dy}d' if dy else f'{years}y'
    return f'{years}y'
|
642
|
+
|
643
|
+
|
576
644
|
def follow_logs(
|
577
645
|
file: TextIO,
|
578
646
|
*,
|