skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend.py +5 -3
- sky/backends/backend_utils.py +22 -7
- sky/backends/cloud_vm_ray_backend.py +50 -18
- sky/backends/local_docker_backend.py +8 -3
- sky/client/cli/command.py +25 -10
- sky/client/sdk.py +51 -1
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/core.py +9 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
- sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +29 -9
- sky/execution.py +13 -10
- sky/global_user_state.py +131 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/recovery_strategy.py +0 -3
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -11
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/kubernetes/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/api/responses.py +50 -1
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/common.py +2 -3
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +20 -5
- sky/server/requests/serializers/encoders.py +21 -8
- sky/server/server.py +57 -11
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/cli_utils/status_utils.py +2 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'bff0c2a2d33d0990092c7c33a532359ffe1b6c56'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250816'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/adaptors/nebius.py
CHANGED
@@ -1,7 +1,8 @@
 """Nebius cloud adaptor."""
+import asyncio
 import os
 import threading
-from typing import List, Optional
+from typing import Any, Awaitable, List, Optional
 
 from sky import sky_logging
 from sky import skypilot_config
@@ -9,8 +10,49 @@ from sky.adaptors import common
 from sky.utils import annotations
 from sky.utils import ux_utils
 
+# Default read timeout for nebius SDK
+READ_TIMEOUT = 10
+
 logger = sky_logging.init_logger(__name__)
 
+_loop_lock = threading.Lock()
+_loop = None
+
+
+def _get_event_loop() -> asyncio.AbstractEventLoop:
+    """Get event loop for nebius sdk."""
+    global _loop
+
+    if _loop is not None:
+        return _loop
+
+    with _loop_lock:
+        if _loop is None:
+            # Create a new event loop in a dedicated thread
+            _loop = asyncio.new_event_loop()
+            threading.Thread(target=_loop.run_forever, daemon=True).start()
+
+    return _loop
+
+
+def sync_call(awaitable: Awaitable[Any]) -> Any:
+    """Synchronously run an awaitable in coroutine.
+
+    This wrapper is used to workaround:
+    https://github.com/nebius/pysdk/issues/76
+
+    Uses a dedicated background event loop to avoid conflicts
+    with existing asyncio contexts and prevent BlockingIOError.
+    """
+    loop = _get_event_loop()
+    future = asyncio.run_coroutine_threadsafe(_coro(awaitable), loop)
+    return future.result()
+
+
+async def _coro(awaitable: Awaitable[Any]) -> Any:
+    """Wrapper coroutine for awaitable."""
+    return await awaitable
+
 
 def tenant_id_path() -> str:
     return '~/.nebius/NEBIUS_TENANT_ID.txt'
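For reference, a minimal usage sketch of the new sync_call wrapper. It mirrors the call site updated in sky/clouds/nebius.py later in this diff (the IAM profile lookup) and is an illustration, not a new API:

from sky.adaptors import nebius

sdk = nebius.sdk()
profile_client = nebius.iam().ProfileServiceClient(sdk)
# Schedules the async SDK call on the adaptor's dedicated background
# event loop and blocks until the result is available.
profile = nebius.sync_call(
    profile_client.get(nebius.iam().GetProfileRequest(),
                       timeout=nebius.READ_TIMEOUT))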
sky/backends/backend.py
CHANGED
@@ -147,8 +147,9 @@ class Backend(Generic[_ResourceHandleType]):
     def teardown(self,
                  handle: _ResourceHandleType,
                  terminate: bool,
-                 purge: bool = False
-
+                 purge: bool = False,
+                 explicitly_requested: bool = False) -> None:
+        self._teardown(handle, terminate, purge, explicitly_requested)
 
     def register_info(self, **kwargs) -> None:
         """Register backend-specific information."""
@@ -200,5 +201,6 @@ class Backend(Generic[_ResourceHandleType]):
     def _teardown(self,
                   handle: _ResourceHandleType,
                   terminate: bool,
-                  purge: bool = False
+                  purge: bool = False,
+                  explicitly_requested: bool = False):
         raise NotImplementedError
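Every concrete backend overrides _teardown, so the signature change above ripples through cloud_vm_ray_backend.py and local_docker_backend.py below. A sketch of the updated contract, with a hypothetical subclass name; explicitly_requested is threaded down to decide whether cluster events are removed (see the remove_events= arguments in the following hunks):

from sky.backends import backend

class MyBackend(backend.Backend):
    """Hypothetical backend, showing the updated override signature."""

    def _teardown(self,
                  handle,
                  terminate: bool,
                  purge: bool = False,
                  explicitly_requested: bool = False):
        # explicitly_requested=True means the user ran `sky down` directly
        # (see sky/core.py below), as opposed to internal cleanup paths.
        ...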
sky/backends/backend_utils.py
CHANGED
@@ -2017,7 +2017,15 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     if handle.cluster_yaml is None:
         # Remove cluster from db since this cluster does not have a config file
         # or any other ongoing requests
-        global_user_state.
+        global_user_state.add_cluster_event(
+            cluster_name,
+            None,
+            'Cluster has no YAML file. Removing the cluster from cache.',
+            global_user_state.ClusterEventType.STATUS_CHANGE,
+            nop_if_duplicate=True)
+        global_user_state.remove_cluster(cluster_name,
+                                         terminate=True,
+                                         remove_events=True)
         logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
                      'Removing the cluster from cache.')
         return None
@@ -2137,7 +2145,7 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         global_user_state.add_cluster_event(
             cluster_name,
             status_lib.ClusterStatus.UP,
-            'All nodes up
+            'All nodes up; SkyPilot runtime healthy.',
             global_user_state.ClusterEventType.STATUS_CHANGE,
             nop_if_duplicate=True)
         global_user_state.add_or_update_cluster(cluster_name,
@@ -2277,9 +2285,12 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                     -1,
                     autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
                     stream_logs=False)
-            except exceptions.CommandError
+            except (exceptions.CommandError,
+                    grpc.FutureTimeoutError) as e:
                 success = False
-                if e.
+                if isinstance(e, grpc.FutureTimeoutError) or (
+                        isinstance(e, exceptions.CommandError) and
+                        e.returncode == 255):
                     word = 'autostopped' if noun == 'autostop' else 'autodowned'
                     logger.debug(f'The cluster is likely {word}.')
                     reset_local_autostop = False
@@ -2329,10 +2340,14 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         # are only stored for an hour by default), so it is possible that
         # the previous event has a status reason, but now it does not.
         init_reason_regex = f'^Cluster is abnormal because {init_reason} .*'
+        log_message = f'Cluster is abnormal because {init_reason}'
+        if status_reason:
+            log_message += f' ({status_reason})'
+        log_message += '. Transitioned to INIT.'
         global_user_state.add_cluster_event(
             cluster_name,
             status_lib.ClusterStatus.INIT,
-
+            log_message,
             global_user_state.ClusterEventType.STATUS_CHANGE,
             nop_if_duplicate=True,
             duplicate_regex=init_reason_regex)
@@ -2345,10 +2360,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
    # STOPPED.
     backend = backends.CloudVmRayBackend()
-    backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
     global_user_state.add_cluster_event(
-        cluster_name, None, 'All nodes
+        cluster_name, None, 'All nodes terminated, cleaning up the cluster.',
         global_user_state.ClusterEventType.STATUS_CHANGE)
+    backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
     return global_user_state.get_cluster_from_name(cluster_name)
 
 
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -1368,8 +1368,11 @@ class RetryingVmProvisioner(object):
         if not dryrun:
             os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
             os.system(f'touch {log_path}')
+
         rich_utils.force_update_status(
-            ux_utils.spinner_message('Launching',
+            ux_utils.spinner_message('Launching',
+                                     log_path,
+                                     cluster_name=cluster_name))
 
         # Get previous cluster status
         cluster_exists = prev_cluster_status is not None
@@ -1539,6 +1542,7 @@ class RetryingVmProvisioner(object):
             requested_resources=requested_resources,
             ready=False,
             is_managed=self._is_managed,
+            provision_log_path=log_abs_path,
         )
 
         # Add cluster event for actual provisioning start.
@@ -1684,7 +1688,9 @@ class RetryingVmProvisioner(object):
                 config_dict['handle'] = handle
                 logger.info(
                     ux_utils.finishing_message(
-                        f'Cluster launched: {cluster_name!r}.',
+                        f'Cluster launched: {cluster_name!r}.',
+                        log_path,
+                        cluster_name=cluster_name))
                 return config_dict
 
             # The cluster is not ready. We must perform error recording and/or
@@ -1818,7 +1824,8 @@ class RetryingVmProvisioner(object):
             log_abs_path,
             stream_logs=False,
             start_streaming_at='Shared connection to',
-            line_processor=log_utils.RayUpLineProcessor(
+            line_processor=log_utils.RayUpLineProcessor(
+                log_abs_path, cluster_name=cluster_handle.cluster_name),
             # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
             # time during 'ray up' if insufficient capacity occurs.
             env=dict(
@@ -3120,7 +3127,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 is_managed=self._is_managed)
         log_path = os.path.join(self.log_dir, 'provision.log')
         rich_utils.force_update_status(
-            ux_utils.spinner_message('Launching',
+            ux_utils.spinner_message('Launching',
+                                     log_path,
+                                     cluster_name=cluster_name))
         config_dict = retry_provisioner.provision_with_retries(
             task, to_provision_config, dryrun, stream_logs,
             skip_unnecessary_provisioning)
@@ -3159,8 +3168,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # Do not remove the stopped cluster from the global state
             # if failed to start.
             if not e.no_failover:
+                global_user_state.add_cluster_event(
+                    cluster_name,
+                    None,
+                    'Provision failed: ' + str(e),
+                    global_user_state.ClusterEventType.STATUS_CHANGE,
+                    nop_if_duplicate=True)
                 global_user_state.remove_cluster(cluster_name,
-                                                 terminate=True
+                                                 terminate=True,
+                                                 remove_events=False)
                 usage_lib.messages.usage.update_final_cluster_status(
                     None)
             logger.error(
@@ -3962,7 +3978,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _teardown(self,
                   handle: CloudVmRayResourceHandle,
                   terminate: bool,
-                  purge: bool = False
+                  purge: bool = False,
+                  explicitly_requested: bool = False):
         """Tear down or stop the cluster.
 
         Args:
@@ -4037,7 +4054,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # ClusterOwnerIdentityMismatchError. The argument/flag
                     # `purge` should bypass such ID mismatch errors.
                     refresh_cluster_status=(
-                        not is_identity_mismatch_and_purge)
+                        not is_identity_mismatch_and_purge),
+                    explicitly_requested=explicitly_requested)
                 if terminate:
                     lock.force_unlock()
                 break
@@ -4418,7 +4436,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                           purge: bool = False,
                           post_teardown_cleanup: bool = True,
                           refresh_cluster_status: bool = True,
-                          remove_from_db: bool = True
+                          remove_from_db: bool = True,
+                          explicitly_requested: bool = False) -> None:
         """Teardown the cluster without acquiring the cluster status lock.
 
         NOTE: This method should not be called without holding the cluster
@@ -4482,7 +4501,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                            f'provision yaml so it '
                            'has not been provisioned. Skipped.')
             global_user_state.remove_cluster(handle.cluster_name,
-                                             terminate=terminate
+                                             terminate=terminate,
+                                             remove_events=False)
             return
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
@@ -4539,8 +4559,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 raise
 
         if post_teardown_cleanup:
-            self.post_teardown_cleanup(
-
+            self.post_teardown_cleanup(
+                handle,
+                terminate,
+                purge,
+                remove_from_db,
+                explicitly_requested=explicitly_requested)
             return
 
         if (isinstance(cloud, clouds.IBM) and terminate and
@@ -4640,7 +4664,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                               terminate: bool,
                               purge: bool = False,
                               remove_from_db: bool = True,
-                              failover: bool = False
+                              failover: bool = False,
+                              explicitly_requested: bool = False) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.
 
         This method will handle the following cleanup steps:
@@ -4819,7 +4844,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         if not terminate or remove_from_db:
             global_user_state.remove_cluster(handle.cluster_name,
-                                             terminate=terminate
+                                             terminate=terminate,
+                                             remove_events=explicitly_requested)
 
     def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
         """Remove the YAML config of a cluster."""
@@ -4928,11 +4954,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # We cannot check if the cluster is autostopping.
             return False
         if handle.is_grpc_enabled:
-
-
-
-
-
+            try:
+                request = autostopv1_pb2.IsAutostoppingRequest()
+                response = backend_utils.invoke_skylet_with_retries(
+                    handle, lambda: SkyletClient(handle.get_grpc_channel()).
+                    is_autostopping(request))
+                return response.is_autostopping
+            except Exception as e:  # pylint: disable=broad-except
+                # The cluster may have been terminated, causing the gRPC call
+                # to timeout and fail.
+                logger.debug(f'Failed to check if cluster is autostopping: {e}')
+                return False
         else:
             logger.info(
                 'Using legacy remote execution for is_autostopping on '
sky/backends/local_docker_backend.py
CHANGED
@@ -256,7 +256,9 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
             logger.error(
                 'Unable to run container - nvidia runtime for docker not '
                 'found. Have you installed nvidia-docker on your machine?')
-            global_user_state.remove_cluster(cluster_name,
+            global_user_state.remove_cluster(cluster_name,
+                                             terminate=True,
+                                             remove_events=False)
             raise e
         self.containers[handle] = container
         logger.info(
@@ -323,7 +325,8 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
     def _teardown(self,
                   handle: LocalDockerResourceHandle,
                   terminate: bool,
-                  purge: bool = False
+                  purge: bool = False,
+                  explicitly_requested: bool = False):
         """Teardown kills the container."""
         del purge  # Unused.
         if not terminate:
@@ -339,7 +342,9 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
         container.remove(force=True)
         cluster_name = handle.get_cluster_name()
 
-        global_user_state.remove_cluster(cluster_name,
+        global_user_state.remove_cluster(cluster_name,
+                                         terminate=True,
+                                         remove_events=explicitly_requested)
 
 # --- Utilities ---
 
sky/client/cli/command.py
CHANGED
@@ -60,6 +60,7 @@ from sky.client.cli import git
 from sky.data import storage_utils
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
 from sky.server import common as server_common
 from sky.server import constants as server_constants
 from sky.server.requests import requests
@@ -123,7 +124,7 @@ def _get_cluster_records_and_set_ssh_config(
     clusters: Optional[List[str]],
     refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
     all_users: bool = False,
-) -> List[
+) -> List[responses.StatusResponse]:
     """Returns a list of clusters that match the glob pattern.
 
     Args:
@@ -1562,7 +1563,7 @@ def _status_kubernetes(show_all: bool):
 
 
 def _show_endpoint(query_clusters: Optional[List[str]],
-                   cluster_records: List[
+                   cluster_records: List[responses.StatusResponse], ip: bool,
                    endpoints: bool, endpoint: Optional[int]) -> None:
     show_endpoints = endpoints or endpoint is not None
     show_single_endpoint = endpoint is not None
@@ -2171,6 +2172,10 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
 
 @cli.command()
 @flags.config_option(expose_value=False)
+@click.option('--provision',
+              is_flag=True,
+              default=False,
+              help='Stream the cluster provisioning logs (provision.log).')
 @click.option(
     '--sync-down',
     '-s',
@@ -2207,6 +2212,7 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
 def logs(
     cluster: str,
     job_ids: Tuple[str, ...],
+    provision: bool,
     sync_down: bool,
     status: bool,  # pylint: disable=redefined-outer-name
     follow: bool,
@@ -2236,6 +2242,11 @@ def logs(
     4. If the job fails or fetching the logs fails, the command will exit with
        a non-zero return code.
     """
+    if provision and (sync_down or status or job_ids):
+        raise click.UsageError(
+            '--provision cannot be combined with job log options '
+            '(--sync-down/--status/job IDs).')
+
     if sync_down and status:
         raise click.UsageError(
             'Both --sync_down and --status are specified '
@@ -2248,6 +2259,10 @@ def logs(
 
     job_ids = None if not job_ids else job_ids
 
+    if provision:
+        # Stream provision logs
+        sys.exit(sdk.tail_provision_logs(cluster, follow=follow, tail=tail))
+
     if sync_down:
         with rich_utils.client_status(
                 ux_utils.spinner_message('Downloading logs')):
@@ -4786,7 +4801,7 @@ def pool():
               type=str,
               nargs=-1,
               **_get_shell_complete_args(_complete_file_name))
-@click.option('--pool
+@click.option('--pool',
              '-p',
              default=None,
              type=str,
@@ -4808,7 +4823,7 @@ def pool():
 @usage_lib.entrypoint
 def jobs_pool_apply(
     pool_yaml: Tuple[str, ...],
-
+    pool: Optional[str],  # pylint: disable=redefined-outer-name
     workdir: Optional[str],
     infra: Optional[str],
     cloud: Optional[str],
@@ -4841,11 +4856,11 @@ def jobs_pool_apply(
     """
     cloud, region, zone = _handle_infra_cloud_region_zone_options(
         infra, cloud, region, zone)
-    if
-
+    if pool is None:
+        pool = serve_lib.generate_service_name(pool=True)
 
     task = _generate_task_with_service(
-        service_name=
+        service_name=pool,
         service_yaml_args=pool_yaml,
         workdir=workdir,
         cloud=cloud,
@@ -4882,7 +4897,7 @@ def jobs_pool_apply(
     dag.add(task)
 
     request_id = managed_jobs.pool_apply(task,
-
+                                         pool,
                                          mode=serve_lib.UpdateMode(mode),
                                          _need_confirmation=not yes)
     _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
@@ -5120,7 +5135,7 @@ def _handle_serve_logs(
 @usage_lib.entrypoint
 # TODO(tian): Add default argument for this CLI if none of the flags are
 # specified.
-def
+def jobs_pool_logs(
     pool_name: str,
     follow: bool,
     controller: bool,
@@ -6037,7 +6052,7 @@ def api_logs(request_id: Optional[str], server_logs: bool,
     # server accepts log_path-only streaming.
     req_id = (server_common.RequestId[None](request_id)
               if request_id is not None else None)
-    sdk.stream_and_get(req_id, log_path, tail, follow
+    sdk.stream_and_get(req_id, log_path, tail, follow)
 
 
 @api.command('cancel', cls=_DocumentedCodeCommand)
sky/client/sdk.py
CHANGED
@@ -855,6 +855,56 @@ def tail_logs(cluster_name: str,
                     resumable=(tail == 0))
 
 
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(17)
+@annotations.client_api
+@rest.retry_transient_errors()
+def tail_provision_logs(cluster_name: str,
+                        follow: bool = True,
+                        tail: int = 0,
+                        output_stream: Optional['io.TextIOBase'] = None) -> int:
+    """Tails the provisioning logs (provision.log) for a cluster.
+
+    Args:
+        cluster_name: name of the cluster.
+        follow: follow the logs.
+        tail: lines from end to tail.
+        output_stream: optional stream to write logs.
+    Returns:
+        Exit code 0 on streaming success; raises on HTTP error.
+    """
+    body = payloads.ClusterNameBody(cluster_name=cluster_name)
+    params = {
+        'follow': str(follow).lower(),
+        'tail': tail,
+    }
+    response = server_common.make_authenticated_request(
+        'POST',
+        '/provision_logs',
+        json=json.loads(body.model_dump_json()),
+        params=params,
+        stream=True,
+        timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
+                 None))
+    # Log request is idempotent when tail is 0, thus can resume previous
+    # streaming point on retry.
+    # request_id=None here because /provision_logs does not create an async
+    # request. Instead, it streams a plain file from the server. This does NOT
+    # violate the stream_response doc warning about None in multi-user
+    # environments: we are not asking stream_response to select “the latest
+    # request”. We already have the HTTP response to stream; request_id=None
+    # merely disables the follow-up GET. It is also necessary for --no-follow
+    # to return cleanly after printing the tailed lines. If we provided a
+    # non-None request_id here, the get(request_id) in stream_response(
+    # would fail since /provision_logs does not create a request record.
+    stream_response(request_id=None,
+                    response=response,
+                    output_stream=output_stream,
+                    resumable=(tail == 0))
+    return 0
+
+
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
@@ -1322,7 +1372,7 @@ def status(
     cluster_names: Optional[List[str]] = None,
     refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
     all_users: bool = False,
-) -> server_common.RequestId[List[
+) -> server_common.RequestId[List[responses.StatusResponse]]:
     """Gets cluster statuses.
 
     If cluster_names is given, return those clusters. Otherwise, return all
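The new CLI flag `sky logs --provision <cluster>` is a thin wrapper over this SDK function. A minimal sketch of calling it programmatically (the cluster name is illustrative):

import sys

from sky.client import sdk

# Tail the last 100 lines of the cluster's provision.log without following;
# tail_provision_logs() returns 0 once streaming completes.
exit_code = sdk.tail_provision_logs('my-cluster', follow=False, tail=100)
sys.exit(exit_code)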
sky/clouds/kubernetes.py
CHANGED
@@ -3,7 +3,6 @@ import os
 import re
 import subprocess
 import tempfile
-import typing
 from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
 
 import colorama
@@ -11,6 +10,7 @@ import colorama
 from sky import catalog
 from sky import clouds
 from sky import exceptions
+from sky import resources as resources_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import kubernetes
@@ -31,10 +31,6 @@ from sky.utils import resources_utils
 from sky.utils import schemas
 from sky.utils import volume as volume_lib
 
-if typing.TYPE_CHECKING:
-    # Renaming to avoid shadowing variables.
-    from sky import resources as resources_lib
-
 logger = sky_logging.init_logger(__name__)
 
 # Namespace for SkyPilot resources shared across multiple tenants on the
@@ -773,7 +769,7 @@ class Kubernetes(clouds.Cloud):
 
     @staticmethod
     def _warn_on_disk_size(resources: 'resources_lib.Resources'):
-        if resources.disk_size
+        if resources.disk_size != resources_lib.DEFAULT_DISK_SIZE_GB:
             logger.info(f'{colorama.Style.DIM}Disk size {resources.disk_size} '
                         'is not supported by Kubernetes. '
                         'To add additional disk, use volumes.'
sky/clouds/nebius.py
CHANGED
@@ -442,7 +442,9 @@ class Nebius(clouds.Cloud):
         del workspace_config  # Unused
         sdk = nebius.sdk()
         profile_client = nebius.iam().ProfileServiceClient(sdk)
-        profile =
+        profile = nebius.sync_call(
+            profile_client.get(nebius.iam().GetProfileRequest(),
+                               timeout=nebius.READ_TIMEOUT))
         if profile.user_profile is not None:
             if profile.user_profile.attributes is None:
                 raise exceptions.CloudUserIdentityError(
sky/core.py
CHANGED
@@ -25,6 +25,7 @@ from sky.clouds import cloud as sky_cloud
 from sky.jobs.server import core as managed_jobs_core
 from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
@@ -95,7 +96,7 @@ def status(
     cluster_names: Optional[Union[str, List[str]]] = None,
     refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
     all_users: bool = False,
-) -> List[
+) -> List[responses.StatusResponse]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets cluster statuses.
 
@@ -171,7 +172,9 @@ def status(
     clusters = backend_utils.get_clusters(refresh=refresh,
                                           cluster_names=cluster_names,
                                           all_users=all_users)
-    return
+    return [
+        responses.StatusResponse.model_validate(cluster) for cluster in clusters
+    ]
 
 
 def status_kubernetes(
@@ -593,7 +596,10 @@ def down(cluster_name: str, purge: bool = False) -> None:
 
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
     backend = backend_utils.get_backend_from_handle(handle)
-    backend.teardown(handle,
+    backend.teardown(handle,
+                     terminate=True,
+                     purge=purge,
+                     explicitly_requested=True)
 
 
 @usage_lib.entrypoint
sky/dashboard/out/404.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-b6987eb47888da9c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js" defer=""></script><script src="/dashboard/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/yW7-Bc1l0EwIosbauU8LZ/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"yW7-Bc1l0EwIosbauU8LZ","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>