skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250816__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of skypilot-nightly for review.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend.py +5 -3
- sky/backends/backend_utils.py +22 -7
- sky/backends/cloud_vm_ray_backend.py +50 -18
- sky/backends/local_docker_backend.py +8 -3
- sky/client/cli/command.py +25 -10
- sky/client/sdk.py +51 -1
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/core.py +9 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1121-2edb8ab2ba080a76.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-2f60a90b7d76838e.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-fd15b3ff228f7738.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.bc5d2853355c9c47.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/{4725.29550342bd53afd8.js → 4725.10f7a9a5d3ea8208.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-e6f350f567182e87.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +36 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-89a84fd7fa31362d.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-ec747e4f2dc39b57.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-81351f95f3bec08e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-c320641c2bcbbea6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-4b3ba1792dc6f21d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-65f72dee417237ef.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-338de9df523d883a.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-b6987eb47888da9c.js +1 -0
- sky/dashboard/out/_next/static/yW7-Bc1l0EwIosbauU8LZ/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +29 -9
- sky/execution.py +13 -10
- sky/global_user_state.py +131 -2
- sky/jobs/constants.py +1 -1
- sky/jobs/recovery_strategy.py +0 -3
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -11
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/kubernetes/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/api/responses.py +50 -1
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/common.py +2 -3
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +20 -5
- sky/server/requests/serializers/encoders.py +21 -8
- sky/server/server.py +57 -11
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/cli_utils/status_utils.py +2 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/RECORD +107 -106
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-13d53fffc03ccb52.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-fc9222e26c8e2f0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → yW7-Bc1l0EwIosbauU8LZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250816.dist-info}/top_level.txt +0 -0
sky/serve/service.py
CHANGED

@@ -113,6 +113,9 @@ def cleanup_storage(task_yaml: str) -> bool:
     return not failed
 
 
+# NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
+# because we killed all the processes (controller & replica manager) before
+# calling this function.
 def _cleanup(service_name: str) -> bool:
     """Clean up all service related resources, i.e. replicas and storage."""
     # Cleanup the HA recovery script first as it is possible that some error
@@ -135,28 +138,59 @@ def _cleanup(service_name: str) -> bool:
             continue
         p = multiprocessing.Process(target=replica_managers.terminate_cluster,
                                     args=(info.cluster_name,))
-        p.start()
         info2proc[info] = p
         # Set replica status to `SHUTTING_DOWN`
         info.status_property.sky_launch_status = (
-            replica_managers.ProcessStatus.SUCCEEDED)
+            replica_managers.common_utils.ProcessStatus.SUCCEEDED)
         info.status_property.sky_down_status = (
-            replica_managers.ProcessStatus.SCHEDULED)
+            replica_managers.common_utils.ProcessStatus.SCHEDULED)
         serve_state.add_or_update_replica(service_name, info.replica_id, info)
-        logger.info(f'
-
-
-
-
-
-
-
-
-
-
-
-
+        logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
+
+    def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
+        nonlocal failed
+        # Set replica status to `FAILED_CLEANUP`
+        info.status_property.sky_down_status = (
+            replica_managers.common_utils.ProcessStatus.FAILED)
+        serve_state.add_or_update_replica(service_name, info.replica_id, info)
+        failed = True
+        logger.error(f'Replica {info.replica_id} failed to terminate.')
+
+    # Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
+    # TODO(tian): Refactor to use the same logic and code.
+    while info2proc:
+        snapshot = list(info2proc.items())
+        for info, p in snapshot:
+            if p.is_alive():
+                continue
+            if (info.status_property.sky_down_status ==
+                    replica_managers.common_utils.ProcessStatus.SCHEDULED):
+                if controller_utils.can_terminate():
+                    try:
+                        p.start()
+                    except Exception as e:  # pylint: disable=broad-except
+                        _set_to_failed_cleanup(info)
+                        logger.error(f'Failed to start process for replica '
+                                     f'{info.replica_id}: {e}')
+                        del info2proc[info]
+                    else:
+                        info.status_property.sky_down_status = (
+                            common_utils.ProcessStatus.RUNNING)
+                        serve_state.add_or_update_replica(
+                            service_name, info.replica_id, info)
+            else:
+                logger.info('Terminate process for replica '
+                            f'{info.replica_id} finished.')
+                p.join()
+                del info2proc[info]
+                if p.exitcode == 0:
+                    serve_state.remove_replica(service_name, info.replica_id)
+                    logger.info(
+                        f'Replica {info.replica_id} terminated successfully.')
+                else:
+                    _set_to_failed_cleanup(info)
+        time.sleep(3)
+
     versions = serve_state.get_service_versions(service_name)
     serve_state.remove_service_versions(service_name)
 
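The rewritten _cleanup no longer starts every terminate process eagerly: replicas are first marked SCHEDULED, and a process is started only when controller_utils.can_terminate() permits. A minimal standalone sketch of this schedule-then-gate pattern, with a hypothetical fixed cap MAX_PARALLEL standing in for can_terminate():

# Minimal sketch of the schedule-then-gate pattern used by _cleanup above.
# MAX_PARALLEL is a hypothetical stand-in for controller_utils.can_terminate().
import multiprocessing
import time

MAX_PARALLEL = 4

def terminate(name: str) -> None:
    print(f'terminating {name}')

def drain(names: list) -> None:
    # Create (but do not start) one process per pending termination.
    pending = {n: multiprocessing.Process(target=terminate, args=(n,))
               for n in names}
    running = {}
    while pending or running:
        # Reap finished processes.
        for n, p in list(running.items()):
            if not p.is_alive():
                p.join()
                del running[n]
        # Start more work only while under the parallelism cap.
        while pending and len(running) < MAX_PARALLEL:
            n, p = pending.popitem()
            p.start()
            running[n] = p
        time.sleep(0.1)

if __name__ == '__main__':
    drain([f'replica-{i}' for i in range(10)])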
sky/server/common.py
CHANGED

@@ -5,7 +5,6 @@ import enum
 import functools
 from http.cookiejar import CookieJar
 from http.cookiejar import MozillaCookieJar
-import json
 import os
 import pathlib
 import re
@@ -372,7 +371,7 @@ def _handle_non_200_server_status(
                 '') == ApiServerStatus.VERSION_MISMATCH.value):
             return ApiServerInfo(status=ApiServerStatus.VERSION_MISMATCH,
                                  error=body.get('message', ''))
-    except json.JSONDecodeError:
+    except requests.JSONDecodeError:
         pass
     return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
 
@@ -463,7 +462,7 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
             # OAuth.
             set_api_cookie_jar(cookies, create_if_not_exists=True)
             return server_info
-        except (json.JSONDecodeError, AttributeError) as e:
+        except (requests.JSONDecodeError, AttributeError) as e:
             # Try to check if we got redirected to a login page.
             for prev_response in response.history:
                 logger.debug(f'Previous response: {prev_response.url}')
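The switch to requests.JSONDecodeError matters because Response.json() raises that exception (available in requests>=2.27), which also covers the simplejson backend that a bare json.JSONDecodeError handler would miss. A minimal sketch of the pattern, against a hypothetical endpoint that returns non-JSON:

# Minimal sketch: Response.json() on a non-JSON body raises
# requests.JSONDecodeError (requests>=2.27), regardless of whether the
# stdlib json or simplejson backend is in use.
import requests

resp = requests.get('https://example.com')  # hypothetical non-JSON endpoint
try:
    body = resp.json()
except requests.JSONDecodeError:
    body = None  # fall back, as _handle_non_200_server_status does with `pass`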
sky/server/constants.py
CHANGED

@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION =
+API_VERSION = 17
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/requests/payloads.py
CHANGED

@@ -497,6 +497,12 @@ class JobsQueueBody(RequestBody):
     skip_finished: bool = False
     all_users: bool = False
     job_ids: Optional[List[int]] = None
+    user_match: Optional[str] = None
+    workspace_match: Optional[str] = None
+    name_match: Optional[str] = None
+    pool_match: Optional[str] = None
+    page: Optional[int] = None
+    limit: Optional[int] = None
 
 
 class JobsCancelBody(RequestBody):
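These fields extend JobsQueueBody with server-side filtering and pagination. A hypothetical client-side construction, assuming the other RequestBody fields keep their defaults:

# Hypothetical construction; the field names come from the diff above.
from sky.server.requests import payloads

body = payloads.JobsQueueBody(
    skip_finished=True,
    name_match='train',  # filter jobs by name
    user_match=None,
    workspace_match=None,
    pool_match=None,
    page=1,              # pagination: page index
    limit=50,            # pagination: page size
)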
sky/server/requests/serializers/decoders.py
CHANGED

@@ -9,6 +9,7 @@ from sky import models
 from sky.catalog import common
 from sky.data import storage
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
 from sky.serve import serve_state
 from sky.server import constants as server_constants
 from sky.skylet import job_lib
@@ -50,13 +51,17 @@ def default_decode_handler(return_value: Any) -> Any:
 
 
 @register_decoders('status')
-def decode_status(
+def decode_status(
+        return_value: List[Dict[str, Any]]) -> List[responses.StatusResponse]:
     clusters = return_value
+    response = []
     for cluster in clusters:
         cluster['handle'] = decode_and_unpickle(cluster['handle'])
         cluster['status'] = status_lib.ClusterStatus(cluster['status'])
-
-
+        cluster['storage_mounts_metadata'] = decode_and_unpickle(
+            cluster['storage_mounts_metadata'])
+        response.append(responses.StatusResponse.model_validate(cluster))
+    return response
 
 
 @register_decoders('status_kubernetes')
@@ -102,8 +107,18 @@ def decode_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
 
 
 @register_decoders('jobs.queue')
-def decode_jobs_queue(return_value
-    jobs
+def decode_jobs_queue(return_value):
+    """Decode jobs queue response.
+
+    Supports legacy list, or a dict {jobs, total}.
+    - Returns list[job]
+    """
+    # Case 1: dict shape {jobs, total}
+    if isinstance(return_value, dict) and 'jobs' in return_value:
+        jobs = return_value.get('jobs', [])
+    else:
+        # Case 2: legacy list
+        jobs = return_value
     for job in jobs:
         job['status'] = managed_jobs.ManagedJobStatus(job['status'])
     return jobs
sky/server/requests/serializers/encoders.py
CHANGED

@@ -8,6 +8,7 @@ import pickle
 import typing
 from typing import Any, Dict, List, Optional, Tuple
 
+from sky.schemas.api import responses
 from sky.server import constants as server_constants
 
 if typing.TYPE_CHECKING:
@@ -51,13 +52,17 @@ def default_encoder(return_value: Any) -> Any:
 
 
 @register_encoder('status')
-def encode_status(
+def encode_status(
+        clusters: List[responses.StatusResponse]) -> List[Dict[str, Any]]:
+    response = []
     for cluster in clusters:
-
-
-
-
-
+        response_cluster = cluster.model_dump()
+        response_cluster['status'] = cluster['status'].value
+        response_cluster['handle'] = pickle_and_encode(cluster['handle'])
+        response_cluster['storage_mounts_metadata'] = pickle_and_encode(
+            response_cluster['storage_mounts_metadata'])
+        response.append(response_cluster)
+    return response
 
 
 @register_encoder('launch', 'exec', 'jobs.launch')
@@ -106,10 +111,18 @@ def encode_status_kubernetes(
 
 
 @register_encoder('jobs.queue')
-def encode_jobs_queue(
+def encode_jobs_queue(jobs_or_tuple):
+    # Support returning either a plain jobs list or a (jobs, total) tuple
+    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
+        jobs, total = jobs_or_tuple
+    else:
+        jobs = jobs_or_tuple
+        total = None
     for job in jobs:
        job['status'] = job['status'].value
-
+    if total is None:
+        return jobs
+    return {'jobs': jobs, 'total': total}
 
 
 def _encode_serve_status(
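Paired with decode_jobs_queue above, this keeps old and new peers compatible: a tuple-returning server encodes to {'jobs': ..., 'total': ...}, a legacy list passes through unchanged, and the decoder accepts either shape. A self-contained sketch of the round trip, with statuses simplified to plain strings instead of ManagedJobStatus enums:

# Round trip of the two payload shapes handled above (simplified statuses).
def encode(jobs_or_tuple):
    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
        jobs, total = jobs_or_tuple
        return {'jobs': jobs, 'total': total}
    return jobs_or_tuple

def decode(payload):
    return payload['jobs'] if isinstance(payload, dict) else payload

legacy = encode([{'id': 1, 'status': 'RUNNING'}])        # plain list
paged = encode(([{'id': 1, 'status': 'RUNNING'}], 42))   # (jobs, total)
assert decode(legacy) == decode(paged)  # both yield the same jobs list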
sky/server/server.py
CHANGED

@@ -792,8 +792,6 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
     ctx.override_envs(validate_body.env_vars)
 
     def validate_dag(dag: dag_utils.dag_lib.Dag):
-        # Resolve the volumes before admin policy and validation.
-        dag.resolve_and_validate_volumes()
         # TODO: Admin policy may contain arbitrary code, which may be expensive
         # to run and may block the server thread. However, moving it into the
         # executor adds a ~150ms penalty on the local API server because of
@@ -802,6 +800,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
         with admin_policy_utils.apply_and_use_config_in_current_request(
                 dag,
                 request_options=validate_body.get_request_options()) as dag:
+            dag.resolve_and_validate_volumes()
             # Skip validating workdir and file_mounts, as those need to be
             # validated after the files are uploaded to the SkyPilot API server
             # with `upload_mounts_to_api_server`.
@@ -1233,7 +1232,8 @@ async def download_logs(
 
 
 @app.post('/download')
-async def download(download_body: payloads.DownloadBody) -> None:
+async def download(download_body: payloads.DownloadBody,
+                   request: fastapi.Request) -> None:
     """Downloads a folder from the cluster to the local machine."""
     folder_paths = [
         pathlib.Path(folder_path) for folder_path in download_body.folder_paths
@@ -1262,7 +1262,16 @@ async def download(download_body: payloads.DownloadBody) -> None:
         str(folder_path.expanduser().resolve())
         for folder_path in folder_paths
     ]
-    storage_utils.zip_files_and_folders(folders, zip_path)
+    # Check for optional query parameter to control zip entry structure
+    relative = request.query_params.get('relative', 'home')
+    if relative == 'items':
+        # Dashboard-friendly: entries relative to selected folders
+        storage_utils.zip_files_and_folders(folders,
+                                            zip_path,
+                                            relative_to_items=True)
+    else:
+        # CLI-friendly (default): entries with full paths for mapping
+        storage_utils.zip_files_and_folders(folders, zip_path)
 
     # Add home path to the response headers, so that the client can replace
     # the remote path in the zip file to the local path.
@@ -1284,6 +1293,46 @@ async def download(download_body: payloads.DownloadBody) -> None:
             detail=f'Error creating zip file: {str(e)}')
 
 
+@app.post('/provision_logs')
+async def provision_logs(cluster_body: payloads.ClusterNameBody,
+                         follow: bool = True,
+                         tail: int = 0) -> fastapi.responses.StreamingResponse:
+    """Streams the provision.log for the latest launch request of a cluster."""
+    # Prefer clusters table first, then cluster_history as fallback.
+    log_path_str = global_user_state.get_cluster_provision_log_path(
+        cluster_body.cluster_name)
+    if not log_path_str:
+        log_path_str = global_user_state.get_cluster_history_provision_log_path(
+            cluster_body.cluster_name)
+    if not log_path_str:
+        raise fastapi.HTTPException(
+            status_code=404,
+            detail=('Provision log path is not recorded for this cluster. '
+                    'Please relaunch to generate provisioning logs.'))
+
+    log_path = pathlib.Path(log_path_str).expanduser().resolve()
+    if not log_path.exists():
+        raise fastapi.HTTPException(
+            status_code=404,
+            detail=f'Provision log path does not exist: {str(log_path)}')
+
+    # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
+    effective_tail = None if tail is None or tail <= 0 else tail
+
+    return fastapi.responses.StreamingResponse(
+        content=stream_utils.log_streamer(None,
+                                          log_path,
+                                          tail=effective_tail,
+                                          follow=follow),
+        media_type='text/plain',
+        headers={
+            'Cache-Control': 'no-cache, no-transform',
+            'X-Accel-Buffering': 'no',
+            'Transfer-Encoding': 'chunked',
+        },
+    )
+
+
 @app.post('/cost_report')
 async def cost_report(request: fastapi.Request,
                       cost_report_body: payloads.CostReportBody) -> None:
@@ -1541,13 +1590,7 @@ async def health(request: fastapi.Request) -> responses.APIHealthResponse:
     """Checks the health of the API server.
 
     Returns:
-
-        - status: str; The status of the API server.
-        - api_version: str; The API version of the API server.
-        - version: str; The version of SkyPilot used for API server.
-        - version_on_disk: str; The version of the SkyPilot installation on
-          disk, which can be used to warn about restarting the API server
-        - commit: str; The commit hash of SkyPilot used for API server.
+        responses.APIHealthResponse: The health response.
     """
     user = request.state.auth_user
     server_status = common.ApiServerStatus.HEALTHY
@@ -1815,6 +1858,9 @@ if __name__ == '__main__':
     global_tasks.append(background.create_task(metrics_server.serve()))
     global_tasks.append(
         background.create_task(requests_lib.requests_gc_daemon()))
+    global_tasks.append(
+        background.create_task(
+            global_user_state.cluster_event_retention_daemon()))
     threading.Thread(target=background.run_forever, daemon=True).start()
 
     queue_server, workers = executor.start(config)
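Since /provision_logs returns a plain-text StreamingResponse, a client can consume it line by line. A hypothetical call, assuming a local API server on the default port and a {'cluster_name': ...} body shape matching ClusterNameBody:

# Hypothetical client for the new /provision_logs endpoint; the URL, port,
# and body shape are assumptions based on the handler signature above.
import requests

with requests.post('http://localhost:46580/provision_logs',
                   json={'cluster_name': 'my-cluster'},
                   params={'follow': 'false', 'tail': '100'},
                   stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        print(line)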
sky/utils/cli_utils/status_utils.py
CHANGED

@@ -6,6 +6,7 @@ import click
 import colorama
 
 from sky import backends
+from sky.schemas.api import responses
 from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import resources_utils
@@ -44,7 +45,7 @@ class StatusColumn:
         return val
 
 
-def show_status_table(cluster_records: List[
+def show_status_table(cluster_records: List[responses.StatusResponse],
                       show_all: bool,
                       show_user: bool,
                       query_clusters: Optional[List[str]] = None,
sky/utils/common_utils.py
CHANGED

@@ -1,6 +1,7 @@
 """Utils shared between all of sky"""
 
 import difflib
+import enum
 import functools
 import getpass
 import hashlib
@@ -55,6 +56,25 @@ _VALID_ENV_VAR_REGEX = '[a-zA-Z_][a-zA-Z0-9_]*'
 logger = sky_logging.init_logger(__name__)
 
 
+class ProcessStatus(enum.Enum):
+    """Process status."""
+
+    # The process is scheduled to run, but not started yet.
+    SCHEDULED = 'SCHEDULED'
+
+    # The process is running
+    RUNNING = 'RUNNING'
+
+    # The process is finished and succeeded
+    SUCCEEDED = 'SUCCEEDED'
+
+    # The process is interrupted
+    INTERRUPTED = 'INTERRUPTED'
+
+    # The process failed
+    FAILED = 'FAILED'
+
+
 @annotations.lru_cache(scope='request')
 def get_usage_run_id() -> str:
     """Returns a unique run id for each 'run'.
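Moving ProcessStatus into common_utils gives serve and jobs one shared lifecycle enum; the serve diff above already references it as common_utils.ProcessStatus. A quick illustration of the transitions:

# Quick illustration of the shared lifecycle enum added above.
from sky.utils import common_utils

status = common_utils.ProcessStatus.SCHEDULED   # queued, not yet started
status = common_utils.ProcessStatus.RUNNING     # process launched
assert status != common_utils.ProcessStatus.FAILED
assert common_utils.ProcessStatus('SUCCEEDED').value == 'SUCCEEDED'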
sky/utils/controller_utils.py
CHANGED

@@ -1224,13 +1224,26 @@ def _get_launch_parallelism() -> int:
 
 
 def can_provision() -> bool:
-
-
-
-
+    # We always prioritize terminating over provisioning, to save the cost on
+    # idle resources.
+    if serve_state.total_number_scheduled_to_terminate_replicas() > 0:
+        return False
+    return can_terminate()
 
 
 def can_start_new_process() -> bool:
     num_procs = (serve_state.get_num_services() * SERVE_PROC_RATIO +
                  managed_job_state.get_num_alive_jobs())
     return num_procs < _get_job_parallelism()
+
+
+# We limit the number of terminating replicas to the number of CPUs. This is
+# just a temporary solution to avoid overwhelming the controller. After one job
+# controller PR, we should use API server to handle resources management.
+def can_terminate() -> bool:
+    num_terminating = (
+        serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
+        # Each terminate process will take roughly the same CPUs as job launch.
+        serve_state.total_number_terminating_replicas() +
+        managed_job_state.get_num_launching_jobs())
+    return num_terminating < _get_launch_parallelism()
sky/utils/db/migration_utils.py
CHANGED

@@ -19,7 +19,7 @@ logger = sky_logging.init_logger(__name__)
 DB_INIT_LOCK_TIMEOUT_SECONDS = 10
 
 GLOBAL_USER_STATE_DB_NAME = 'state_db'
-GLOBAL_USER_STATE_VERSION = '
+GLOBAL_USER_STATE_VERSION = '006'
 GLOBAL_USER_STATE_LOCK_PATH = '~/.sky/locks/.state_db.lock'
 
 SPOT_JOBS_DB_NAME = 'spot_jobs_db'
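The bump to '006' pairs with the new sky/schemas/db/global_user_state/006_provision_log.py listed above, which records the provision log paths consumed by the /provision_logs endpoint. A rough sketch of what such an alembic-style migration could look like; the table and column names here are assumptions, not taken from the actual file:

# Rough sketch only: assumes an alembic-style migration and hypothetical
# column names; the real 006_provision_log.py is not shown in this diff.
import sqlalchemy as sa
from alembic import op

def upgrade():
    op.add_column('clusters',
                  sa.Column('provision_log_path', sa.Text(), nullable=True))
    op.add_column('cluster_history',
                  sa.Column('provision_log_path', sa.Text(), nullable=True))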
sky/utils/log_utils.py
CHANGED

@@ -47,13 +47,16 @@ class RayUpLineProcessor(LineProcessor):
     RUNTIME_SETUP = 1
     PULLING_DOCKER_IMAGES = 2
 
-    def __init__(self, log_path: str):
+    def __init__(self, log_path: str, cluster_name: Optional[str] = None):
         self.log_path = log_path
+        self.cluster_name = cluster_name
 
     def __enter__(self) -> None:
         self.state = self.ProvisionStatus.LAUNCH
         self.status_display = rich_utils.safe_status(
-            ux_utils.spinner_message('Launching', self.log_path))
+            ux_utils.spinner_message('Launching',
+                                     self.log_path,
+                                     cluster_name=self.cluster_name))
         self.status_display.start()
 
     def process_line(self, log_line: str) -> None:
@@ -62,19 +65,25 @@ class RayUpLineProcessor(LineProcessor):
             logger.info(' Head VM is up.')
             self.status_display.update(
                 ux_utils.spinner_message(
-                    'Launching - Preparing SkyPilot runtime', self.log_path))
+                    'Launching - Preparing SkyPilot runtime',
+                    self.log_path,
+                    cluster_name=self.cluster_name))
             self.state = self.ProvisionStatus.RUNTIME_SETUP
         if ('Pulling from' in log_line and
                 self.state == self.ProvisionStatus.RUNTIME_SETUP):
             self.status_display.update(
                 ux_utils.spinner_message(
-                    'Launching - Initializing docker container', self.log_path))
+                    'Launching - Initializing docker container',
+                    self.log_path,
+                    cluster_name=self.cluster_name))
             self.state = self.ProvisionStatus.PULLING_DOCKER_IMAGES
         if ('Status: Downloaded newer image' in log_line and
                 self.state == self.ProvisionStatus.PULLING_DOCKER_IMAGES):
             self.status_display.update(
                 ux_utils.spinner_message(
-                    'Launching - Preparing SkyPilot runtime', self.log_path))
+                    'Launching - Preparing SkyPilot runtime',
+                    self.log_path,
+                    cluster_name=self.cluster_name))
             self.state = self.ProvisionStatus.RUNTIME_SETUP
 
     def __exit__(self, except_type: Optional[Type[BaseException]],
sky/utils/resources_utils.py
CHANGED

@@ -5,7 +5,7 @@ import itertools
 import json
 import math
 import typing
-from typing import Dict, List, Optional, Set, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 from sky import skypilot_config
 from sky.skylet import constants
@@ -435,3 +435,27 @@ def parse_time_minutes(time: str) -> int:
         continue
 
     raise ValueError(f'Invalid time format: {time}')
+
+
+def normalize_any_of_resources_config(
+        any_of: List[Dict[str, Any]]) -> Tuple[str, ...]:
+    """Normalize a list of any_of resources config to a canonical form.
+
+    Args:
+        any_of: A list of any_of resources config.
+
+    Returns:
+        A normalized tuple representation that can be compared for equality.
+        Two lists with the same resource configurations in different orders
+        will produce the same normalized result.
+    """
+    if not any_of:
+        return tuple()
+
+    # Convert each config to JSON string with sorted keys, then sort the list
+    normalized_configs = [
+        json.dumps(config, sort_keys=True, separators=(',', ':'))
+        for config in any_of
+    ]
+
+    return tuple(sorted(normalized_configs))
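Because every config is serialized with sorted keys and the resulting strings are themselves sorted, the comparison is insensitive to both key order and list order:

# Order-insensitive equality enabled by the normalization above.
from sky.utils import resources_utils

a = [{'cloud': 'aws', 'cpus': 8}, {'cloud': 'gcp'}]
b = [{'cloud': 'gcp'}, {'cpus': 8, 'cloud': 'aws'}]
assert (resources_utils.normalize_any_of_resources_config(a) ==
        resources_utils.normalize_any_of_resources_config(b))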
sky/utils/schemas.py
CHANGED
sky/utils/ux_utils.py
CHANGED

@@ -26,9 +26,16 @@ BOLD = '\033[1m'
 RESET_BOLD = '\033[0m'
 
 # Log path hint in the spinner during launching
+# (old, kept for backward compatibility)
 _LOG_PATH_HINT = (f'{colorama.Style.DIM}View logs: sky api logs -l '
                   '{log_path}'
                   f'{colorama.Style.RESET_ALL}')
+# Log hint: recommend sky logs --provision <cluster_name>
+_PROVISION_LOG_HINT = (
+    f'{colorama.Style.DIM}View logs: '
+    f'{BOLD}sky logs --provision {{cluster_name}}{RESET_BOLD}'
+    f'{colorama.Style.RESET_ALL}')
+# Legacy path hint retained for local-only cases where we don't have cluster
 _LOG_PATH_HINT_LOCAL = (f'{colorama.Style.DIM}View logs: '
                         '{log_path}'
                         f'{colorama.Style.RESET_ALL}')
@@ -126,7 +133,10 @@ class RedirectOutputForProcess:
 
 def log_path_hint(log_path: Union[str, 'pathlib.Path'],
                   is_local: bool = False) -> str:
-    """Gets the log path hint for the given log path."""
+    """Gets the log path hint for the given log path.
+
+    Kept for backward compatibility when only paths are available.
+    """
     log_path = str(log_path)
     expanded_home = os.path.expanduser('~')
     if log_path.startswith(expanded_home):
@@ -139,6 +149,12 @@ def log_path_hint(log_path: Union[str, 'pathlib.Path'],
     return _LOG_PATH_HINT.format(log_path=log_path)
 
 
+def provision_hint(cluster_name: Optional[str]) -> Optional[str]:
+    if not cluster_name:
+        return None
+    return _PROVISION_LOG_HINT.format(cluster_name=cluster_name)
+
+
 def starting_message(message: str) -> str:
     """Gets the starting message for the given message."""
     # We have to reset the color before the message, because sometimes if a
@@ -150,7 +166,8 @@ def starting_message(message: str) -> str:
 def finishing_message(message: str,
                       log_path: Optional[Union[str, 'pathlib.Path']] = None,
                       is_local: bool = False,
-                      follow_up_message: Optional[str] = None) -> str:
+                      follow_up_message: Optional[str] = None,
+                      cluster_name: Optional[str] = None) -> str:
     """Gets the finishing message for the given message.
 
     Args:
@@ -168,6 +185,9 @@ def finishing_message(message: str,
     success_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.GREEN}✓ '
                       f'{message}{colorama.Style.RESET_ALL}{follow_up_message}'
                       f'{colorama.Style.RESET_ALL}')
+    hint = provision_hint(cluster_name)
+    if hint:
+        return f'{success_prefix} {hint}'
     if log_path is None:
         return success_prefix
     path_hint = log_path_hint(log_path, is_local)
@@ -176,13 +196,17 @@ def finishing_message(message: str,
 
 def error_message(message: str,
                   log_path: Optional[Union[str, 'pathlib.Path']] = None,
-                  is_local: bool = False) -> str:
+                  is_local: bool = False,
+                  cluster_name: Optional[str] = None) -> str:
     """Gets the error message for the given message."""
     # We have to reset the color before the message, because sometimes if a
     # previous spinner with dimmed color overflows in a narrow terminal, the
     # color might be messed up.
     error_prefix = (f'{colorama.Style.RESET_ALL}{colorama.Fore.RED}⨯'
                     f'{colorama.Style.RESET_ALL} {message}')
+    hint = provision_hint(cluster_name)
+    if hint:
+        return f'{error_prefix} {hint}'
     if log_path is None:
         return error_prefix
     path_hint = log_path_hint(log_path, is_local)
@@ -200,9 +224,16 @@ def retry_message(message: str) -> str:
 
 def spinner_message(message: str,
                     log_path: Optional[Union[str, 'pathlib.Path']] = None,
-                    is_local: bool = False) -> str:
-    """Gets the spinner message for the given message and log path."""
+                    is_local: bool = False,
+                    cluster_name: Optional[str] = None) -> str:
+    """Gets the spinner message for the given message and log path.
+
+    If cluster_name is provided, recommend `sky logs --provision <cluster>`.
+    """
     colored_spinner = f'[bold cyan]{message}[/]'
+    hint = provision_hint(cluster_name)
+    if hint:
+        return f'{colored_spinner} {hint}'
     if log_path is None:
         return colored_spinner
     path_hint = log_path_hint(log_path, is_local)
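With these changes, passing cluster_name takes precedence over log_path in all three message helpers, steering users to the new sky logs --provision command. For example:

# cluster_name short-circuits to the provision hint before any log_path logic.
from sky.utils import ux_utils

msg = ux_utils.spinner_message('Launching',
                               log_path='~/sky_logs/provision.log',
                               cluster_name='my-cluster')
# The rendered hint recommends: sky logs --provision my-cluster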