skypilot-nightly 1.0.0.dev20250720__py3-none-any.whl → 1.0.0.dev20250724__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -4
- sky/backends/backend_utils.py +27 -11
- sky/backends/cloud_vm_ray_backend.py +22 -27
- sky/client/cli/command.py +44 -28
- sky/client/sdk.py +52 -7
- sky/client/sdk.pyi +296 -0
- sky/clouds/nebius.py +2 -5
- sky/clouds/utils/oci_utils.py +16 -40
- sky/clouds/vast.py +2 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/BURfWrKsQk9psMPv0OXrh/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
- sky/dashboard/out/_next/static/chunks/{1871-a821dcaaae2a3823.js → 1871-ea0e7283886407ca.js} +2 -2
- sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
- sky/dashboard/out/_next/static/chunks/{2641.5233e938f14e31a7.js → 2641.74c19c4d45a2c034.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
- sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
- sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
- sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
- sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
- sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
- sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
- sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
- sky/dashboard/out/_next/static/chunks/{938-63fc419cb82ad9b3.js → 938-7ee806653aef0609.js} +1 -1
- sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
- sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-b6447da22305b14a.js +1 -0
- sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +93 -32
- sky/exceptions.py +8 -0
- sky/global_user_state.py +2 -3
- sky/jobs/state.py +2 -2
- sky/logs/__init__.py +4 -0
- sky/logs/agent.py +14 -0
- sky/logs/aws.py +276 -0
- sky/provision/nebius/utils.py +3 -6
- sky/server/common.py +9 -4
- sky/server/requests/payloads.py +20 -4
- sky/server/rest.py +6 -0
- sky/server/server.py +2 -1
- sky/setup_files/MANIFEST.in +1 -1
- sky/setup_files/alembic.ini +0 -4
- sky/skylet/constants.py +4 -0
- sky/skypilot_config.py +5 -31
- sky/utils/common_utils.py +8 -3
- sky/utils/config_utils.py +17 -0
- sky/utils/db/migration_utils.py +44 -4
- sky/utils/locks.py +319 -0
- sky/utils/rich_utils.py +2 -3
- sky/utils/schemas.py +92 -56
- sky/utils/timeline.py +41 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/RECORD +88 -86
- sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
- sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
- sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
- sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4869.c7c055a5c2814f33.js +0 -16
- sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
- sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
- sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
- sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
- sky/dashboard/out/_next/static/chunks/9470-8178183f3bae198f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-507712f30cd3cec3.js +0 -20
- sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-26cdc782eed15a7d.js +0 -1
- sky/dashboard/out/_next/static/css/5122cb0a08486fd3.css +0 -3
- sky/dashboard/out/_next/static/pTQKG61ng32Zc7gsAROFJ/_buildManifest.js +0 -1
- sky/schemas/db/skypilot_config/001_initial_schema.py +0 -30
- /sky/dashboard/out/_next/static/{pTQKG61ng32Zc7gsAROFJ → BURfWrKsQk9psMPv0OXrh}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250720.dist-info → skypilot_nightly-1.0.0.dev20250724.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Optional
|
|
|
5
5
|
import urllib.request
|
|
6
6
|
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
|
8
|
+
_SKYPILOT_COMMIT_SHA = '5b5dfe080437e0b2e58b1e7742c5864a1ebe9432'
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def _get_git_commit():
|
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
|
35
35
|
|
|
36
36
|
|
|
37
37
|
__commit__ = _get_git_commit()
|
|
38
|
-
__version__ = '1.0.0.
|
|
38
|
+
__version__ = '1.0.0.dev20250724'
|
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
|
40
40
|
|
|
41
41
|
|
sky/admin_policy.py
CHANGED
|
@@ -121,11 +121,17 @@ class MutatedUserRequest:
|
|
|
121
121
|
dict(self.skypilot_config),)).model_dump_json()
|
|
122
122
|
|
|
123
123
|
@classmethod
|
|
124
|
-
def decode(cls, mutated_user_request_body: str
|
|
124
|
+
def decode(cls, mutated_user_request_body: str,
|
|
125
|
+
original_request: UserRequest) -> 'MutatedUserRequest':
|
|
125
126
|
mutated_user_request_body = _MutatedUserRequestBody.model_validate_json(
|
|
126
127
|
mutated_user_request_body)
|
|
127
|
-
|
|
128
|
-
common_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
|
|
128
|
+
task = sky.Task.from_yaml_config(
|
|
129
|
+
common_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
|
|
130
|
+
# Some internal Task fields are not serialized. We need to manually
|
|
131
|
+
# restore them from the original request.
|
|
132
|
+
task.managed_job_dag = original_request.task.managed_job_dag
|
|
133
|
+
task.service_name = original_request.task.service_name
|
|
134
|
+
return cls(task=task,
|
|
129
135
|
skypilot_config=config_utils.Config.from_dict(
|
|
130
136
|
common_utils.read_yaml_all_str(
|
|
131
137
|
mutated_user_request_body.skypilot_config)[0],))
|
|
@@ -243,7 +249,8 @@ class RestfulAdminPolicy(PolicyTemplate):
|
|
|
243
249
|
f'{self.policy_url}: {e}') from None
|
|
244
250
|
|
|
245
251
|
try:
|
|
246
|
-
mutated_user_request = MutatedUserRequest.decode(
|
|
252
|
+
mutated_user_request = MutatedUserRequest.decode(
|
|
253
|
+
response.json(), user_request)
|
|
247
254
|
except Exception as e: # pylint: disable=broad-except
|
|
248
255
|
with ux_utils.print_exception_no_traceback():
|
|
249
256
|
raise exceptions.RestfulPolicyError(
|
sky/backends/backend_utils.py
CHANGED
|
@@ -17,7 +17,6 @@ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
|
|
|
17
17
|
import uuid
|
|
18
18
|
|
|
19
19
|
import colorama
|
|
20
|
-
import filelock
|
|
21
20
|
from packaging import version
|
|
22
21
|
from typing_extensions import Literal
|
|
23
22
|
|
|
@@ -45,6 +44,7 @@ from sky.utils import common_utils
|
|
|
45
44
|
from sky.utils import context_utils
|
|
46
45
|
from sky.utils import controller_utils
|
|
47
46
|
from sky.utils import env_options
|
|
47
|
+
from sky.utils import locks
|
|
48
48
|
from sky.utils import registry
|
|
49
49
|
from sky.utils import resources_utils
|
|
50
50
|
from sky.utils import rich_utils
|
|
@@ -110,17 +110,12 @@ _TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']
|
|
|
110
110
|
# Note: This value cannot be too small, otherwise OOM issue may occur.
|
|
111
111
|
DEFAULT_TASK_CPU_DEMAND = 0.5
|
|
112
112
|
|
|
113
|
-
# Filelocks for the cluster status change.
|
|
114
|
-
CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
|
|
115
113
|
CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
|
|
116
114
|
|
|
117
115
|
# Time that must elapse since the last status check before we should re-check if
|
|
118
116
|
# the cluster has been terminated or autostopped.
|
|
119
117
|
_CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
|
|
120
118
|
|
|
121
|
-
# Filelocks for updating cluster's file_mounts.
|
|
122
|
-
CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
|
|
123
|
-
'~/.sky/.{}_file_mounts.lock')
|
|
124
119
|
CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
|
|
125
120
|
|
|
126
121
|
# Remote dir that holds our runtime files.
|
|
@@ -2005,9 +2000,20 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2005
2000
|
|
|
2006
2001
|
total_nodes = handle.launched_nodes * handle.num_ips_per_node
|
|
2007
2002
|
|
|
2003
|
+
cloud_name = repr(handle.launched_resources.cloud).lower()
|
|
2008
2004
|
for i in range(5):
|
|
2009
|
-
|
|
2010
|
-
|
|
2005
|
+
try:
|
|
2006
|
+
ready_head, ready_workers, output, stderr = (
|
|
2007
|
+
get_node_counts_from_ray_status(head_runner))
|
|
2008
|
+
except RuntimeError as e:
|
|
2009
|
+
logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
|
|
2010
|
+
f' {i}: {common_utils.format_exception(e)}')
|
|
2011
|
+
if cloud_name != 'kubernetes':
|
|
2012
|
+
raise e
|
|
2013
|
+
# We retry for kubernetes because coreweave can have a
|
|
2014
|
+
# transient network issue.
|
|
2015
|
+
time.sleep(1)
|
|
2016
|
+
continue
|
|
2011
2017
|
if ready_head + ready_workers == total_nodes:
|
|
2012
2018
|
return True
|
|
2013
2019
|
logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
|
|
@@ -2294,8 +2300,7 @@ def refresh_cluster_record(
|
|
|
2294
2300
|
|
|
2295
2301
|
# The loop logic allows us to notice if the status was updated in the
|
|
2296
2302
|
# global_user_state by another process and stop trying to get the lock.
|
|
2297
|
-
|
|
2298
|
-
lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
|
|
2303
|
+
lock = locks.get_lock(cluster_status_lock_id(cluster_name))
|
|
2299
2304
|
start_time = time.perf_counter()
|
|
2300
2305
|
|
|
2301
2306
|
# Loop until we have an up-to-date status or until we acquire the lock.
|
|
@@ -2319,7 +2324,8 @@ def refresh_cluster_record(
|
|
|
2319
2324
|
return record
|
|
2320
2325
|
# Update and return the cluster status.
|
|
2321
2326
|
return _update_cluster_status(cluster_name)
|
|
2322
|
-
|
|
2327
|
+
|
|
2328
|
+
except locks.LockTimeout:
|
|
2323
2329
|
# lock.acquire() will throw a Timeout exception if the lock is not
|
|
2324
2330
|
# available and we have blocking=False.
|
|
2325
2331
|
pass
|
|
@@ -3197,3 +3203,13 @@ def get_endpoints(cluster: str,
|
|
|
3197
3203
|
return {
|
|
3198
3204
|
port_num: urls[0].url() for port_num, urls in port_details.items()
|
|
3199
3205
|
}
|
|
3206
|
+
|
|
3207
|
+
|
|
3208
|
+
def cluster_status_lock_id(cluster_name: str) -> str:
|
|
3209
|
+
"""Get the lock ID for cluster status operations."""
|
|
3210
|
+
return f'{cluster_name}_status'
|
|
3211
|
+
|
|
3212
|
+
|
|
3213
|
+
def cluster_file_mounts_lock_id(cluster_name: str) -> str:
|
|
3214
|
+
"""Get the lock ID for cluster file mounts operations."""
|
|
3215
|
+
return f'{cluster_name}_file_mounts'
|
|
@@ -20,7 +20,6 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
|
|
|
20
20
|
Union)
|
|
21
21
|
|
|
22
22
|
import colorama
|
|
23
|
-
import filelock
|
|
24
23
|
import yaml
|
|
25
24
|
|
|
26
25
|
import sky
|
|
@@ -64,6 +63,7 @@ from sky.utils import common_utils
|
|
|
64
63
|
from sky.utils import context_utils
|
|
65
64
|
from sky.utils import controller_utils
|
|
66
65
|
from sky.utils import env_options
|
|
66
|
+
from sky.utils import locks
|
|
67
67
|
from sky.utils import log_utils
|
|
68
68
|
from sky.utils import message_utils
|
|
69
69
|
from sky.utils import registry
|
|
@@ -2916,9 +2916,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
2916
2916
|
# Check if the cluster is owned by the current user. Raise
|
|
2917
2917
|
# exceptions.ClusterOwnerIdentityMismatchError
|
|
2918
2918
|
backend_utils.check_owner_identity(cluster_name)
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
with timeline.FileLockEvent(lock_path):
|
|
2919
|
+
lock_id = backend_utils.cluster_status_lock_id(cluster_name)
|
|
2920
|
+
with timeline.DistributedLockEvent(lock_id):
|
|
2922
2921
|
# Try to launch the existing cluster first. If no existing cluster,
|
|
2923
2922
|
# this function will create a to_provision_config with required
|
|
2924
2923
|
# resources.
|
|
@@ -3065,7 +3064,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3065
3064
|
|
|
3066
3065
|
self._update_after_cluster_provisioned(
|
|
3067
3066
|
handle, to_provision_config.prev_handle, task,
|
|
3068
|
-
prev_cluster_status,
|
|
3067
|
+
prev_cluster_status, lock_id, config_hash)
|
|
3069
3068
|
return handle, False
|
|
3070
3069
|
|
|
3071
3070
|
cluster_config_file = config_dict['ray']
|
|
@@ -3137,7 +3136,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3137
3136
|
|
|
3138
3137
|
self._update_after_cluster_provisioned(
|
|
3139
3138
|
handle, to_provision_config.prev_handle, task,
|
|
3140
|
-
prev_cluster_status,
|
|
3139
|
+
prev_cluster_status, lock_id, config_hash)
|
|
3141
3140
|
return handle, False
|
|
3142
3141
|
|
|
3143
3142
|
def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
|
|
@@ -3155,7 +3154,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3155
3154
|
prev_handle: Optional[CloudVmRayResourceHandle],
|
|
3156
3155
|
task: task_lib.Task,
|
|
3157
3156
|
prev_cluster_status: Optional[status_lib.ClusterStatus],
|
|
3158
|
-
|
|
3157
|
+
lock_id: str, config_hash: str) -> None:
|
|
3159
3158
|
usage_lib.messages.usage.update_cluster_resources(
|
|
3160
3159
|
handle.launched_nodes, handle.launched_resources)
|
|
3161
3160
|
usage_lib.messages.usage.update_final_cluster_status(
|
|
@@ -3237,7 +3236,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3237
3236
|
handle.cached_external_ssh_ports, handle.docker_user,
|
|
3238
3237
|
handle.ssh_user)
|
|
3239
3238
|
|
|
3240
|
-
|
|
3239
|
+
locks.get_lock(lock_id).force_unlock()
|
|
3241
3240
|
|
|
3242
3241
|
def _sync_workdir(self, handle: CloudVmRayResourceHandle,
|
|
3243
3242
|
workdir: Union[Path, Dict[str, Any]],
|
|
@@ -3819,8 +3818,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3819
3818
|
is_identity_mismatch_and_purge = True
|
|
3820
3819
|
else:
|
|
3821
3820
|
raise
|
|
3822
|
-
|
|
3823
|
-
|
|
3821
|
+
lock_id = backend_utils.cluster_status_lock_id(cluster_name)
|
|
3822
|
+
lock = locks.get_lock(lock_id)
|
|
3824
3823
|
# Retry in case new cluster operation comes in and holds the lock
|
|
3825
3824
|
# right after the lock is removed.
|
|
3826
3825
|
n_attempts = 2
|
|
@@ -3828,7 +3827,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3828
3827
|
n_attempts -= 1
|
|
3829
3828
|
# In case other running cluster operations are still holding the
|
|
3830
3829
|
# lock.
|
|
3831
|
-
|
|
3830
|
+
lock.force_unlock()
|
|
3832
3831
|
# We have to kill the cluster requests, because `down` and `stop`
|
|
3833
3832
|
# should be higher priority than the cluster requests, and we should
|
|
3834
3833
|
# release the lock from other requests.
|
|
@@ -3847,9 +3846,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3847
3846
|
f'cluster {handle.cluster_name}: '
|
|
3848
3847
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
3849
3848
|
try:
|
|
3850
|
-
with
|
|
3851
|
-
lock_path,
|
|
3852
|
-
backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
|
|
3849
|
+
with lock:
|
|
3853
3850
|
self.teardown_no_lock(
|
|
3854
3851
|
handle,
|
|
3855
3852
|
terminate,
|
|
@@ -3862,14 +3859,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
3862
3859
|
refresh_cluster_status=(
|
|
3863
3860
|
not is_identity_mismatch_and_purge))
|
|
3864
3861
|
if terminate:
|
|
3865
|
-
|
|
3862
|
+
lock.force_unlock()
|
|
3866
3863
|
break
|
|
3867
|
-
except
|
|
3864
|
+
except locks.LockTimeout as e:
|
|
3868
3865
|
logger.debug(f'Failed to acquire lock for {cluster_name}, '
|
|
3869
3866
|
f'retrying...')
|
|
3870
3867
|
if n_attempts <= 0:
|
|
3871
3868
|
raise RuntimeError(
|
|
3872
|
-
f'Cluster {cluster_name!r} is locked by {
|
|
3869
|
+
f'Cluster {cluster_name!r} is locked by {lock_id}. '
|
|
3873
3870
|
'Check to see if it is still being launched') from e
|
|
3874
3871
|
|
|
3875
3872
|
# --- CloudVMRayBackend Specific APIs ---
|
|
@@ -5245,18 +5242,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
5245
5242
|
# reconstruct them during cluster restart.
|
|
5246
5243
|
continue
|
|
5247
5244
|
storage_mounts_metadata[dst] = storage_obj.handle
|
|
5248
|
-
|
|
5249
|
-
backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
|
|
5245
|
+
lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
|
|
5250
5246
|
lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
|
|
5251
5247
|
try:
|
|
5252
|
-
with
|
|
5248
|
+
with locks.get_lock(lock_id, lock_timeout):
|
|
5253
5249
|
global_user_state.set_cluster_storage_mounts_metadata(
|
|
5254
5250
|
cluster_name, storage_mounts_metadata)
|
|
5255
|
-
except
|
|
5251
|
+
except locks.LockTimeout as e:
|
|
5256
5252
|
raise RuntimeError(
|
|
5257
5253
|
f'Failed to store metadata for cluster {cluster_name!r} due to '
|
|
5258
5254
|
'a timeout when trying to access local database. Please '
|
|
5259
|
-
f'try again or manually remove the lock at {
|
|
5255
|
+
f'try again or manually remove the lock at {lock_id}. '
|
|
5260
5256
|
f'{common_utils.format_exception(e)}') from None
|
|
5261
5257
|
|
|
5262
5258
|
def get_storage_mounts_metadata(
|
|
@@ -5267,19 +5263,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
|
5267
5263
|
After retrieving storage_mounts_metadata, it converts back the
|
|
5268
5264
|
StorageMetadata to Storage object and restores 'storage_mounts.'
|
|
5269
5265
|
"""
|
|
5270
|
-
|
|
5271
|
-
backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
|
|
5266
|
+
lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
|
|
5272
5267
|
lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
|
|
5273
5268
|
try:
|
|
5274
|
-
with
|
|
5269
|
+
with locks.get_lock(lock_id, lock_timeout):
|
|
5275
5270
|
storage_mounts_metadata = (
|
|
5276
5271
|
global_user_state.get_cluster_storage_mounts_metadata(
|
|
5277
5272
|
cluster_name))
|
|
5278
|
-
except
|
|
5273
|
+
except locks.LockTimeout as e:
|
|
5279
5274
|
raise RuntimeError(
|
|
5280
5275
|
f'Failed to retrieve metadata for cluster {cluster_name!r} '
|
|
5281
5276
|
'due to a timeout when trying to access local database. '
|
|
5282
|
-
f'Please try again or manually remove the lock at {
|
|
5277
|
+
f'Please try again or manually remove the lock at {lock_id}.'
|
|
5283
5278
|
f' {common_utils.format_exception(e)}') from None
|
|
5284
5279
|
|
|
5285
5280
|
if storage_mounts_metadata is None:
|
sky/client/cli/command.py
CHANGED
|
@@ -2143,7 +2143,9 @@ def logs(
|
|
|
2143
2143
|
if sync_down:
|
|
2144
2144
|
with rich_utils.client_status(
|
|
2145
2145
|
ux_utils.spinner_message('Downloading logs')):
|
|
2146
|
-
log_local_path_dict = sdk.download_logs(
|
|
2146
|
+
log_local_path_dict = sdk.download_logs(
|
|
2147
|
+
cluster,
|
|
2148
|
+
list(job_ids) if job_ids else None)
|
|
2147
2149
|
style = colorama.Style
|
|
2148
2150
|
fore = colorama.Fore
|
|
2149
2151
|
for job, log_local_path in log_local_path_dict.items():
|
|
@@ -2195,8 +2197,7 @@ def logs(
|
|
|
2195
2197
|
f'{colorama.Style.RESET_ALL}')
|
|
2196
2198
|
|
|
2197
2199
|
# Stream logs from the server.
|
|
2198
|
-
|
|
2199
|
-
sys.exit(returncode)
|
|
2200
|
+
sys.exit(sdk.tail_logs(cluster, job_id, follow, tail=tail))
|
|
2200
2201
|
|
|
2201
2202
|
|
|
2202
2203
|
@cli.command()
|
|
@@ -3023,17 +3024,18 @@ def _down_or_stop_clusters(
|
|
|
3023
3024
|
click.echo(common_utils.format_exception(e))
|
|
3024
3025
|
else:
|
|
3025
3026
|
raise
|
|
3026
|
-
|
|
3027
|
-
|
|
3028
|
-
|
|
3029
|
-
|
|
3030
|
-
|
|
3031
|
-
|
|
3032
|
-
|
|
3033
|
-
|
|
3034
|
-
|
|
3035
|
-
|
|
3036
|
-
|
|
3027
|
+
if not purge:
|
|
3028
|
+
confirm_str = 'delete'
|
|
3029
|
+
user_input = click.prompt(
|
|
3030
|
+
f'To proceed, please type {colorama.Style.BRIGHT}'
|
|
3031
|
+
f'{confirm_str!r}{colorama.Style.RESET_ALL}',
|
|
3032
|
+
type=str)
|
|
3033
|
+
if user_input != confirm_str:
|
|
3034
|
+
raise click.Abort()
|
|
3035
|
+
else:
|
|
3036
|
+
click.echo('Since --purge is set, errors will be ignored '
|
|
3037
|
+
'and controller will be removed from '
|
|
3038
|
+
'local state.\nSkipping confirmation.')
|
|
3037
3039
|
no_confirm = True
|
|
3038
3040
|
names += controllers
|
|
3039
3041
|
|
|
@@ -3243,7 +3245,7 @@ def show_gpus(
|
|
|
3243
3245
|
infra: Optional[str],
|
|
3244
3246
|
cloud: Optional[str],
|
|
3245
3247
|
region: Optional[str],
|
|
3246
|
-
all_regions:
|
|
3248
|
+
all_regions: bool):
|
|
3247
3249
|
"""Show supported GPU/TPU/accelerators and their prices.
|
|
3248
3250
|
|
|
3249
3251
|
The names and counts shown can be set in the ``accelerators`` field in task
|
|
@@ -5397,7 +5399,7 @@ def api():
|
|
|
5397
5399
|
required=False,
|
|
5398
5400
|
help='Enable basic authentication in the SkyPilot API server.')
|
|
5399
5401
|
@usage_lib.entrypoint
|
|
5400
|
-
def api_start(deploy: bool, host:
|
|
5402
|
+
def api_start(deploy: bool, host: str, foreground: bool,
|
|
5401
5403
|
enable_basic_auth: bool):
|
|
5402
5404
|
"""Starts the SkyPilot API server locally."""
|
|
5403
5405
|
sdk.api_start(deploy=deploy,
|
|
@@ -5508,19 +5510,27 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
|
|
|
5508
5510
|
columns.append('Cluster')
|
|
5509
5511
|
columns.extend(['Created', 'Status'])
|
|
5510
5512
|
table = log_utils.create_table(columns)
|
|
5511
|
-
|
|
5512
|
-
|
|
5513
|
-
|
|
5514
|
-
|
|
5515
|
-
|
|
5516
|
-
|
|
5513
|
+
if len(request_list) > 0:
|
|
5514
|
+
for request in request_list:
|
|
5515
|
+
r_id = request.request_id
|
|
5516
|
+
if not verbose:
|
|
5517
|
+
r_id = common_utils.truncate_long_string(r_id, 36)
|
|
5518
|
+
req_status = requests.RequestStatus(request.status)
|
|
5519
|
+
row = [r_id, request.user_name, request.name]
|
|
5520
|
+
if verbose:
|
|
5521
|
+
row.append(request.cluster_name)
|
|
5522
|
+
row.extend([
|
|
5523
|
+
log_utils.readable_time_duration(request.created_at),
|
|
5524
|
+
req_status.colored_str()
|
|
5525
|
+
])
|
|
5526
|
+
table.add_row(row)
|
|
5527
|
+
else:
|
|
5528
|
+
# add dummy data for when api server is down.
|
|
5529
|
+
dummy_row = ['-'] * 5
|
|
5517
5530
|
if verbose:
|
|
5518
|
-
|
|
5519
|
-
|
|
5520
|
-
|
|
5521
|
-
req_status.colored_str()
|
|
5522
|
-
])
|
|
5523
|
-
table.add_row(row)
|
|
5531
|
+
dummy_row.append('-')
|
|
5532
|
+
table.add_row(dummy_row)
|
|
5533
|
+
click.echo()
|
|
5524
5534
|
click.echo(table)
|
|
5525
5535
|
|
|
5526
5536
|
|
|
@@ -5568,6 +5578,12 @@ def api_login(endpoint: Optional[str], relogin: bool,
|
|
|
5568
5578
|
sdk.api_login(endpoint, relogin, service_account_token)
|
|
5569
5579
|
|
|
5570
5580
|
|
|
5581
|
+
@api.command('logout', cls=_DocumentedCodeCommand)
|
|
5582
|
+
def api_logout():
|
|
5583
|
+
"""Logs out of the api server"""
|
|
5584
|
+
sdk.api_logout()
|
|
5585
|
+
|
|
5586
|
+
|
|
5571
5587
|
@api.command('info', cls=_DocumentedCodeCommand)
|
|
5572
5588
|
@flags.config_option(expose_value=False)
|
|
5573
5589
|
@usage_lib.entrypoint
|
sky/client/sdk.py
CHANGED
|
@@ -1854,6 +1854,18 @@ def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
|
|
|
1854
1854
|
return server_common.get_request_id(response)
|
|
1855
1855
|
|
|
1856
1856
|
|
|
1857
|
+
def _local_api_server_running(kill: bool = False) -> bool:
|
|
1858
|
+
"""Checks if the local api server is running."""
|
|
1859
|
+
for process in psutil.process_iter(attrs=['pid', 'cmdline']):
|
|
1860
|
+
cmdline = process.info['cmdline']
|
|
1861
|
+
if cmdline and server_common.API_SERVER_CMD in ' '.join(cmdline):
|
|
1862
|
+
if kill:
|
|
1863
|
+
subprocess_utils.kill_children_processes(
|
|
1864
|
+
parent_pids=[process.pid], force=True)
|
|
1865
|
+
return True
|
|
1866
|
+
return False
|
|
1867
|
+
|
|
1868
|
+
|
|
1857
1869
|
@usage_lib.entrypoint
|
|
1858
1870
|
@annotations.client_api
|
|
1859
1871
|
def api_status(
|
|
@@ -1872,6 +1884,10 @@ def api_status(
|
|
|
1872
1884
|
Returns:
|
|
1873
1885
|
A list of request payloads.
|
|
1874
1886
|
"""
|
|
1887
|
+
if server_common.is_api_server_local() and not _local_api_server_running():
|
|
1888
|
+
logger.info('SkyPilot API server is not running.')
|
|
1889
|
+
return []
|
|
1890
|
+
|
|
1875
1891
|
body = payloads.RequestStatusBody(request_ids=request_ids,
|
|
1876
1892
|
all_status=all_status)
|
|
1877
1893
|
response = server_common.make_authenticated_request(
|
|
@@ -1992,13 +2008,7 @@ def api_stop() -> None:
|
|
|
1992
2008
|
f'Cannot kill the API server at {server_url} because it is not '
|
|
1993
2009
|
f'the default SkyPilot API server started locally.')
|
|
1994
2010
|
|
|
1995
|
-
found =
|
|
1996
|
-
for process in psutil.process_iter(attrs=['pid', 'cmdline']):
|
|
1997
|
-
cmdline = process.info['cmdline']
|
|
1998
|
-
if cmdline and server_common.API_SERVER_CMD in ' '.join(cmdline):
|
|
1999
|
-
subprocess_utils.kill_children_processes(parent_pids=[process.pid],
|
|
2000
|
-
force=True)
|
|
2001
|
-
found = True
|
|
2011
|
+
found = _local_api_server_running(kill=True)
|
|
2002
2012
|
|
|
2003
2013
|
# Remove the database for requests.
|
|
2004
2014
|
server_common.clear_local_api_server_database()
|
|
@@ -2067,6 +2077,22 @@ def _save_config_updates(endpoint: Optional[str] = None,
|
|
|
2067
2077
|
skypilot_config.reload_config()
|
|
2068
2078
|
|
|
2069
2079
|
|
|
2080
|
+
def _clear_api_server_config() -> None:
|
|
2081
|
+
"""Clear endpoint and service account token from config file."""
|
|
2082
|
+
config_path = pathlib.Path(
|
|
2083
|
+
skypilot_config.get_user_config_path()).expanduser()
|
|
2084
|
+
with filelock.FileLock(config_path.with_suffix('.lock')):
|
|
2085
|
+
if not config_path.exists():
|
|
2086
|
+
return
|
|
2087
|
+
|
|
2088
|
+
config = skypilot_config.get_user_config()
|
|
2089
|
+
config = dict(config)
|
|
2090
|
+
del config['api_server']
|
|
2091
|
+
|
|
2092
|
+
common_utils.dump_yaml(str(config_path), config, blank=True)
|
|
2093
|
+
skypilot_config.reload_config()
|
|
2094
|
+
|
|
2095
|
+
|
|
2070
2096
|
def _validate_endpoint(endpoint: Optional[str]) -> str:
|
|
2071
2097
|
"""Validate and normalize the endpoint URL."""
|
|
2072
2098
|
if endpoint is None:
|
|
@@ -2323,3 +2349,22 @@ def api_login(endpoint: Optional[str] = None,
|
|
|
2323
2349
|
endpoint)
|
|
2324
2350
|
_show_logged_in_message(endpoint, dashboard_url, final_api_server_info.user,
|
|
2325
2351
|
server_status)
|
|
2352
|
+
|
|
2353
|
+
|
|
2354
|
+
@usage_lib.entrypoint
|
|
2355
|
+
@annotations.client_api
|
|
2356
|
+
def api_logout() -> None:
|
|
2357
|
+
"""Logout of the API server.
|
|
2358
|
+
|
|
2359
|
+
Clears all cookies and settings stored in ~/.sky/config.yaml"""
|
|
2360
|
+
if server_common.is_api_server_local():
|
|
2361
|
+
with ux_utils.print_exception_no_traceback():
|
|
2362
|
+
raise RuntimeError('Local api server cannot be logged out. '
|
|
2363
|
+
'Use `sky api stop` instead.')
|
|
2364
|
+
|
|
2365
|
+
# no need to clear cookies if it doesn't exist.
|
|
2366
|
+
server_common.set_api_cookie_jar(cookiejar.MozillaCookieJar(),
|
|
2367
|
+
create_if_not_exists=False)
|
|
2368
|
+
_clear_api_server_config()
|
|
2369
|
+
logger.info(f'{colorama.Fore.GREEN}Logged out of SkyPilot API server.'
|
|
2370
|
+
f'{colorama.Style.RESET_ALL}')
|