skypilot-nightly 1.0.0.dev20250807__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +57 -7
- sky/backends/cloud_vm_ray_backend.py +50 -8
- sky/client/cli/command.py +60 -26
- sky/client/sdk.py +132 -65
- sky/client/sdk_async.py +1 -1
- sky/core.py +10 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +21 -4
- sky/global_user_state.py +110 -1
- sky/jobs/client/sdk.py +27 -20
- sky/jobs/controller.py +2 -1
- sky/jobs/recovery_strategy.py +3 -0
- sky/jobs/server/core.py +4 -0
- sky/jobs/utils.py +9 -2
- sky/provision/__init__.py +3 -2
- sky/provision/aws/instance.py +5 -4
- sky/provision/azure/instance.py +5 -4
- sky/provision/cudo/instance.py +5 -4
- sky/provision/do/instance.py +5 -4
- sky/provision/fluidstack/instance.py +5 -4
- sky/provision/gcp/instance.py +5 -4
- sky/provision/hyperbolic/instance.py +5 -4
- sky/provision/kubernetes/instance.py +36 -6
- sky/provision/lambda_cloud/instance.py +5 -4
- sky/provision/nebius/instance.py +5 -4
- sky/provision/oci/instance.py +5 -4
- sky/provision/paperspace/instance.py +5 -4
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +5 -4
- sky/provision/scp/instance.py +5 -5
- sky/provision/vast/instance.py +5 -5
- sky/provision/vsphere/instance.py +5 -4
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/serve/client/impl.py +11 -8
- sky/serve/client/sdk.py +7 -7
- sky/serve/serve_state.py +437 -340
- sky/serve/serve_utils.py +37 -3
- sky/serve/server/impl.py +2 -2
- sky/server/common.py +12 -8
- sky/server/constants.py +1 -1
- sky/setup_files/alembic.ini +4 -0
- sky/skypilot_config.py +4 -4
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +10 -1
- sky/utils/db/db_utils.py +53 -1
- sky/utils/db/migration_utils.py +5 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
- sky/utils/resource_checker.py +162 -21
- sky/volumes/client/sdk.py +4 -4
- sky/workspaces/core.py +210 -6
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +87 -83
- sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
- /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '1e311e80f4a9112a6d2c86bb78d4c225042cedbc'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250812'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/adaptors/kubernetes.py
CHANGED
@@ -142,8 +142,11 @@ def _load_config(context: Optional[str] = None):
             # show up in SkyPilot tasks. For now, we work around by using
             # DNS name instead of environment variables.
             # See issue: https://github.com/skypilot-org/skypilot/issues/2287
-
-
+            # Only set if not already present (preserving existing values)
+            if 'KUBERNETES_SERVICE_HOST' not in os.environ:
+                os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
+            if 'KUBERNETES_SERVICE_PORT' not in os.environ:
+                os.environ['KUBERNETES_SERVICE_PORT'] = '443'
             kubernetes.config.load_incluster_config()
         except kubernetes.config.config_exception.ConfigException:
             _load_config_from_kubeconfig()
sky/backends/backend_utils.py
CHANGED
@@ -121,6 +121,7 @@ CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
 _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2

 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
+WORKSPACE_LOCK_TIMEOUT_SECONDS = 10

 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -1772,8 +1773,9 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:

 def _query_cluster_status_via_cloud_api(
     handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
-) -> List[status_lib.ClusterStatus]:
-    """Returns the status of the cluster
+) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
+    """Returns the status of the cluster as a list of tuples corresponding
+    to the node status and an optional reason string for said status.

     Raises:
         exceptions.ClusterStatusFetchingError: the cluster status cannot be
@@ -1812,9 +1814,13 @@ def _query_cluster_status_via_cloud_api(
     region = provider_config.get('region') or provider_config.get(
         'location')
     zone = ray_config['provider'].get('availability_zone')
+    # TODO (kyuds): refactor cloud.query_status api to include reason.
+    # Currently not refactoring as this API is actually supposed to be
+    # deprecated soon.
     node_statuses = cloud.query_status(
         cluster_name_on_cloud,
         tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
+    node_statuses = [(status, None) for status in node_statuses]
     return node_statuses


@@ -2014,8 +2020,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:

     node_statuses = _query_cluster_status_via_cloud_api(handle)

-    all_nodes_up = (all(
-
+    all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
+                        for status in node_statuses) and
                     len(node_statuses) == handle.launched_nodes)

 def get_node_counts_from_ray_status(
@@ -2120,6 +2126,13 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
         # head-ip/worker-ips`.
         record['status'] = status_lib.ClusterStatus.UP
+        # Add cluster event for instance status check.
+        global_user_state.add_cluster_event(
+            cluster_name,
+            status_lib.ClusterStatus.UP,
+            'All nodes up + ray cluster healthy.',
+            global_user_state.ClusterEventType.STATUS_CHANGE,
+            nop_if_duplicate=True)
         global_user_state.add_or_update_cluster(cluster_name,
                                                 handle,
                                                 requested_resources=None,
@@ -2204,9 +2217,19 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # regardless of the ray cluster's health.
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
-
-
+    some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+    some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
+                                 for status in node_statuses)
+    is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
+
     if is_abnormal:
+        status_reason = ', '.join(
+            [status[1] for status in node_statuses if status[1] is not None])
+
+        if some_nodes_terminated:
+            init_reason = f'one or more nodes terminated ({status_reason})'
+        elif some_nodes_not_stopped:
+            init_reason = f'some nodes are up and some nodes are stopped ({status_reason})'
         logger.debug('The cluster is abnormal. Setting to INIT status. '
                      f'node_statuses: {node_statuses}')
         if record['autostop'] >= 0:
@@ -2290,6 +2313,12 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         # represent that the cluster is partially preempted.
         # TODO(zhwu): the definition of INIT should be audited/changed.
         # Adding a new status UNHEALTHY for abnormal status can be a choice.
+        global_user_state.add_cluster_event(
+            cluster_name,
+            status_lib.ClusterStatus.INIT,
+            f'Cluster is abnormal because {init_reason}. Transitioned to INIT.',
+            global_user_state.ClusterEventType.STATUS_CHANGE,
+            nop_if_duplicate=True)
         global_user_state.add_or_update_cluster(cluster_name,
                                                 handle,
                                                 requested_resources=None,
@@ -2300,6 +2329,9 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # STOPPED.
     backend = backends.CloudVmRayBackend()
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
+    global_user_state.add_cluster_event(
+        cluster_name, None, 'All nodes stopped, terminating cluster.',
+        global_user_state.ClusterEventType.STATUS_CHANGE)
     return global_user_state.get_cluster_from_name(cluster_name)


@@ -2760,6 +2792,9 @@ def get_clusters(
     refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
     all_users: bool = True,
+    # Internal only:
+    # pylint: disable=invalid-name
+    _include_is_managed: bool = False,
 ) -> List[Dict[str, Any]]:
     """Returns a list of cached or optionally refreshed cluster records.

@@ -2780,6 +2815,8 @@ def get_clusters(
             names.
         all_users: If True, return clusters from all users. If False, only
             return clusters from the current user.
+        _include_is_managed: Whether to force include clusters created by the
+            controller.

     Returns:
         A list of cluster records. If the cluster does not exist or has been
@@ -2788,6 +2825,13 @@ def get_clusters(
     records = global_user_state.get_clusters()
     current_user = common_utils.get_current_user()

+    # Filter out clusters created by the controller.
+    if (not env_options.Options.SHOW_DEBUG_INFO.get() and
+            not _include_is_managed):
+        records = [
+            record for record in records if not record.get('is_managed', False)
+        ]
+
     # Filter by user if requested
     if not all_users:
         records = [
@@ -3221,7 +3265,8 @@ def get_endpoints(cluster: str,
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Invalid endpoint {port!r}.') from None
     cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
-                                   cluster_names=[cluster]
+                                   cluster_names=[cluster],
+                                   _include_is_managed=True)
     if not cluster_records:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterNotUpError(
@@ -3311,3 +3356,8 @@ def cluster_status_lock_id(cluster_name: str) -> str:
 def cluster_file_mounts_lock_id(cluster_name: str) -> str:
     """Get the lock ID for cluster file mounts operations."""
     return f'{cluster_name}_file_mounts'
+
+
+def workspace_lock_id(workspace_name: str) -> str:
+    """Get the lock ID for workspace operations."""
+    return f'{workspace_name}_workspace'
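
With the hunks above, _query_cluster_status_via_cloud_api now returns (status, reason) pairs instead of bare statuses, and _update_cluster_status folds them into an overall health flag plus a human-readable reason for the INIT transition. A generic sketch of that folding step, kept independent of SkyPilot imports (helper name and signature are illustrative):

    from typing import Optional, Sequence, Tuple, TypeVar

    StatusT = TypeVar('StatusT')


    def summarize_node_statuses(
            node_statuses: Sequence[Tuple[StatusT, Optional[str]]],
            up_status: StatusT,
            expected_nodes: int) -> Tuple[bool, str]:
        """Collapse per-node (status, reason) pairs into (all_up, reasons)."""
        all_up = (len(node_statuses) == expected_nodes and
                  all(status == up_status for status, _ in node_statuses))
        # Keep only the reasons the cloud actually reported (None means none).
        reasons = ', '.join(r for _, r in node_statuses if r is not None)
        return all_up, reasons

In the diff itself, up_status corresponds to status_lib.ClusterStatus.UP and the joined reasons feed the 'Cluster is abnormal because ...' event message.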
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -1177,7 +1177,8 @@ class RetryingVmProvisioner(object):
                 local_wheel_path: pathlib.Path,
                 wheel_hash: str,
                 blocked_resources: Optional[Iterable[
-                    resources_lib.Resources]] = None
+                    resources_lib.Resources]] = None,
+                is_managed: Optional[bool] = None):
         self._blocked_resources: Set[resources_lib.Resources] = set()
         if blocked_resources:
             # blocked_resources is not None and not empty.
@@ -1189,6 +1190,7 @@ class RetryingVmProvisioner(object):
         self._requested_features = requested_features
         self._local_wheel_path = local_wheel_path
         self._wheel_hash = wheel_hash
+        self._is_managed = is_managed

     def _yield_zones(
             self, to_provision: resources_lib.Resources, num_nodes: int,
@@ -1522,8 +1524,16 @@ class RetryingVmProvisioner(object):
             cluster_handle=handle,
             requested_resources=requested_resources,
             ready=False,
+            is_managed=self._is_managed,
         )

+        # Add cluster event for actual provisioning start.
+        global_user_state.add_cluster_event(
+            cluster_name, status_lib.ClusterStatus.INIT,
+            f'Provisioning on {to_provision.cloud.display_name()} ' +
+            f'in {to_provision.region}',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
+
         global_user_state.set_owner_identity_for_cluster(
             cluster_name, cloud_user_identity)

@@ -2753,6 +2763,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._dag = None
         self._optimize_target = None
         self._requested_features = set()
+        self._dump_final_script = False
+        self._is_managed = False

         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
@@ -2769,6 +2781,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
         self._dump_final_script = kwargs.pop('dump_final_script', False)
+        self._is_managed = kwargs.pop('is_managed', False)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'

     def check_resources_fit_cluster(
@@ -2930,10 +2943,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     skip_unnecessary_provisioning)
             except locks.LockTimeout:
                 if not communicated_with_user:
-
-
-
-
+                    rich_utils.force_update_status(
+                        ux_utils.spinner_message('Launching - blocked by ' +
+                                                 'other requests ' +
+                                                 colorama.Style.RESET_ALL +
+                                                 colorama.Style.DIM +
+                                                 'Check concurrent requests: ' +
+                                                 'sky api status '))

     def _locked_provision(
         self,
@@ -2990,7 +3006,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self._requested_features,
             local_wheel_path,
             wheel_hash,
-            blocked_resources=task.blocked_resources
+            blocked_resources=task.blocked_resources,
+            is_managed=self._is_managed)
         log_path = os.path.join(self.log_dir, 'provision.log')
         rich_utils.force_update_status(
             ux_utils.spinner_message('Launching', log_path))
@@ -3000,6 +3017,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 break
             except exceptions.ResourcesUnavailableError as e:
                 log_path = retry_provisioner.log_dir + '/provision.log'
+
                 error_message = (
                     f'{colorama.Fore.RED}Failed to provision all '
                     f'possible launchable resources.'
@@ -3016,6 +3034,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 hint_message = (f'\n{retry_message} '
                                 f'{ux_utils.log_path_hint(log_path)}'
                                 f'{colorama.Style.RESET_ALL}')
+
+                # Add cluster event for retry.
+                global_user_state.add_cluster_event(
+                    cluster_name, status_lib.ClusterStatus.INIT,
+                    f'Retrying provisioning after {gap_seconds:.0f}s',
+                    global_user_state.ClusterEventType.STATUS_CHANGE)
+
                 raise exceptions.ExecutionRetryableError(
                     error_message,
                     hint=hint_message,
@@ -3067,6 +3092,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # and other necessary files to the VM.
         # 3. Run setup commands to install dependencies.
         # 4. Starting ray cluster and skylet.
+
+        # Add cluster event for runtime setup start
+        global_user_state.add_cluster_event(
+            handle.cluster_name, status_lib.ClusterStatus.INIT,
+            'Setting up SkyPilot runtime on cluster',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
+
         cluster_info = provisioner.post_provision_runtime_setup(
             repr(handle.launched_resources.cloud),
             resources_utils.ClusterName(handle.cluster_name,
@@ -3252,6 +3284,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             config_hash=config_hash,
             task_config=user_specified_task_config,
         )
+
+        # Add cluster event for successful provisioning.
+        global_user_state.add_cluster_event(
+            handle.cluster_name, status_lib.ClusterStatus.UP,
+            'Cluster successfully provisioned with ' +
+            f'{handle.launched_nodes} nodes',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
+
         usage_lib.messages.usage.update_final_cluster_status(
             status_lib.ClusterStatus.UP)
         # We still add the cluster to ssh config file on API server, this
@@ -4624,8 +4664,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             non_terminated_only=False)

         unexpected_node_state: Optional[Tuple[str, str]] = None
-        for node_id,
-
+        for node_id, node_status_tuple in node_status_dict.items():
+            node_status, reason = node_status_tuple
+            reason = '' if reason is None else f' ({reason})'
+            logger.debug(f'{node_id} status: {node_status}{reason}')
             # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
             # between "stopping/stopped" and "terminating/terminated",
             # so we allow for either status instead of casing on
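
Taken together, the hunks above make the provisioning path leave an auditable STATUS_CHANGE trail in global_user_state. Illustrative only (statuses and messages copied from the hunks, placeholders in angle brackets; ordering follows the code path of provision start, optional retry, runtime setup, success):

    # Roughly the (status, message) events recorded for one successful launch.
    EXPECTED_EVENT_TRAIL = [
        ('INIT', 'Provisioning on <cloud> in <region>'),
        ('INIT', 'Retrying provisioning after <N>s'),   # only when a retry happens
        ('INIT', 'Setting up SkyPilot runtime on cluster'),
        ('UP', 'Cluster successfully provisioned with <N> nodes'),
    ]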
sky/client/cli/command.py
CHANGED
@@ -35,7 +35,7 @@ import sys
 import traceback
 import typing
 from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
-                    Union)
+                    TypeVar, Union)

 import click
 import colorama
@@ -116,6 +116,8 @@ _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
                               '`sky jobs launch`. `{command}` supports a '
                               'single task only.')

+T = TypeVar('T')
+

 def _get_cluster_records_and_set_ssh_config(
     clusters: Optional[List[str]],
@@ -224,8 +226,8 @@ def _get_glob_matches(candidate_names: List[str],
     return list(set(glob_storages))


-def _async_call_or_wait(request_id:
-                        request_name: str) -> Any:
+def _async_call_or_wait(request_id: server_common.RequestId[T],
+                        async_call: bool, request_name: str) -> Any:
     short_request_id = request_id[:8]
     if not async_call:
         try:
@@ -1411,7 +1413,7 @@ def exec(


 def _handle_jobs_queue_request(
-        request_id: str,
+        request_id: server_common.RequestId[List[Dict[str, Any]]],
         show_all: bool,
         show_user: bool,
         max_num_jobs_to_show: Optional[int],
@@ -1492,7 +1494,7 @@ def _handle_jobs_queue_request(


 def _handle_services_request(
-        request_id: str,
+        request_id: server_common.RequestId[List[Dict[str, Any]]],
         service_names: Optional[List[str]],
         show_all: bool,
         show_endpoint: bool,
@@ -1879,17 +1881,19 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
             skip_finished=True,
             all_users=all_users)

-    def submit_services(
+    def submit_services(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         return serve_lib.status(service_names=None)

-    def submit_pools(
+    def submit_pools(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         try:
             return managed_jobs.pool_status(pool_names=None)
         except exceptions.APINotSupportedError as e:
             logger.debug(f'Pools are not supported in the remote server: {e}')
             return None

-    def submit_workspace() -> Optional[str]:
+    def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
         try:
             return sdk.workspaces()
         except RuntimeError:
@@ -1928,11 +1932,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     if not (ip or show_endpoints):
         workspace_request_id = workspace_request_future.result()

-        managed_jobs_queue_request_id = (
-
-
+        managed_jobs_queue_request_id = (server_common.RequestId()
+                                         if not managed_jobs_queue_request_id else
+                                         managed_jobs_queue_request_id)
+        service_status_request_id = (server_common.RequestId()
+                                     if not service_status_request_id else
                                      service_status_request_id)
-        pool_status_request_id = (
+        pool_status_request_id = (server_common.RequestId()
+                                  if not pool_status_request_id else
                                   pool_status_request_id)

     # Phase 3: Get cluster records and handle special cases
@@ -1957,7 +1964,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     if workspace_request_id is not None:
         all_workspaces = sdk.get(workspace_request_id)
     else:
-        all_workspaces =
+        all_workspaces = {constants.SKYPILOT_DEFAULT_WORKSPACE: {}}
     active_workspace = skypilot_config.get_active_workspace()
     show_workspace = len(all_workspaces) > 1
     _show_enabled_infra(active_workspace, show_workspace)
@@ -2974,6 +2981,8 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
     controller = controller_utils.Controllers.from_name(controller_name)
     assert controller is not None, controller_name

+    # TODO(tian): We also need to check pools after we allow running pools on
+    # jobs controller.
     with rich_utils.client_status(
             '[bold cyan]Checking for in-progress managed jobs[/]'):
         try:
@@ -3070,6 +3079,21 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
             # controller being STOPPED or being firstly launched, i.e., there is
             # no in-prgress services.
             services = []
+        except exceptions.InconsistentConsolidationModeError:
+            # If this error is raised, it means the user switched to the
+            # consolidation mode but the previous controller cluster is still
+            # running. We should allow the user to tear down the controller
+            # cluster in this case.
+            with skypilot_config.override_skypilot_config(
+                    {'serve': {
+                        'controller': {
+                            'consolidation_mode': False
+                        }
+                    }}):
+                # Check again with the consolidation mode disabled. This is to
+                # make sure there is no in-progress services.
+                request_id = serve_lib.status(service_names=None)
+                services = sdk.stream_and_get(request_id)

     if services:
         service_names = [service['name'] for service in services]
@@ -3836,7 +3860,7 @@ def show_gpus(
             yield k8s_messages
             yield '\n\n'

-
+        list_accelerator_counts_result = sdk.stream_and_get(
             sdk.list_accelerator_counts(
                 gpus_only=True,
                 clouds=clouds_to_list,
@@ -3853,14 +3877,20 @@ def show_gpus(

         # "Common" GPUs
         for gpu in catalog.get_common_gpus():
-            if gpu in
-                gpu_table.add_row([
+            if gpu in list_accelerator_counts_result:
+                gpu_table.add_row([
+                    gpu,
+                    _list_to_str(list_accelerator_counts_result.pop(gpu))
+                ])
         yield from gpu_table.get_string()

         # Google TPUs
         for tpu in catalog.get_tpus():
-            if tpu in
-                tpu_table.add_row([
+            if tpu in list_accelerator_counts_result:
+                tpu_table.add_row([
+                    tpu,
+                    _list_to_str(list_accelerator_counts_result.pop(tpu))
+                ])
         if tpu_table.get_string():
             yield '\n\n'
             yield from tpu_table.get_string()
@@ -3868,7 +3898,7 @@ def show_gpus(
         # Other GPUs
         if show_all:
             yield '\n\n'
-            for gpu, qty in sorted(
+            for gpu, qty in sorted(list_accelerator_counts_result.items()):
                 other_table.add_row([gpu, _list_to_str(qty)])
             yield from other_table.get_string()
             yield '\n\n'
@@ -3919,7 +3949,7 @@ def show_gpus(

         # For clouds other than Kubernetes, get the accelerator details
         # Case-sensitive
-
+        list_accelerators_result = sdk.stream_and_get(
             sdk.list_accelerators(gpus_only=True,
                                   name_filter=name,
                                   quantity_filter=quantity,
@@ -3935,8 +3965,8 @@ def show_gpus(
         # - Group by cloud
         # - Sort within each group by prices
         # - Sort groups by each cloud's (min price, min spot price)
-        new_result = {}
-        for i, (gpu, items) in enumerate(
+        new_result: Dict[str, List[catalog_common.InstanceTypeInfo]] = {}
+        for i, (gpu, items) in enumerate(list_accelerators_result.items()):
             df = pd.DataFrame([t._asdict() for t in items])
             # Determine the minimum prices for each cloud.
             min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'),
@@ -3954,14 +3984,14 @@ def show_gpus(
                 for row in df.to_records(index=False)
             ]
             new_result[gpu] = sorted_dataclasses
-
+        list_accelerators_result = new_result

         if print_section_titles and not show_all:
             yield '\n\n'
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Cloud GPUs{colorama.Style.RESET_ALL}\n')

-        if not
+        if not list_accelerators_result:
             quantity_str = (f' with requested quantity {quantity}'
                             if quantity else '')
             cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.'
@@ -3969,7 +3999,7 @@ def show_gpus(
             yield 'To show available accelerators, run: sky show-gpus --all'
             return

-        for i, (gpu, items) in enumerate(
+        for i, (gpu, items) in enumerate(list_accelerators_result.items()):
             accelerator_table_headers = [
                 'GPU',
                 'QTY',
@@ -6039,7 +6069,11 @@ def api_logs(request_id: Optional[str], server_logs: bool,
     if request_id is not None and log_path is not None:
         raise click.BadParameter(
             'Only one of request ID and log path can be provided.')
-
+    # Only wrap request_id when it is provided; otherwise pass None so the
+    # server accepts log_path-only streaming.
+    req_id = (server_common.RequestId[None](request_id)
+              if request_id is not None else None)
+    sdk.stream_and_get(req_id, log_path, tail, follow=follow)


 @api.command('cancel', cls=_DocumentedCodeCommand)
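
The CLI changes above thread a generic server_common.RequestId[T] through the status helpers so each request ID is typed by the payload that sdk.get / sdk.stream_and_get will return. The actual RequestId implementation is not part of this diff; the slicing request_id[:8] in _async_call_or_wait suggests it behaves like a plain string, so here is a minimal sketch under that assumption (class body and usage are hypothetical):

    from typing import Any, Dict, Generic, List, TypeVar

    T = TypeVar('T')


    class RequestId(str, Generic[T]):
        """A str-based request ID carrying its payload type for type checkers.

        The type parameter is erased at runtime; it only lets a checker pair a
        request with the result shape its get()/stream_and_get() call returns.
        """


    # Hypothetical usage mirroring the annotations in the hunks above.
    jobs_queue_id: RequestId[List[Dict[str, Any]]] = RequestId('abc12345')
    print(jobs_queue_id[:8])  # str behaviour (e.g. short IDs) is preserved.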