skypilot-nightly 1.0.0.dev20250807__py3-none-any.whl → 1.0.0.dev20250812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/backends/backend_utils.py +57 -7
- sky/backends/cloud_vm_ray_backend.py +50 -8
- sky/client/cli/command.py +60 -26
- sky/client/sdk.py +132 -65
- sky/client/sdk_async.py +1 -1
- sky/core.py +10 -2
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/{6601-3e21152fe16da09c.js → 6601-06114c982db410b6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
- sky/dashboard/out/_next/static/chunks/{8969-318c3dca725e8e5d.js → 8969-c9686994ddafcf01.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-1e6de35d15a8d432.js → _app-491a4d699d95e808.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +21 -4
- sky/global_user_state.py +110 -1
- sky/jobs/client/sdk.py +27 -20
- sky/jobs/controller.py +2 -1
- sky/jobs/recovery_strategy.py +3 -0
- sky/jobs/server/core.py +4 -0
- sky/jobs/utils.py +9 -2
- sky/provision/__init__.py +3 -2
- sky/provision/aws/instance.py +5 -4
- sky/provision/azure/instance.py +5 -4
- sky/provision/cudo/instance.py +5 -4
- sky/provision/do/instance.py +5 -4
- sky/provision/fluidstack/instance.py +5 -4
- sky/provision/gcp/instance.py +5 -4
- sky/provision/hyperbolic/instance.py +5 -4
- sky/provision/kubernetes/instance.py +36 -6
- sky/provision/lambda_cloud/instance.py +5 -4
- sky/provision/nebius/instance.py +5 -4
- sky/provision/oci/instance.py +5 -4
- sky/provision/paperspace/instance.py +5 -4
- sky/provision/provisioner.py +6 -0
- sky/provision/runpod/instance.py +5 -4
- sky/provision/scp/instance.py +5 -5
- sky/provision/vast/instance.py +5 -5
- sky/provision/vsphere/instance.py +5 -4
- sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
- sky/serve/client/impl.py +11 -8
- sky/serve/client/sdk.py +7 -7
- sky/serve/serve_state.py +437 -340
- sky/serve/serve_utils.py +37 -3
- sky/serve/server/impl.py +2 -2
- sky/server/common.py +12 -8
- sky/server/constants.py +1 -1
- sky/setup_files/alembic.ini +4 -0
- sky/skypilot_config.py +4 -4
- sky/users/permission.py +1 -1
- sky/utils/cli_utils/status_utils.py +10 -1
- sky/utils/db/db_utils.py +53 -1
- sky/utils/db/migration_utils.py +5 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
- sky/utils/resource_checker.py +162 -21
- sky/volumes/client/sdk.py +4 -4
- sky/workspaces/core.py +210 -6
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/RECORD +87 -83
- sky/dashboard/out/_next/static/chunks/8056-019615038d6ce427.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6fd1d2d8441aa54b.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-76efbdad99742559.js +0 -1
- /sky/dashboard/out/_next/static/{YAirOGsV1z6B2RJ0VIUmD → Fuy7OzApYTUMz2QgoP7dP}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250807.dist-info → skypilot_nightly-1.0.0.dev20250812.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '1e311e80f4a9112a6d2c86bb78d4c225042cedbc'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250812'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/adaptors/kubernetes.py
CHANGED
@@ -142,8 +142,11 @@ def _load_config(context: Optional[str] = None):
             # show up in SkyPilot tasks. For now, we work around by using
             # DNS name instead of environment variables.
             # See issue: https://github.com/skypilot-org/skypilot/issues/2287
-
-
+            # Only set if not already present (preserving existing values)
+            if 'KUBERNETES_SERVICE_HOST' not in os.environ:
+                os.environ['KUBERNETES_SERVICE_HOST'] = 'kubernetes.default.svc'
+            if 'KUBERNETES_SERVICE_PORT' not in os.environ:
+                os.environ['KUBERNETES_SERVICE_PORT'] = '443'
             kubernetes.config.load_incluster_config()
         except kubernetes.config.config_exception.ConfigException:
             _load_config_from_kubeconfig()
sky/backends/backend_utils.py
CHANGED
@@ -121,6 +121,7 @@ CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
 _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2

 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
+WORKSPACE_LOCK_TIMEOUT_SECONDS = 10

 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -1772,8 +1773,9 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:

 def _query_cluster_status_via_cloud_api(
     handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
-) -> List[status_lib.ClusterStatus]:
-    """Returns the status of the cluster
+) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
+    """Returns the status of the cluster as a list of tuples corresponding
+    to the node status and an optional reason string for said status.

     Raises:
         exceptions.ClusterStatusFetchingError: the cluster status cannot be
@@ -1812,9 +1814,13 @@ def _query_cluster_status_via_cloud_api(
     region = provider_config.get('region') or provider_config.get(
         'location')
     zone = ray_config['provider'].get('availability_zone')
+    # TODO (kyuds): refactor cloud.query_status api to include reason.
+    # Currently not refactoring as this API is actually supposed to be
+    # deprecated soon.
     node_statuses = cloud.query_status(
         cluster_name_on_cloud,
         tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
+    node_statuses = [(status, None) for status in node_statuses]
     return node_statuses


@@ -2014,8 +2020,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:

     node_statuses = _query_cluster_status_via_cloud_api(handle)

-    all_nodes_up = (all(
-
+    all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
+                        for status in node_statuses) and
                     len(node_statuses) == handle.launched_nodes)

 def get_node_counts_from_ray_status(
@@ -2120,6 +2126,13 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
         # head-ip/worker-ips`.
         record['status'] = status_lib.ClusterStatus.UP
+        # Add cluster event for instance status check.
+        global_user_state.add_cluster_event(
+            cluster_name,
+            status_lib.ClusterStatus.UP,
+            'All nodes up + ray cluster healthy.',
+            global_user_state.ClusterEventType.STATUS_CHANGE,
+            nop_if_duplicate=True)
         global_user_state.add_or_update_cluster(cluster_name,
                                                 handle,
                                                 requested_resources=None,
@@ -2204,9 +2217,19 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # regardless of the ray cluster's health.
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
-
-
+    some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+    some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
+                                 for status in node_statuses)
+    is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
+
     if is_abnormal:
+        status_reason = ', '.join(
+            [status[1] for status in node_statuses if status[1] is not None])
+
+        if some_nodes_terminated:
+            init_reason = f'one or more nodes terminated ({status_reason})'
+        elif some_nodes_not_stopped:
+            init_reason = f'some nodes are up and some nodes are stopped ({status_reason})'
         logger.debug('The cluster is abnormal. Setting to INIT status. '
                      f'node_statuses: {node_statuses}')
         if record['autostop'] >= 0:
@@ -2290,6 +2313,12 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         # represent that the cluster is partially preempted.
         # TODO(zhwu): the definition of INIT should be audited/changed.
         # Adding a new status UNHEALTHY for abnormal status can be a choice.
+        global_user_state.add_cluster_event(
+            cluster_name,
+            status_lib.ClusterStatus.INIT,
+            f'Cluster is abnormal because {init_reason}. Transitioned to INIT.',
+            global_user_state.ClusterEventType.STATUS_CHANGE,
+            nop_if_duplicate=True)
         global_user_state.add_or_update_cluster(cluster_name,
                                                 handle,
                                                 requested_resources=None,
@@ -2300,6 +2329,9 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # STOPPED.
     backend = backends.CloudVmRayBackend()
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
+    global_user_state.add_cluster_event(
+        cluster_name, None, 'All nodes stopped, terminating cluster.',
+        global_user_state.ClusterEventType.STATUS_CHANGE)
     return global_user_state.get_cluster_from_name(cluster_name)


@@ -2760,6 +2792,9 @@ def get_clusters(
     refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
     all_users: bool = True,
+    # Internal only:
+    # pylint: disable=invalid-name
+    _include_is_managed: bool = False,
 ) -> List[Dict[str, Any]]:
     """Returns a list of cached or optionally refreshed cluster records.

@@ -2780,6 +2815,8 @@ def get_clusters(
             names.
         all_users: If True, return clusters from all users. If False, only
             return clusters from the current user.
+        _include_is_managed: Whether to force include clusters created by the
+            controller.

     Returns:
         A list of cluster records. If the cluster does not exist or has been
@@ -2788,6 +2825,13 @@ def get_clusters(
     records = global_user_state.get_clusters()
     current_user = common_utils.get_current_user()

+    # Filter out clusters created by the controller.
+    if (not env_options.Options.SHOW_DEBUG_INFO.get() and
+            not _include_is_managed):
+        records = [
+            record for record in records if not record.get('is_managed', False)
+        ]
+
     # Filter by user if requested
     if not all_users:
         records = [
@@ -3221,7 +3265,8 @@ def get_endpoints(cluster: str,
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Invalid endpoint {port!r}.') from None
     cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
-                                   cluster_names=[cluster]
+                                   cluster_names=[cluster],
+                                   _include_is_managed=True)
     if not cluster_records:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterNotUpError(
@@ -3311,3 +3356,8 @@ def cluster_status_lock_id(cluster_name: str) -> str:
 def cluster_file_mounts_lock_id(cluster_name: str) -> str:
     """Get the lock ID for cluster file mounts operations."""
     return f'{cluster_name}_file_mounts'
+
+
+def workspace_lock_id(workspace_name: str) -> str:
+    """Get the lock ID for workspace operations."""
+    return f'{workspace_name}_workspace'
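
With the hunks above, _query_cluster_status_via_cloud_api now returns (status, reason) pairs instead of bare statuses, and _update_cluster_status folds them into an overall health flag plus a human-readable reason for the INIT transition. A generic sketch of that folding step, kept independent of SkyPilot imports (helper name and signature are illustrative):

    from typing import Optional, Sequence, Tuple, TypeVar

    StatusT = TypeVar('StatusT')


    def summarize_node_statuses(
            node_statuses: Sequence[Tuple[StatusT, Optional[str]]],
            up_status: StatusT,
            expected_nodes: int) -> Tuple[bool, str]:
        """Collapse per-node (status, reason) pairs into (all_up, reasons)."""
        all_up = (len(node_statuses) == expected_nodes and
                  all(status == up_status for status, _ in node_statuses))
        # Keep only the reasons the cloud actually reported (None means none).
        reasons = ', '.join(r for _, r in node_statuses if r is not None)
        return all_up, reasons

In the diff itself, up_status corresponds to status_lib.ClusterStatus.UP and the joined reasons feed the 'Cluster is abnormal because ...' event message.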
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -1177,7 +1177,8 @@ class RetryingVmProvisioner(object):
                 local_wheel_path: pathlib.Path,
                 wheel_hash: str,
                 blocked_resources: Optional[Iterable[
-                    resources_lib.Resources]] = None
+                    resources_lib.Resources]] = None,
+                is_managed: Optional[bool] = None):
         self._blocked_resources: Set[resources_lib.Resources] = set()
         if blocked_resources:
             # blocked_resources is not None and not empty.
@@ -1189,6 +1190,7 @@ class RetryingVmProvisioner(object):
         self._requested_features = requested_features
         self._local_wheel_path = local_wheel_path
         self._wheel_hash = wheel_hash
+        self._is_managed = is_managed

     def _yield_zones(
             self, to_provision: resources_lib.Resources, num_nodes: int,
@@ -1522,8 +1524,16 @@ class RetryingVmProvisioner(object):
             cluster_handle=handle,
             requested_resources=requested_resources,
             ready=False,
+            is_managed=self._is_managed,
         )

+        # Add cluster event for actual provisioning start.
+        global_user_state.add_cluster_event(
+            cluster_name, status_lib.ClusterStatus.INIT,
+            f'Provisioning on {to_provision.cloud.display_name()} ' +
+            f'in {to_provision.region}',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
+
         global_user_state.set_owner_identity_for_cluster(
             cluster_name, cloud_user_identity)

@@ -2753,6 +2763,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._dag = None
         self._optimize_target = None
         self._requested_features = set()
+        self._dump_final_script = False
+        self._is_managed = False

         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
@@ -2769,6 +2781,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
         self._dump_final_script = kwargs.pop('dump_final_script', False)
+        self._is_managed = kwargs.pop('is_managed', False)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'

     def check_resources_fit_cluster(
@@ -2930,10 +2943,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     skip_unnecessary_provisioning)
             except locks.LockTimeout:
                 if not communicated_with_user:
-
-
-
-
+                    rich_utils.force_update_status(
+                        ux_utils.spinner_message('Launching - blocked by ' +
+                                                 'other requests ' +
+                                                 colorama.Style.RESET_ALL +
+                                                 colorama.Style.DIM +
+                                                 'Check concurrent requests: ' +
+                                                 'sky api status '))

     def _locked_provision(
         self,
@@ -2990,7 +3006,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self._requested_features,
             local_wheel_path,
             wheel_hash,
-            blocked_resources=task.blocked_resources
+            blocked_resources=task.blocked_resources,
+            is_managed=self._is_managed)
         log_path = os.path.join(self.log_dir, 'provision.log')
         rich_utils.force_update_status(
             ux_utils.spinner_message('Launching', log_path))
@@ -3000,6 +3017,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 break
             except exceptions.ResourcesUnavailableError as e:
                 log_path = retry_provisioner.log_dir + '/provision.log'
+
                 error_message = (
                     f'{colorama.Fore.RED}Failed to provision all '
                     f'possible launchable resources.'
@@ -3016,6 +3034,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 hint_message = (f'\n{retry_message} '
                                 f'{ux_utils.log_path_hint(log_path)}'
                                 f'{colorama.Style.RESET_ALL}')
+
+                # Add cluster event for retry.
+                global_user_state.add_cluster_event(
+                    cluster_name, status_lib.ClusterStatus.INIT,
+                    f'Retrying provisioning after {gap_seconds:.0f}s',
+                    global_user_state.ClusterEventType.STATUS_CHANGE)
+
                 raise exceptions.ExecutionRetryableError(
                     error_message,
                     hint=hint_message,
@@ -3067,6 +3092,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # and other necessary files to the VM.
         # 3. Run setup commands to install dependencies.
         # 4. Starting ray cluster and skylet.
+
+        # Add cluster event for runtime setup start
+        global_user_state.add_cluster_event(
+            handle.cluster_name, status_lib.ClusterStatus.INIT,
+            'Setting up SkyPilot runtime on cluster',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
+
         cluster_info = provisioner.post_provision_runtime_setup(
             repr(handle.launched_resources.cloud),
             resources_utils.ClusterName(handle.cluster_name,
@@ -3252,6 +3284,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             config_hash=config_hash,
             task_config=user_specified_task_config,
         )
+
+        # Add cluster event for successful provisioning.
+        global_user_state.add_cluster_event(
+            handle.cluster_name, status_lib.ClusterStatus.UP,
+            'Cluster successfully provisioned with ' +
+            f'{handle.launched_nodes} nodes',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
+
         usage_lib.messages.usage.update_final_cluster_status(
             status_lib.ClusterStatus.UP)
         # We still add the cluster to ssh config file on API server, this
@@ -4624,8 +4664,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             non_terminated_only=False)

         unexpected_node_state: Optional[Tuple[str, str]] = None
-        for node_id,
-
+        for node_id, node_status_tuple in node_status_dict.items():
+            node_status, reason = node_status_tuple
+            reason = '' if reason is None else f' ({reason})'
+            logger.debug(f'{node_id} status: {node_status}{reason}')
             # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
             # between "stopping/stopped" and "terminating/terminated",
             # so we allow for either status instead of casing on
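
Taken together, the hunks above make the provisioning path leave an auditable STATUS_CHANGE trail in global_user_state. Illustrative only (statuses and messages copied from the hunks, placeholders in angle brackets; ordering follows the code path of provision start, optional retry, runtime setup, success):

    # Roughly the (status, message) events recorded for one successful launch.
    EXPECTED_EVENT_TRAIL = [
        ('INIT', 'Provisioning on <cloud> in <region>'),
        ('INIT', 'Retrying provisioning after <N>s'),   # only when a retry happens
        ('INIT', 'Setting up SkyPilot runtime on cluster'),
        ('UP', 'Cluster successfully provisioned with <N> nodes'),
    ]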
sky/client/cli/command.py
CHANGED
@@ -35,7 +35,7 @@ import sys
 import traceback
 import typing
 from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
-                    Union)
+                    TypeVar, Union)

 import click
 import colorama
@@ -116,6 +116,8 @@ _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
                               '`sky jobs launch`. `{command}` supports a '
                               'single task only.')

+T = TypeVar('T')
+

 def _get_cluster_records_and_set_ssh_config(
     clusters: Optional[List[str]],
@@ -224,8 +226,8 @@ def _get_glob_matches(candidate_names: List[str],
     return list(set(glob_storages))


-def _async_call_or_wait(request_id:
-                        request_name: str) -> Any:
+def _async_call_or_wait(request_id: server_common.RequestId[T],
+                        async_call: bool, request_name: str) -> Any:
     short_request_id = request_id[:8]
     if not async_call:
         try:
@@ -1411,7 +1413,7 @@ def exec(


 def _handle_jobs_queue_request(
-        request_id: str,
+        request_id: server_common.RequestId[List[Dict[str, Any]]],
         show_all: bool,
         show_user: bool,
         max_num_jobs_to_show: Optional[int],
@@ -1492,7 +1494,7 @@ def _handle_jobs_queue_request(


 def _handle_services_request(
-        request_id: str,
+        request_id: server_common.RequestId[List[Dict[str, Any]]],
         service_names: Optional[List[str]],
         show_all: bool,
         show_endpoint: bool,
@@ -1879,17 +1881,19 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
             skip_finished=True,
             all_users=all_users)

-    def submit_services(
+    def submit_services(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         return serve_lib.status(service_names=None)

-    def submit_pools(
+    def submit_pools(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         try:
             return managed_jobs.pool_status(pool_names=None)
         except exceptions.APINotSupportedError as e:
             logger.debug(f'Pools are not supported in the remote server: {e}')
             return None

-    def submit_workspace() -> Optional[str]:
+    def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
         try:
             return sdk.workspaces()
         except RuntimeError:
@@ -1928,11 +1932,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     if not (ip or show_endpoints):
         workspace_request_id = workspace_request_future.result()

-        managed_jobs_queue_request_id = (
-
-
+        managed_jobs_queue_request_id = (server_common.RequestId()
+                                         if not managed_jobs_queue_request_id else
+                                         managed_jobs_queue_request_id)
+        service_status_request_id = (server_common.RequestId()
+                                     if not service_status_request_id else
                                      service_status_request_id)
-        pool_status_request_id = (
+        pool_status_request_id = (server_common.RequestId()
+                                  if not pool_status_request_id else
                                   pool_status_request_id)

     # Phase 3: Get cluster records and handle special cases
@@ -1957,7 +1964,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     if workspace_request_id is not None:
         all_workspaces = sdk.get(workspace_request_id)
     else:
-        all_workspaces =
+        all_workspaces = {constants.SKYPILOT_DEFAULT_WORKSPACE: {}}
     active_workspace = skypilot_config.get_active_workspace()
     show_workspace = len(all_workspaces) > 1
     _show_enabled_infra(active_workspace, show_workspace)
@@ -2974,6 +2981,8 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
     controller = controller_utils.Controllers.from_name(controller_name)
     assert controller is not None, controller_name

+    # TODO(tian): We also need to check pools after we allow running pools on
+    # jobs controller.
     with rich_utils.client_status(
             '[bold cyan]Checking for in-progress managed jobs[/]'):
         try:
@@ -3070,6 +3079,21 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
             # controller being STOPPED or being firstly launched, i.e., there is
             # no in-prgress services.
             services = []
+        except exceptions.InconsistentConsolidationModeError:
+            # If this error is raised, it means the user switched to the
+            # consolidation mode but the previous controller cluster is still
+            # running. We should allow the user to tear down the controller
+            # cluster in this case.
+            with skypilot_config.override_skypilot_config(
+                    {'serve': {
+                        'controller': {
+                            'consolidation_mode': False
+                        }
+                    }}):
+                # Check again with the consolidation mode disabled. This is to
+                # make sure there is no in-progress services.
+                request_id = serve_lib.status(service_names=None)
+                services = sdk.stream_and_get(request_id)

     if services:
         service_names = [service['name'] for service in services]
@@ -3836,7 +3860,7 @@ def show_gpus(
             yield k8s_messages
             yield '\n\n'

-
+        list_accelerator_counts_result = sdk.stream_and_get(
             sdk.list_accelerator_counts(
                 gpus_only=True,
                 clouds=clouds_to_list,
@@ -3853,14 +3877,20 @@ def show_gpus(

         # "Common" GPUs
         for gpu in catalog.get_common_gpus():
-            if gpu in
-                gpu_table.add_row([
+            if gpu in list_accelerator_counts_result:
+                gpu_table.add_row([
+                    gpu,
+                    _list_to_str(list_accelerator_counts_result.pop(gpu))
+                ])
         yield from gpu_table.get_string()

         # Google TPUs
         for tpu in catalog.get_tpus():
-            if tpu in
-                tpu_table.add_row([
+            if tpu in list_accelerator_counts_result:
+                tpu_table.add_row([
+                    tpu,
+                    _list_to_str(list_accelerator_counts_result.pop(tpu))
+                ])
         if tpu_table.get_string():
             yield '\n\n'
             yield from tpu_table.get_string()
@@ -3868,7 +3898,7 @@ def show_gpus(
         # Other GPUs
         if show_all:
             yield '\n\n'
-            for gpu, qty in sorted(
+            for gpu, qty in sorted(list_accelerator_counts_result.items()):
                 other_table.add_row([gpu, _list_to_str(qty)])
             yield from other_table.get_string()
             yield '\n\n'
@@ -3919,7 +3949,7 @@ def show_gpus(

         # For clouds other than Kubernetes, get the accelerator details
         # Case-sensitive
-
+        list_accelerators_result = sdk.stream_and_get(
             sdk.list_accelerators(gpus_only=True,
                                   name_filter=name,
                                   quantity_filter=quantity,
@@ -3935,8 +3965,8 @@ def show_gpus(
         # - Group by cloud
         # - Sort within each group by prices
         # - Sort groups by each cloud's (min price, min spot price)
-        new_result = {}
-        for i, (gpu, items) in enumerate(
+        new_result: Dict[str, List[catalog_common.InstanceTypeInfo]] = {}
+        for i, (gpu, items) in enumerate(list_accelerators_result.items()):
             df = pd.DataFrame([t._asdict() for t in items])
             # Determine the minimum prices for each cloud.
             min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'),
@@ -3954,14 +3984,14 @@ def show_gpus(
                 for row in df.to_records(index=False)
             ]
             new_result[gpu] = sorted_dataclasses
-
+        list_accelerators_result = new_result

         if print_section_titles and not show_all:
             yield '\n\n'
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Cloud GPUs{colorama.Style.RESET_ALL}\n')

-        if not
+        if not list_accelerators_result:
             quantity_str = (f' with requested quantity {quantity}'
                             if quantity else '')
             cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.'
@@ -3969,7 +3999,7 @@ def show_gpus(
             yield 'To show available accelerators, run: sky show-gpus --all'
             return

-        for i, (gpu, items) in enumerate(
+        for i, (gpu, items) in enumerate(list_accelerators_result.items()):
             accelerator_table_headers = [
                 'GPU',
                 'QTY',
@@ -6039,7 +6069,11 @@ def api_logs(request_id: Optional[str], server_logs: bool,
     if request_id is not None and log_path is not None:
         raise click.BadParameter(
             'Only one of request ID and log path can be provided.')
-
+    # Only wrap request_id when it is provided; otherwise pass None so the
+    # server accepts log_path-only streaming.
+    req_id = (server_common.RequestId[None](request_id)
+              if request_id is not None else None)
+    sdk.stream_and_get(req_id, log_path, tail, follow=follow)


 @api.command('cancel', cls=_DocumentedCodeCommand)
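
The CLI changes above thread a generic server_common.RequestId[T] through the status helpers so each request ID is typed by the payload that sdk.get / sdk.stream_and_get will return. The actual RequestId implementation is not part of this diff; the slicing request_id[:8] in _async_call_or_wait suggests it behaves like a plain string, so here is a minimal sketch under that assumption (class body and usage are hypothetical):

    from typing import Any, Dict, Generic, List, TypeVar

    T = TypeVar('T')


    class RequestId(str, Generic[T]):
        """A str-based request ID carrying its payload type for type checkers.

        The type parameter is erased at runtime; it only lets a checker pair a
        request with the result shape its get()/stream_and_get() call returns.
        """


    # Hypothetical usage mirroring the annotations in the hunks above.
    jobs_queue_id: RequestId[List[Dict[str, Any]]] = RequestId('abc12345')
    print(jobs_queue_id[:8])  # str behaviour (e.g. short IDs) is preserved.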