PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (123) hide show

sky/__init__.py +2 -2
sky/backends/backend.py +10 -0
sky/backends/backend_utils.py +207 -79
sky/backends/cloud_vm_ray_backend.py +37 -13
sky/backends/local_docker_backend.py +9 -0
sky/client/cli/command.py +112 -53
sky/client/common.py +4 -2
sky/client/sdk.py +17 -7
sky/client/sdk_async.py +4 -2
sky/clouds/kubernetes.py +2 -1
sky/clouds/runpod.py +20 -7
sky/core.py +9 -54
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_buildManifest.js +1 -1
sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +1 -0
sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
sky/dashboard/out/_next/static/chunks/9037-d0c00018a5ba198c.js +6 -0
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ad77b12fc736dca3.js +16 -0
sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-8e64d11e58eab5cb.js} +1 -1
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/config.html +1 -1
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/infra/[context].html +1 -1
sky/dashboard/out/infra.html +1 -1
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs/pools/[pool].html +1 -1
sky/dashboard/out/jobs.html +1 -1
sky/dashboard/out/users.html +1 -1
sky/dashboard/out/volumes.html +1 -1
sky/dashboard/out/workspace/new.html +1 -1
sky/dashboard/out/workspaces/[name].html +1 -1
sky/dashboard/out/workspaces.html +1 -1
sky/data/mounting_utils.py +19 -10
sky/execution.py +4 -2
sky/global_user_state.py +271 -67
sky/jobs/client/sdk.py +10 -1
sky/jobs/constants.py +2 -0
sky/jobs/controller.py +11 -7
sky/jobs/server/core.py +5 -3
sky/jobs/server/server.py +15 -11
sky/jobs/utils.py +1 -1
sky/logs/agent.py +30 -3
sky/logs/aws.py +9 -19
sky/provision/__init__.py +2 -1
sky/provision/aws/instance.py +2 -1
sky/provision/azure/instance.py +2 -1
sky/provision/cudo/instance.py +2 -2
sky/provision/do/instance.py +2 -2
sky/provision/docker_utils.py +41 -19
sky/provision/fluidstack/instance.py +2 -2
sky/provision/gcp/instance.py +2 -1
sky/provision/hyperbolic/instance.py +2 -1
sky/provision/instance_setup.py +1 -1
sky/provision/kubernetes/instance.py +134 -8
sky/provision/lambda_cloud/instance.py +2 -1
sky/provision/nebius/instance.py +2 -1
sky/provision/oci/instance.py +2 -1
sky/provision/paperspace/instance.py +2 -2
sky/provision/primeintellect/instance.py +2 -2
sky/provision/provisioner.py +1 -0
sky/provision/runpod/__init__.py +2 -0
sky/provision/runpod/instance.py +2 -2
sky/provision/scp/instance.py +2 -2
sky/provision/seeweb/instance.py +2 -1
sky/provision/vast/instance.py +2 -1
sky/provision/vsphere/instance.py +6 -5
sky/schemas/api/responses.py +2 -1
sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
sky/serve/autoscalers.py +2 -0
sky/serve/client/impl.py +45 -19
sky/serve/replica_managers.py +12 -5
sky/serve/serve_utils.py +5 -7
sky/serve/server/core.py +9 -6
sky/serve/server/impl.py +78 -25
sky/serve/server/server.py +4 -5
sky/serve/service_spec.py +33 -0
sky/server/constants.py +1 -1
sky/server/daemons.py +2 -3
sky/server/requests/executor.py +56 -6
sky/server/requests/payloads.py +32 -8
sky/server/requests/preconditions.py +2 -3
sky/server/rest.py +2 -0
sky/server/server.py +28 -19
sky/server/stream_utils.py +34 -12
sky/setup_files/dependencies.py +5 -2
sky/setup_files/setup.py +44 -44
sky/skylet/constants.py +4 -1
sky/skylet/events.py +42 -0
sky/templates/jobs-controller.yaml.j2 +3 -0
sky/templates/kubernetes-ray.yml.j2 +24 -18
sky/usage/usage_lib.py +3 -0
sky/utils/cli_utils/status_utils.py +4 -5
sky/utils/context.py +104 -29
sky/utils/controller_utils.py +7 -6
sky/utils/db/db_utils.py +5 -1
sky/utils/db/migration_utils.py +1 -1
sky/utils/kubernetes/create_cluster.sh +13 -28
sky/utils/kubernetes/delete_cluster.sh +10 -7
sky/utils/kubernetes/generate_kind_config.py +6 -66
sky/utils/kubernetes/kubernetes_deploy_utils.py +194 -38
sky/utils/kubernetes_enums.py +5 -0
sky/utils/ux_utils.py +35 -1
sky/utils/yaml_utils.py +9 -0
sky/volumes/client/sdk.py +44 -8
sky/volumes/server/core.py +1 -0
sky/volumes/server/server.py +33 -7
sky/volumes/volume.py +35 -28
{skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/METADATA +38 -33
{skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/RECORD +118 -117
sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +0 -6
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → VXU6_xE28M55BOdwmUUJS}/_ssgManifest.js +0 -0
{skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250926.dist-info}/top_level.txt +0 -0

sky/__init__.py CHANGED Viewed

@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '5fc4b25c5fd6b2833aabf992583d1b1e3f843f42'
+_SKYPILOT_COMMIT_SHA = '827d534c8bbfa61b895467b9431283e923dd9841'
 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250922'
+__version__ = '1.0.0.dev20250926'
 __root_dir__ = directory_utils.get_sky_dir()

sky/backends/backend.py CHANGED Viewed

@@ -95,6 +95,12 @@ class Backend(Generic[_ResourceHandleType]):
                      envs_and_secrets: Dict[str, str]) -> None:
         return self._sync_workdir(handle, workdir, envs_and_secrets)
+    @timeline.event
+    @usage_lib.messages.usage.update_runtime('download_file')
+    def download_file(self, handle: _ResourceHandleType, local_file_path: str,
+                      remote_file_path: str) -> None:
+        return self._download_file(handle, local_file_path, remote_file_path)
     @timeline.event
     @usage_lib.messages.usage.update_runtime('sync_file_mounts')
     def sync_file_mounts(
@@ -172,6 +178,10 @@ class Backend(Generic[_ResourceHandleType]):
                       envs_and_secrets: Dict[str, str]) -> None:
         raise NotImplementedError
+    def _download_file(self, handle: _ResourceHandleType, local_file_path: str,
+                       remote_file_path: str) -> None:
+        raise NotImplementedError
     def _sync_file_mounts(
         self,
         handle: _ResourceHandleType,

sky/backends/backend_utils.py CHANGED Viewed

@@ -52,6 +52,7 @@ from sky.utils import cluster_utils
 from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
+from sky.utils import context as context_lib
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
@@ -796,7 +797,7 @@ def write_cluster_config(
             cloud=str(cloud).lower(),
             region=region.name,
             keys=('use_ssm',),
-            default_value=False)
+            default_value=None)
         if use_ssm and ssh_proxy_command is not None:
             raise exceptions.InvalidCloudConfigs(
@@ -804,15 +805,18 @@ def write_cluster_config(
                 f'is already set to {ssh_proxy_command!r}. Please remove '
                 'ssh_proxy_command or set use_ssm to false.')
-        if not use_ssm and use_internal_ips and ssh_proxy_command is None:
-            logger.warning(
-                f'{colorama.Fore.YELLOW}'
-                'use_internal_ips is set to true, '
-                'but ssh_proxy_command is not set. Defaulting to '
-                'using SSM. Specify ssh_proxy_command to use a different '
-                'https://docs.skypilot.co/en/latest/reference/config.html#'
-                f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
-            use_ssm = True
+        if use_internal_ips and ssh_proxy_command is None:
+            # Only if use_ssm is explicitly not set, we default to using SSM.
+            if use_ssm is None:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}'
+                    'use_internal_ips is set to true, '
+                    'but ssh_proxy_command is not set. Defaulting to '
+                    'using SSM. Specify ssh_proxy_command to use a different '
+                    'https://docs.skypilot.co/en/latest/reference/config.html#'
+                    f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
+                use_ssm = True
         if use_ssm:
             aws_profile = os.environ.get('AWS_PROFILE', None)
             profile_str = f'--profile {aws_profile}' if aws_profile else ''
@@ -1843,7 +1847,9 @@ def check_owner_identity(cluster_name: str) -> None:
     """
     if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
         return
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(cluster_name,
+                                                     include_user_info=False,
+                                                     summary_response=True)
     if record is None:
         return
     handle = record['handle']
@@ -1930,6 +1936,7 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
     }
+@context_utils.cancellation_guard
 def _query_cluster_status_via_cloud_api(
     handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
 ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
@@ -2137,7 +2144,10 @@ def check_can_clone_disk_and_override_task(
     return task, handle
-def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+def _update_cluster_status(
+        cluster_name: str,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
     """Update the cluster status.
     The cluster status is updated by checking ray cluster and real status from
@@ -2164,7 +2174,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
           fetched from the cloud provider or there are leaked nodes causing
           the node number larger than expected.
     """
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
     if record is None:
         return None
     handle = record['handle']
@@ -2340,7 +2353,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             ready=True,
             is_launch=False,
             existing_cluster_hash=record['cluster_hash'])
-        return global_user_state.get_cluster_from_name(cluster_name)
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
     # All cases below are transitioning the cluster to non-UP states.
     launched_resources = handle.launched_resources.assert_launchable()
@@ -2552,7 +2568,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             ready=False,
             is_launch=False,
             existing_cluster_hash=record['cluster_hash'])
-        return global_user_state.get_cluster_from_name(cluster_name)
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
     verb = 'terminated' if to_terminate else 'stopped'
@@ -2567,7 +2586,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         nop_if_duplicate=True,
     )
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
-    return global_user_state.get_cluster_from_name(cluster_name)
+    return global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
 def _must_refresh_cluster_status(
@@ -2589,12 +2611,13 @@ def _must_refresh_cluster_status(
 def refresh_cluster_record(
-    cluster_name: str,
-    *,
-    force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
-    acquire_per_cluster_status_lock: bool = True,
-    cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
-) -> Optional[Dict[str, Any]]:
+        cluster_name: str,
+        *,
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+        acquire_per_cluster_status_lock: bool = True,
+        cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
     """Refresh the cluster, and return the possibly updated record.
     The function will update the cached cluster status in the global state. For
@@ -2634,7 +2657,11 @@ def refresh_cluster_record(
           the node number larger than expected.
     """
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    ctx = context_lib.get()
+    record = global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
     if record is None:
         return None
     # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
@@ -2653,12 +2680,16 @@ def refresh_cluster_record(
         # Loop until we have an up-to-date status or until we acquire the lock.
         while True:
+            # Check if the context is canceled.
+            if ctx is not None and ctx.is_canceled():
+                raise asyncio.CancelledError()
             # Check to see if we can return the cached status.
             if not _must_refresh_cluster_status(record, force_refresh_statuses):
                 return record
             if not acquire_per_cluster_status_lock:
-                return _update_cluster_status(cluster_name)
+                return _update_cluster_status(cluster_name, include_user_info,
+                                              summary_response)
             # Try to acquire the lock so we can fetch the status.
             try:
@@ -2666,12 +2697,16 @@ def refresh_cluster_record(
                     # Check the cluster status again, since it could have been
                     # updated between our last check and acquiring the lock.
                     record = global_user_state.get_cluster_from_name(
-                        cluster_name)
+                        cluster_name,
+                        include_user_info=include_user_info,
+                        summary_response=summary_response)
                     if record is None or not _must_refresh_cluster_status(
                             record, force_refresh_statuses):
                         return record
                     # Update and return the cluster status.
-                    return _update_cluster_status(cluster_name)
+                    return _update_cluster_status(cluster_name,
+                                                  include_user_info,
+                                                  summary_response)
             except locks.LockTimeout:
                 # lock.acquire() will throw a Timeout exception if the lock is not
@@ -2692,7 +2727,10 @@ def refresh_cluster_record(
             time.sleep(lock.poll_interval)
             # Refresh for next loop iteration.
-            record = global_user_state.get_cluster_from_name(cluster_name)
+            record = global_user_state.get_cluster_from_name(
+                cluster_name,
+                include_user_info=include_user_info,
+                summary_response=summary_response)
             if record is None:
                 return None
@@ -2717,7 +2755,9 @@ def refresh_cluster_status_handle(
         cluster_name,
         force_refresh_statuses=force_refresh_statuses,
         acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
-        cluster_status_lock_timeout=cluster_status_lock_timeout)
+        cluster_status_lock_timeout=cluster_status_lock_timeout,
+        include_user_info=False,
+        summary_response=True)
     if record is None:
         return None, None
     return record['status'], record['handle']
@@ -2768,7 +2808,9 @@ def check_cluster_available(
         exceptions.CloudUserIdentityError: if we fail to get the current user
           identity.
     """
-    record = global_user_state.get_cluster_from_name(cluster_name)
+    record = global_user_state.get_cluster_from_name(cluster_name,
+                                                     include_user_info=False,
+                                                     summary_response=True)
     if dryrun:
         assert record is not None, cluster_name
         return record['handle']
@@ -2955,7 +2997,8 @@ def is_controller_accessible(
             f'fatal, but {controller_name} commands/calls may hang or return '
             'stale information, when the controller is not up.\n'
             f'  Details: {common_utils.format_exception(e, use_bracket=True)}')
-        record = global_user_state.get_cluster_from_name(cluster_name)
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
         if record is not None:
             controller_status, handle = record['status'], record['handle']
             # We check the connection even if the cluster has a cached status UP
@@ -3012,22 +3055,98 @@ class CloudFilter(enum.Enum):
     LOCAL = 'local'
-def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+def _get_glob_clusters(
+        clusters: List[str],
+        silent: bool = False,
+        workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
     """Returns a list of clusters that match the glob pattern."""
     glob_clusters = []
     for cluster in clusters:
-        glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+        glob_cluster = global_user_state.get_glob_cluster_names(
+            cluster, workspaces_filter=workspaces_filter)
         if len(glob_cluster) == 0 and not silent:
             logger.info(f'Cluster {cluster} not found.')
         glob_clusters.extend(glob_cluster)
     return list(set(glob_clusters))
+def _refresh_cluster(
+        cluster_name: str,
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
+    try:
+        record = refresh_cluster_record(
+            cluster_name,
+            force_refresh_statuses=force_refresh_statuses,
+            acquire_per_cluster_status_lock=True,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+    except (exceptions.ClusterStatusFetchingError,
+            exceptions.CloudUserIdentityError,
+            exceptions.ClusterOwnerIdentityMismatchError) as e:
+        # Do not fail the entire refresh process. The caller will
+        # handle the 'UNKNOWN' status, and collect the errors into
+        # a table.
+        record = {'status': 'UNKNOWN', 'error': e}
+    return record
+def refresh_cluster_records() -> None:
+    """Refreshes the status of all clusters, except managed clusters.
+    Used by the background status refresh daemon.
+    This function is a stripped-down version of get_clusters, with only the
+    bare bones refresh logic.
+    Returns:
+        None
+    Raises:
+        None
+    """
+    exclude_managed_clusters = True
+    if env_options.Options.SHOW_DEBUG_INFO.get():
+        exclude_managed_clusters = False
+    cluster_names = global_user_state.get_cluster_names(
+        exclude_managed_clusters=exclude_managed_clusters,)
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    request = requests_lib.get_request_tasks(
+        req_filter=requests_lib.RequestTaskFilter(
+            status=[requests_lib.RequestStatus.RUNNING],
+            cluster_names=cluster_names,
+            include_request_names=['sky.launch']))
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in request
+    }
+    cluster_names_without_launch_request = [
+        cluster_name for cluster_name in cluster_names
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+    def _refresh_cluster_record(cluster_name):
+        return _refresh_cluster(cluster_name,
+                                force_refresh_statuses=set(
+                                    status_lib.ClusterStatus),
+                                include_user_info=False,
+                                summary_response=True)
+    if len(cluster_names) > 0:
+        # Do not refresh the clusters that have an active launch request.
+        subprocess_utils.run_in_parallel(_refresh_cluster_record,
+                                         cluster_names_without_launch_request)
 def get_clusters(
     refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
     all_users: bool = True,
     include_credentials: bool = False,
+    summary_response: bool = False,
     # Internal only:
     # pylint: disable=invalid-name
     _include_is_managed: bool = False,
@@ -3055,10 +3174,23 @@ def get_clusters(
         A list of cluster records. If the cluster does not exist or has been
         terminated, the record will be omitted from the returned list.
     """
+    accessible_workspaces = workspaces_core.get_workspaces()
     if cluster_names is not None:
         if isinstance(cluster_names, str):
             cluster_names = [cluster_names]
-        cluster_names = _get_glob_clusters(cluster_names, silent=True)
+        non_glob_cluster_names = []
+        glob_cluster_names = []
+        for cluster_name in cluster_names:
+            if ux_utils.is_glob_pattern(cluster_name):
+                glob_cluster_names.append(cluster_name)
+            else:
+                non_glob_cluster_names.append(cluster_name)
+        cluster_names = non_glob_cluster_names
+        if glob_cluster_names:
+            cluster_names += _get_glob_clusters(
+                glob_cluster_names,
+                silent=True,
+                workspaces_filter=accessible_workspaces)
     exclude_managed_clusters = False
     if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
@@ -3066,13 +3198,12 @@ def get_clusters(
     user_hashes_filter = None
     if not all_users:
         user_hashes_filter = {common_utils.get_current_user().id}
-    accessible_workspaces = workspaces_core.get_workspaces()
     records = global_user_state.get_clusters(
         exclude_managed_clusters=exclude_managed_clusters,
         user_hashes_filter=user_hashes_filter,
         workspaces_filter=accessible_workspaces,
         cluster_names=cluster_names,
-    )
+        summary_response=summary_response)
     yellow = colorama.Fore.YELLOW
     bright = colorama.Style.BRIGHT
@@ -3080,12 +3211,10 @@ def get_clusters(
     if cluster_names is not None:
         record_names = {record['name'] for record in records}
-        not_exist_cluster_names = [
-            cluster_name for cluster_name in cluster_names
-            if cluster_name not in record_names
-        ]
-        if not_exist_cluster_names:
-            clusters_str = ', '.join(not_exist_cluster_names)
+        not_found_clusters = ux_utils.get_non_matched_query(
+            cluster_names, record_names)
+        if not_found_clusters:
+            clusters_str = ', '.join(not_found_clusters)
             logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
     def _get_records_with_handle(
@@ -3096,7 +3225,7 @@ def get_clusters(
             if record is not None and record['handle'] is not None
         ]
-    def _update_records_with_resources_str(
+    def _update_records_with_handle_info(
             records: List[Optional[Dict[str, Any]]]) -> None:
         """Add resource str to record"""
         for record in _get_records_with_handle(records):
@@ -3107,6 +3236,8 @@ def get_clusters(
             record[
                 'resources_str_full'] = resources_utils.get_readable_resources_repr(
                     handle, simplify=False)
+            if not summary_response:
+                record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
     def _update_records_with_credentials(
             records: List[Optional[Dict[str, Any]]]) -> None:
@@ -3146,7 +3277,7 @@ def get_clusters(
             record['credentials'] = credential
     def _update_records_with_resources(
-            records: List[Optional[Dict[str, Any]]]) -> None:
+        records: List[Optional[Dict[str, Any]]],) -> None:
         """Add the resources to the record."""
         for record in _get_records_with_handle(records):
             handle = record['handle']
@@ -3165,8 +3296,8 @@ def get_clusters(
                 f'{handle.launched_resources.accelerators}'
                 if handle.launched_resources.accelerators else None)
-    # Add auth_config to the records
-    _update_records_with_resources_str(records)
+    # Add handle info to the records
+    _update_records_with_handle_info(records)
     if include_credentials:
         _update_records_with_credentials(records)
     if refresh == common.StatusRefreshMode.NONE:
@@ -3187,47 +3318,44 @@ def get_clusters(
     else:
         force_refresh_statuses = None
-    def _refresh_cluster(cluster_name):
-        # TODO(syang): we should try not to leak
-        # request info in backend_utils.py.
-        # Refactor this to use some other info to
-        # determine if a launch is in progress.
-        request = requests_lib.get_request_tasks(
-            req_filter=requests_lib.RequestTaskFilter(
-                status=[requests_lib.RequestStatus.RUNNING],
-                cluster_names=[cluster_name],
-                include_request_names=['sky.launch']))
-        if len(request) > 0:
-            # There is an active launch request on the cluster,
-            # so we don't want to update the cluster status until
-            # the request is completed.
-            logger.debug(f'skipping refresh for cluster {cluster_name} '
-                         'as there is an active launch request')
-            return global_user_state.get_cluster_from_name(cluster_name)
-        try:
-            record = refresh_cluster_record(
-                cluster_name,
-                force_refresh_statuses=force_refresh_statuses,
-                acquire_per_cluster_status_lock=True)
-            _update_records_with_resources_str([record])
+    def _refresh_cluster_record(cluster_name):
+        record = _refresh_cluster(cluster_name,
+                                  force_refresh_statuses=force_refresh_statuses,
+                                  include_user_info=True,
+                                  summary_response=summary_response)
+        if 'error' not in record:
+            _update_records_with_handle_info([record])
             if include_credentials:
                 _update_records_with_credentials([record])
-        except (exceptions.ClusterStatusFetchingError,
-                exceptions.CloudUserIdentityError,
-                exceptions.ClusterOwnerIdentityMismatchError) as e:
-            # Do not fail the entire refresh process. The caller will
-            # handle the 'UNKNOWN' status, and collect the errors into
-            # a table.
-            record = {'status': 'UNKNOWN', 'error': e}
-        progress.update(task, advance=1)
+            progress.update(task, advance=1)
         return record
     cluster_names = [record['name'] for record in records]
-    updated_records = []
-    if len(cluster_names) > 0:
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    request = requests_lib.get_request_tasks(
+        req_filter=requests_lib.RequestTaskFilter(
+            status=[requests_lib.RequestStatus.RUNNING],
+            cluster_names=cluster_names,
+            include_request_names=['sky.launch']))
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in request
+    }
+    cluster_names_without_launch_request = [
+        cluster_name for cluster_name in cluster_names
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+    # for clusters that have an active launch request, we do not refresh the status
+    updated_records = [
+        record for record in records
+        if record['name'] in cluster_names_with_launch_request
+    ]
+    if len(cluster_names_without_launch_request) > 0:
         with progress:
             updated_records = subprocess_utils.run_in_parallel(
-                _refresh_cluster, cluster_names)
+                _refresh_cluster_record, cluster_names_without_launch_request)
     # Show information for removed clusters.
     kept_records = []

sky/backends/cloud_vm_ray_backend.py CHANGED Viewed

@@ -116,6 +116,9 @@ Path = str
 SKY_REMOTE_APP_DIR = backend_utils.SKY_REMOTE_APP_DIR
 SKY_REMOTE_WORKDIR = constants.SKY_REMOTE_WORKDIR
+# Unset RAY_RAYLET_PID to prevent the Ray cluster in the SkyPilot runtime
+# from interfering with the Ray cluster in the user's task (if any).
+UNSET_RAY_ENV_VARS = ['RAY_RAYLET_PID']
 logger = sky_logging.init_logger(__name__)
@@ -712,6 +715,8 @@ class RayCodeGen:
             done
             echo "skypilot: cached mount uploaded complete"
         fi""")
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
         self._code += [
             sky_env_vars_dict_str,
             textwrap.dedent(f"""\
@@ -721,6 +726,7 @@ class RayCodeGen:
             script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
         if script is not None:
+            script=f'{unset_ray_env_vars}; {{script}}'
             script += rclone_flush_script
             sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
@@ -3261,9 +3267,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Usage Collection:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, launched_resources)
-        record = global_user_state.get_cluster_from_name(cluster_name)
-        if record is not None:
-            usage_lib.messages.usage.update_cluster_status(record['status'])
+        status = global_user_state.get_status_from_cluster_name(cluster_name)
+        if status is not None:
+            usage_lib.messages.usage.update_cluster_status(status)
         assert launched_resources.region is not None, handle
@@ -3532,8 +3538,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                             error_message + '\n' + str(e),
                             failover_history=e.failover_history) from None
             if dryrun:
-                record = global_user_state.get_cluster_from_name(cluster_name)
-                return record['handle'] if record is not None else None, False
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)
+                return handle if handle is not None else None, False
             if config_dict['provisioning_skipped']:
                 # Skip further provisioning.
@@ -3541,10 +3548,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # ('handle', 'provision_record', 'resources_vars')
                 # We need to return the handle - but it should be the existing
                 # handle for the cluster.
-                record = global_user_state.get_cluster_from_name(cluster_name)
-                assert record is not None and record['handle'] is not None, (
-                    cluster_name, record)
-                return record['handle'], True
+                handle = global_user_state.get_handle_from_cluster_name(
+                    cluster_name)
+                assert handle is not None, (cluster_name, handle)
+                return handle, True
             if 'provision_record' in config_dict:
                 # New provisioner is used here.
@@ -3939,6 +3946,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
         # Need this `-i` option to make sure `source ~/.bashrc` work
         setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
+        unset_ray_env_vars = ' && '.join(
+            [f'unset {var}' for var in UNSET_RAY_ENV_VARS])
+        setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
         runners = handle.get_command_runners(avoid_ssh_control=True)
         def _setup_node(node_id: int) -> None:
@@ -4088,6 +4098,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         logger.info(
             ux_utils.finishing_message('Setup completed.', setup_log_path))
+    def _download_file(self, handle: CloudVmRayResourceHandle,
+                       local_file_path: str, remote_file_path: str) -> None:
+        """Syncs file from remote to local."""
+        runners = handle.get_command_runners()
+        head_runner = runners[0]
+        head_runner.rsync(
+            source=local_file_path,
+            target=remote_file_path,
+            up=False,
+            stream_logs=False,
+        )
     def _exec_code_on_head(
         self,
         handle: CloudVmRayResourceHandle,
@@ -4992,10 +5014,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f'{handle.cluster_name!r}. Assuming the cluster is still '
                     'up.')
         if not cluster_status_fetched:
-            record = global_user_state.get_cluster_from_name(
+            status = global_user_state.get_status_from_cluster_name(
                 handle.cluster_name)
-            prev_cluster_status = record[
-                'status'] if record is not None else None
+            prev_cluster_status = status if status is not None else None
         if prev_cluster_status is None:
             # When the cluster is not in the cluster table, we guarantee that
             # all related resources / cache / config are cleaned up, i.e. it
@@ -5568,7 +5589,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             exceptions.InvalidClusterNameError: If the cluster name is invalid.
             # TODO(zhwu): complete the list of exceptions.
         """
-        record = global_user_state.get_cluster_from_name(cluster_name)
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
         if record is None:
             handle_before_refresh = None
             status_before_refresh = None
@@ -5589,6 +5611,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 cluster_name,
                 force_refresh_statuses={status_lib.ClusterStatus.INIT},
                 acquire_per_cluster_status_lock=False,
+                include_user_info=False,
+                summary_response=True,
             )
             if record is not None:
                 prev_cluster_status = record['status']

skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl

Potentially problematic release.

skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250926__py3-none-any.whl