skypilot-nightly 1.0.0.dev20250827__py3-none-any.whl → 1.0.0.dev20250829__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +11 -10
- sky/authentication.py +1 -1
- sky/backends/backend.py +3 -5
- sky/backends/backend_utils.py +140 -52
- sky/backends/cloud_vm_ray_backend.py +30 -25
- sky/backends/local_docker_backend.py +3 -8
- sky/backends/wheel_utils.py +35 -8
- sky/client/cli/command.py +41 -9
- sky/client/sdk.py +23 -8
- sky/client/sdk_async.py +6 -2
- sky/clouds/aws.py +118 -1
- sky/core.py +1 -4
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +82 -22
- sky/jobs/client/sdk.py +5 -2
- sky/jobs/recovery_strategy.py +9 -4
- sky/jobs/server/server.py +2 -1
- sky/logs/agent.py +2 -2
- sky/logs/aws.py +6 -3
- sky/provision/aws/config.py +78 -3
- sky/provision/aws/instance.py +45 -6
- sky/provision/do/utils.py +2 -1
- sky/provision/kubernetes/instance.py +55 -11
- sky/provision/kubernetes/utils.py +11 -2
- sky/provision/nebius/utils.py +36 -2
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/serve/client/impl.py +5 -4
- sky/serve/replica_managers.py +4 -3
- sky/serve/serve_utils.py +2 -2
- sky/serve/server/impl.py +3 -2
- sky/serve/server/server.py +2 -1
- sky/server/auth/oauth2_proxy.py +10 -4
- sky/server/common.py +4 -4
- sky/server/daemons.py +16 -5
- sky/server/requests/executor.py +5 -3
- sky/server/requests/payloads.py +3 -1
- sky/server/requests/preconditions.py +3 -2
- sky/server/requests/requests.py +121 -19
- sky/server/server.py +85 -60
- sky/server/stream_utils.py +7 -5
- sky/setup_files/dependencies.py +6 -1
- sky/sky_logging.py +28 -0
- sky/skylet/constants.py +6 -0
- sky/skylet/events.py +2 -3
- sky/skypilot_config.py +10 -10
- sky/task.py +1 -1
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +4 -8
- sky/usage/usage_lib.py +3 -2
- sky/utils/annotations.py +8 -2
- sky/utils/cluster_utils.py +3 -3
- sky/utils/common_utils.py +0 -72
- sky/utils/controller_utils.py +4 -3
- sky/utils/dag_utils.py +4 -4
- sky/utils/db/db_utils.py +11 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/kubernetes/config_map_utils.py +3 -3
- sky/utils/kubernetes_enums.py +1 -0
- sky/utils/lock_events.py +94 -0
- sky/utils/schemas.py +3 -0
- sky/utils/timeline.py +24 -93
- sky/utils/yaml_utils.py +77 -10
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/METADATA +8 -2
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/RECORD +86 -84
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{-eL7Ky3bxVivzeLHNB9U6 → hYJYFIxp_ZFONR4wTIJqZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250827.dist-info → skypilot_nightly-1.0.0.dev20250829.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -7,7 +7,7 @@ import urllib.request
 from sky.utils import directory_utils

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '27f74c78af59ef98180b59a30c43410e46e3ce37'


 def _get_git_commit():
@@ -37,7 +37,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250827'
+__version__ = '1.0.0.dev20250829'
 __root_dir__ = directory_utils.get_sky_dir()

sky/admin_policy.py
CHANGED
@@ -13,6 +13,7 @@ from sky.adaptors import common as adaptors_common
 from sky.utils import common_utils
 from sky.utils import config_utils
 from sky.utils import ux_utils
+from sky.utils import yaml_utils

 if typing.TYPE_CHECKING:
     import requests
@@ -80,9 +81,9 @@ class UserRequest:

     def encode(self) -> str:
         return _UserRequestBody(
-            task=
-            skypilot_config=
-
+            task=yaml_utils.dump_yaml_str(self.task.to_yaml_config()),
+            skypilot_config=yaml_utils.dump_yaml_str(dict(
+                self.skypilot_config)),
             request_options=self.request_options,
             at_client_side=self.at_client_side,
         ).model_dump_json()
@@ -92,9 +93,9 @@ class UserRequest:
         user_request_body = _UserRequestBody.model_validate_json(body)
         return cls(
             task=sky.Task.from_yaml_config(
-
+                yaml_utils.read_yaml_all_str(user_request_body.task)[0]),
             skypilot_config=config_utils.Config.from_dict(
-
+                yaml_utils.read_yaml_all_str(
                     user_request_body.skypilot_config)[0]),
             request_options=user_request_body.request_options,
             at_client_side=user_request_body.at_client_side,
@@ -116,9 +117,9 @@ class MutatedUserRequest:

     def encode(self) -> str:
         return _MutatedUserRequestBody(
-            task=
-            skypilot_config=
-
+            task=yaml_utils.dump_yaml_str(self.task.to_yaml_config()),
+            skypilot_config=yaml_utils.dump_yaml_str(dict(
+                self.skypilot_config),)).model_dump_json()

     @classmethod
     def decode(cls, mutated_user_request_body: str,
@@ -126,14 +127,14 @@ class MutatedUserRequest:
         mutated_user_request_body = _MutatedUserRequestBody.model_validate_json(
             mutated_user_request_body)
         task = sky.Task.from_yaml_config(
-
+            yaml_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
         # Some internal Task fields are not serialized. We need to manually
         # restore them from the original request.
         task.managed_job_dag = original_request.task.managed_job_dag
         task.service_name = original_request.task.service_name
         return cls(task=task,
                    skypilot_config=config_utils.Config.from_dict(
-
+                       yaml_utils.read_yaml_all_str(
                            mutated_user_request_body.skypilot_config)[0],))

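The edits in this file, and in the backend files below, move YAML serialization onto the new sky.utils.yaml_utils helpers (yaml_utils.py gains 77 lines in this release while common_utils.py loses 72, suggesting the helpers moved there). As a rough sketch of the round trip that encode()/decode() rely on, using only the two helpers visible in this diff; the example config dict is made up, not taken from the package:

# Round-trip sketch based on the helpers shown above; the config is
# illustrative only.
from sky.utils import yaml_utils

config = {'name': 'example-task', 'resources': {'cpus': 4}}

# Encode path: dict -> YAML string.
encoded = yaml_utils.dump_yaml_str(config)

# Decode path: YAML string -> list of documents; the code above always
# takes the first document with [0].
decoded = yaml_utils.read_yaml_all_str(encoded)[0]
assert decoded == config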
sky/authentication.py
CHANGED
@@ -198,7 +198,7 @@ def configure_ssh_info(config: Dict[str, Any]) -> Dict[str, Any]:
     _, public_key_path = get_or_generate_keys()
     with open(public_key_path, 'r', encoding='utf-8') as f:
         public_key = f.read().strip()
-    config_str =
+    config_str = yaml_utils.dump_yaml_str(config)
     config_str = config_str.replace('skypilot:ssh_user',
                                     config['auth']['ssh_user'])
     config_str = config_str.replace('skypilot:ssh_public_key_content',
sky/backends/backend.py
CHANGED
@@ -147,9 +147,8 @@ class Backend(Generic[_ResourceHandleType]):
     def teardown(self,
                  handle: _ResourceHandleType,
                  terminate: bool,
-                 purge: bool = False
-
-        self._teardown(handle, terminate, purge, explicitly_requested)
+                 purge: bool = False) -> None:
+        self._teardown(handle, terminate, purge)

     def register_info(self, **kwargs) -> None:
         """Register backend-specific information."""
@@ -201,6 +200,5 @@ class Backend(Generic[_ResourceHandleType]):
     def _teardown(self,
                   handle: _ResourceHandleType,
                   terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
         raise NotImplementedError
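Across the backends, the explicitly_requested flag is removed from the teardown path, so teardown() now forwards only handle, terminate, and purge to _teardown(). A minimal sketch of the resulting contract for a subclass; MyBackend and MyHandle are hypothetical names, not part of the package:

# Hypothetical subclass illustrating the simplified _teardown() signature
# after this change; MyHandle/MyBackend are made-up names.
class MyHandle:
    cluster_name = 'example-cluster'


class MyBackend:

    def teardown(self, handle: MyHandle, terminate: bool,
                 purge: bool = False) -> None:
        # The base class now forwards exactly these three arguments.
        self._teardown(handle, terminate, purge)

    def _teardown(self, handle: MyHandle, terminate: bool,
                  purge: bool = False) -> None:
        # A real backend would stop (terminate=False) or delete
        # (terminate=True) the cluster's resources here; purge asks it to
        # proceed despite errors, e.g. the identity-mismatch case mentioned
        # in the cloud_vm_ray_backend hunks below.
        print(f'Tearing down {handle.cluster_name} '
              f'(terminate={terminate}, purge={purge})')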
sky/backends/backend_utils.py
CHANGED
@@ -241,7 +241,7 @@ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
         subprocess.CalledProcessError: If the file mounts are failed to be
             copied.
     """
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)

     file_mounts = yaml_config.get('file_mounts', {})
     # Remove the file mounts added by the newline.
@@ -325,7 +325,7 @@ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
         shell=True,
         check=True)

-
+    yaml_utils.dump_yaml(tmp_yaml_path, yaml_config)


 def path_size_megabytes(path: str) -> int:
@@ -510,7 +510,7 @@ def _replace_yaml_dicts(
         for key in exclude_restore_key_name[:-1]:
             curr = curr[key]
         curr[exclude_restore_key_name[-1]] = value
-    return
+    return yaml_utils.dump_yaml_str(new_config)


 def get_expirable_clouds(
@@ -937,7 +937,7 @@ def write_cluster_config(
             tmp_yaml_path,
             cluster_config_overrides=cluster_config_overrides,
             context=region.name)
-        yaml_obj =
+        yaml_obj = yaml_utils.read_yaml(tmp_yaml_path)
         pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
             'ray_head_default']['node_config']

@@ -976,7 +976,7 @@ def write_cluster_config(
     # Read the cluster name from the tmp yaml file, to take the backward
     # compatbility restortion above into account.
     # TODO: remove this after 2 minor releases, 0.10.0.
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']

     # Make sure to do this before we optimize file mounts. Optimization is
@@ -1022,7 +1022,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):

     This function's output removes comments included in the jinja2 template.
     """
-    config =
+    config = yaml_utils.read_yaml(tmp_yaml_path)
     # Check the availability of the cloud type.
     if isinstance(cloud, (
             clouds.AWS,
@@ -1054,7 +1054,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
         config = auth.setup_hyperbolic_authentication(config)
     else:
         assert False, cloud
-
+    yaml_utils.dump_yaml(tmp_yaml_path, config)


 def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
@@ -1156,7 +1156,7 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     """

     # Load the yaml contents so that we can directly remove keys.
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
         dict_to_remove_from = yaml_config
         found_key = True
@@ -1175,7 +1175,7 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     config_hash = hashlib.sha256()

     yaml_hash = hashlib.sha256(
-
+        yaml_utils.dump_yaml_str(yaml_config).encode('utf-8'))
     config_hash.update(yaml_hash.digest())

     file_mounts = yaml_config.get('file_mounts', {})
@@ -1409,6 +1409,62 @@ def ssh_credential_from_yaml(
     return credentials


+def ssh_credentials_from_handles(
+    handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
+) -> List[Dict[str, Any]]:
+    """Returns ssh_user, ssh_private_key and ssh_control name.
+    """
+    non_empty_cluster_yaml_paths = [
+        handle.cluster_yaml
+        for handle in handles
+        if handle.cluster_yaml is not None
+    ]
+    cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
+        non_empty_cluster_yaml_paths)
+    cluster_yaml_dicts_to_index = {
+        cluster_yaml_path: cluster_yaml_dict
+        for cluster_yaml_path, cluster_yaml_dict in zip(
+            non_empty_cluster_yaml_paths, cluster_yaml_dicts)
+    }
+
+    credentials_to_return: List[Dict[str, Any]] = []
+    for handle in handles:
+        if handle.cluster_yaml is None:
+            credentials_to_return.append(dict())
+            continue
+        ssh_user = handle.ssh_user
+        docker_user = handle.docker_user
+        config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
+        auth_section = config['auth']
+        if ssh_user is None:
+            ssh_user = auth_section['ssh_user'].strip()
+        ssh_private_key_path = auth_section.get('ssh_private_key')
+        ssh_control_name = config.get('cluster_name', '__default__')
+        ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+        # Update the ssh_user placeholder in proxy command, if required
+        if (ssh_proxy_command is not None and
+                constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+            ssh_proxy_command = ssh_proxy_command.replace(
+                constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
+        credentials = {
+            'ssh_user': ssh_user,
+            'ssh_private_key': ssh_private_key_path,
+            'ssh_control_name': ssh_control_name,
+            'ssh_proxy_command': ssh_proxy_command,
+        }
+        if docker_user is not None:
+            credentials['docker_user'] = docker_user
+        ssh_provider_module = config['provider']['module']
+        # If we are running ssh command on kubernetes node.
+        if 'kubernetes' in ssh_provider_module:
+            credentials['disable_control_master'] = True
+        credentials_to_return.append(credentials)
+
+    return credentials_to_return
+
+
 def parallel_data_transfer_to_nodes(
         runners: List[command_runner.CommandRunner],
         source: Optional[str],
@@ -2027,9 +2083,7 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             'Cluster has no YAML file. Removing the cluster from cache.',
             global_user_state.ClusterEventType.STATUS_CHANGE,
             nop_if_duplicate=True)
-        global_user_state.remove_cluster(cluster_name,
-                                         terminate=True,
-                                         remove_events=True)
+        global_user_state.remove_cluster(cluster_name, terminate=True)
         logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
                      'Removing the cluster from cache.')
         return None
@@ -2058,7 +2112,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                                f'{output}\n', stderr)
         return (*_count_healthy_nodes_from_ray(output), output, stderr)

+    ray_status_details: Optional[str] = None
+
     def run_ray_status_to_check_ray_cluster_healthy() -> bool:
+        nonlocal ray_status_details
         try:
             # NOTE: fetching the IPs is very slow as it calls into
             # `ray get head-ip/worker-ips`. Using cached IPs is safe because
@@ -2136,19 +2193,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                     # showing up
                     time.sleep(1)

+            ray_status_details = (
+                f'{ready_head + ready_workers}/{total_nodes} ready')
             raise RuntimeError(
                 f'Refreshing status ({cluster_name!r}): ray status not showing '
                 f'all nodes ({ready_head + ready_workers}/'
                 f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')

         except exceptions.FetchClusterInfoError:
+            ray_status_details = 'failed to get IPs'
             logger.debug(
                 f'Refreshing status ({cluster_name!r}) failed to get IPs.')
         except RuntimeError as e:
+            if ray_status_details is None:
+                ray_status_details = str(e)
             logger.debug(common_utils.format_exception(e))
         except Exception as e:  # pylint: disable=broad-except
             # This can be raised by `external_ssh_ports()`, due to the
             # underlying call to kubernetes API.
+            ray_status_details = str(e)
             logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
                          exc_info=e)
             return False
@@ -2261,6 +2324,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
     some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+    # If all nodes are up and ray cluster is health, we would have returned
+    # earlier. So if all_nodes_up is True and we are here, it means the ray
+    # cluster must have been unhealthy.
+    ray_cluster_unhealthy = all_nodes_up
     some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
                                  for status in node_statuses)
     is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
@@ -2271,8 +2338,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:

         if some_nodes_terminated:
             init_reason = 'one or more nodes terminated'
+        elif ray_cluster_unhealthy:
+            init_reason = f'ray cluster is unhealthy ({ray_status_details})'
         elif some_nodes_not_stopped:
-            init_reason = 'some
+            init_reason = 'some but not all nodes are stopped'
         logger.debug('The cluster is abnormal. Setting to INIT status. '
                      f'node_statuses: {node_statuses}')
         if record['autostop'] >= 0:
@@ -2367,7 +2436,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             # Some status reason clears after a certain time (e.g. k8s events
             # are only stored for an hour by default), so it is possible that
             # the previous event has a status reason, but now it does not.
-            init_reason_regex = f'^Cluster is abnormal because
+            init_reason_regex = (f'^Cluster is abnormal because '
+                                 f'{re.escape(init_reason)}.*')
             log_message = f'Cluster is abnormal because {init_reason}'
             if status_reason:
                 log_message += f' ({status_reason})'
@@ -2387,10 +2457,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         return global_user_state.get_cluster_from_name(cluster_name)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
+    verb = 'terminated' if to_terminate else 'stopped'
     backend = backends.CloudVmRayBackend()
     global_user_state.add_cluster_event(
-        cluster_name,
-
+        cluster_name,
+        None,
+        f'All nodes {verb}, cleaning up the cluster.',
+        global_user_state.ClusterEventType.STATUS_CHANGE,
+        # This won't do anything for a terminated cluster, but it's needed for a
+        # stopped cluster.
+        nop_if_duplicate=True,
+    )
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
     return global_user_state.get_cluster_from_name(cluster_name)

@@ -2918,44 +2995,57 @@ def get_clusters(
         logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
         records = new_records

-    def
-
+    def _update_records_with_credentials_and_resources_str(
+            records: List[Optional[Dict[str, Any]]]) -> None:
         """Add the credentials to the record.

         This is useful for the client side to setup the ssh config of the
         cluster.
         """
-
-
-
-
-
-
-        handle
-
-
-
-
-
-
-
+        records_with_handle = []
+
+        # only act on records that have a handle
+        for record in records:
+            if record is None:
+                continue
+            handle = record['handle']
+            if handle is None:
+                continue
+            record[
+                'resources_str'] = resources_utils.get_readable_resources_repr(
+                    handle, simplify=True)
+            record[
+                'resources_str_full'] = resources_utils.get_readable_resources_repr(
+                    handle, simplify=False)
+            records_with_handle.append(record)
+        if len(records_with_handle) == 0:
             return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        handles = [record['handle'] for record in records_with_handle]
+        credentials = ssh_credentials_from_handles(handles)
+        cached_private_keys: Dict[str, str] = {}
+        for record, credential in zip(records_with_handle, credentials):
+            if not credential:
+                continue
+            ssh_private_key_path = credential.get('ssh_private_key', None)
+            if ssh_private_key_path is not None:
+                expanded_private_key_path = os.path.expanduser(
+                    ssh_private_key_path)
+                if not os.path.exists(expanded_private_key_path):
+                    auth.create_ssh_key_files_from_db(ssh_private_key_path)
+            else:
+                private_key_path, _ = auth.get_or_generate_keys()
+                expanded_private_key_path = os.path.expanduser(private_key_path)
+            if expanded_private_key_path in cached_private_keys:
+                credential['ssh_private_key_content'] = cached_private_keys[
+                    expanded_private_key_path]
+            else:
+                with open(expanded_private_key_path, 'r',
+                          encoding='utf-8') as f:
+                    credential['ssh_private_key_content'] = f.read()
+                cached_private_keys[expanded_private_key_path] = credential[
+                    'ssh_private_key_content']
+            record['credentials'] = credential

     def _update_records_with_resources(
             records: List[Optional[Dict[str, Any]]]) -> None:
@@ -2982,9 +3072,7 @@ def get_clusters(
             if handle.launched_resources.accelerators else None)

     # Add auth_config to the records
-
-    _update_record_with_credentials_and_resources_str(record)
-
+    _update_records_with_credentials_and_resources_str(records)
     if refresh == common.StatusRefreshMode.NONE:
         # Add resources to the records
         _update_records_with_resources(records)
@@ -3024,7 +3112,7 @@ def get_clusters(
                 cluster_name,
                 force_refresh_statuses=force_refresh_statuses,
                 acquire_per_cluster_status_lock=True)
-
+            _update_records_with_credentials_and_resources_str([record])
         except (exceptions.ClusterStatusFetchingError,
                 exceptions.CloudUserIdentityError,
                 exceptions.ClusterOwnerIdentityMismatchError) as e:
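The get_clusters() changes above swap a per-record credential helper for a batched one: the new ssh_credentials_from_handles() reads all cluster YAMLs with one global_user_state call and returns one credentials dict per handle, in order, with an empty dict for handles that have no cluster YAML. A rough usage sketch of the batched shape; the records list is illustrative and error handling is omitted:

# Sketch of consuming the batched helper added above. `records` stands in
# for the cluster records that get_clusters() builds.
from sky.backends import backend_utils


def attach_credentials(records):
    # Keep only records that actually have a resource handle.
    records_with_handle = [
        r for r in records if r is not None and r.get('handle') is not None
    ]
    handles = [r['handle'] for r in records_with_handle]
    # One call covers every handle; handles without a cluster YAML come
    # back as empty dicts.
    credentials = backend_utils.ssh_credentials_from_handles(handles)
    for record, credential in zip(records_with_handle, credentials):
        if credential:
            record['credentials'] = credential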
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -65,6 +65,7 @@ from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import directory_utils
 from sky.utils import env_options
+from sky.utils import lock_events
 from sky.utils import locks
 from sky.utils import log_utils
 from sky.utils import message_utils
@@ -1972,7 +1973,7 @@ class RetryingVmProvisioner(object):
             ray_config = global_user_state.get_cluster_yaml_dict(
                 cluster_config_file)
             ray_config['upscaling_speed'] = 0
-
+            yaml_utils.dump_yaml(cluster_config_file, ray_config)
             start = time.time()
             returncode, stdout, stderr = ray_up()
             logger.debug(
@@ -2498,7 +2499,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.stable_internal_external_ips = stable_internal_external_ips

     @context_utils.cancellation_guard
-
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
@@ -2854,7 +2860,12 @@ class LocalResourcesHandle(CloudVmRayResourceHandle):
         self.is_grpc_enabled = False

     @context_utils.cancellation_guard
-
+    # we expect different request to be acting on different clusters
+    # (= different handles) so we have no real expectation of cache hit
+    # across requests.
+    # Do not change this cache to global scope
+    # without understanding https://github.com/skypilot-org/skypilot/pull/6908
+    @annotations.lru_cache(scope='request', maxsize=10)
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
@@ -3112,7 +3123,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         retry_until_up: bool = False,
         skip_unnecessary_provisioning: bool = False,
     ) -> Tuple[Optional[CloudVmRayResourceHandle], bool]:
-        with
+        with lock_events.DistributedLockEvent(lock_id, _CLUSTER_LOCK_TIMEOUT):
+            # Reset spinner message to remove any mention of being blocked
+            # by other requests.
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Launching'))
+
             # Try to launch the exiting cluster first. If no existing
             # cluster, this function will create a to_provision_config
             # with required resources.
@@ -3208,8 +3224,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     global_user_state.ClusterEventType.STATUS_CHANGE,
                     nop_if_duplicate=True)
                 global_user_state.remove_cluster(cluster_name,
-                                                 terminate=True
-                                                 remove_events=False)
+                                                 terminate=True)
                 usage_lib.messages.usage.update_final_cluster_status(
                     None)
                 logger.error(
@@ -4011,8 +4026,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _teardown(self,
                   handle: CloudVmRayResourceHandle,
                   terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
         """Tear down or stop the cluster.

         Args:
@@ -4087,8 +4101,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         # ClusterOwnerIdentityMismatchError. The argument/flag
                         # `purge` should bypass such ID mismatch errors.
                         refresh_cluster_status=(
-                            not is_identity_mismatch_and_purge)
-                            explicitly_requested=explicitly_requested)
+                            not is_identity_mismatch_and_purge))
                     if terminate:
                         lock.force_unlock()
                     break
@@ -4477,8 +4490,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                           purge: bool = False,
                           post_teardown_cleanup: bool = True,
                           refresh_cluster_status: bool = True,
-                          remove_from_db: bool = True
-                          explicitly_requested: bool = False) -> None:
+                          remove_from_db: bool = True) -> None:
         """Teardown the cluster without acquiring the cluster status lock.

         NOTE: This method should not be called without holding the cluster
@@ -4542,8 +4554,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                            f'provision yaml so it '
                            'has not been provisioned. Skipped.')
             global_user_state.remove_cluster(handle.cluster_name,
-                                             terminate=terminate
-                                             remove_events=False)
+                                             terminate=terminate)
             return
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
@@ -4600,12 +4611,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 raise

         if post_teardown_cleanup:
-            self.post_teardown_cleanup(
-
-                terminate,
-                purge,
-                remove_from_db,
-                explicitly_requested=explicitly_requested)
+            self.post_teardown_cleanup(handle, terminate, purge,
+                                       remove_from_db)
             return

         if (isinstance(cloud, clouds.IBM) and terminate and
@@ -4649,7 +4656,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                          prefix='sky_',
                                          delete=False,
                                          suffix='.yml') as f:
-
+            yaml_utils.dump_yaml(f.name, config)
             f.flush()

             teardown_verb = 'Terminating' if terminate else 'Stopping'
@@ -4705,8 +4712,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                               terminate: bool,
                               purge: bool = False,
                               remove_from_db: bool = True,
-                              failover: bool = False
-                              explicitly_requested: bool = False) -> None:
+                              failover: bool = False) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.

         This method will handle the following cleanup steps:
@@ -4884,8 +4890,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):

         if not terminate or remove_from_db:
             global_user_state.remove_cluster(handle.cluster_name,
-                                             terminate=terminate
-                                             remove_events=explicitly_requested)
+                                             terminate=terminate)

     def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
         """Remove the YAML config of a cluster."""
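Two patterns recur in the cloud_vm_ray_backend hunks above: get_command_runners() gains a request-scoped LRU cache, and cluster provisioning is wrapped in a lock_events.DistributedLockEvent context manager from the new sky/utils/lock_events.py. A hedged sketch of the locking pattern as it appears above; the lock id and timeout here are placeholder assumptions, the real code passes its own lock_id and _CLUSTER_LOCK_TIMEOUT:

# Sketch of the per-cluster provisioning lock shown above; lock naming and
# timeout are placeholder values, not taken from the package.
from sky.utils import lock_events

LOCK_TIMEOUT_SECONDS = 20  # placeholder value


def provision_with_lock(cluster_name: str) -> None:
    lock_id = f'cluster-{cluster_name}'  # placeholder naming scheme
    # Used as a context manager: the body runs only once the lock is held,
    # which is why the code above resets the spinner message ("blocked by
    # other requests") right after entering the block.
    with lock_events.DistributedLockEvent(lock_id, LOCK_TIMEOUT_SECONDS):
        # ... provisioning work runs while the lock is held ...
        pass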
sky/backends/local_docker_backend.py
CHANGED
@@ -256,9 +256,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
                 logger.error(
                     'Unable to run container - nvidia runtime for docker not '
                     'found. Have you installed nvidia-docker on your machine?')
-                global_user_state.remove_cluster(cluster_name,
-                                                 terminate=True,
-                                                 remove_events=False)
+                global_user_state.remove_cluster(cluster_name, terminate=True)
                 raise e
         self.containers[handle] = container
         logger.info(
@@ -325,8 +323,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
     def _teardown(self,
                   handle: LocalDockerResourceHandle,
                   terminate: bool,
-                  purge: bool = False
-                  explicitly_requested: bool = False):
+                  purge: bool = False):
         """Teardown kills the container."""
         del purge  # Unused.
         if not terminate:
@@ -342,9 +339,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
             container.remove(force=True)
         cluster_name = handle.get_cluster_name()

-        global_user_state.remove_cluster(cluster_name,
-                                         terminate=True,
-                                         remove_events=explicitly_requested)
+        global_user_state.remove_cluster(cluster_name, terminate=True)

     # --- Utilities ---
