skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '1c94d0f001ed6519873a59a7b46681d64dd696d2'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250124'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))


sky/adaptors/common.py
CHANGED
@@ -1,6 +1,7 @@
 """Lazy import for modules to avoid import error when not used."""
 import functools
 import importlib
+import threading
 from typing import Any, Callable, Optional, Tuple


@@ -24,17 +25,22 @@ class LazyImport:
         self._module = None
         self._import_error_message = import_error_message
         self._set_loggers = set_loggers
+        self._lock = threading.RLock()

     def load_module(self):
-
-
-
-
-
-
-
-
-
+        # Avoid extra imports when multiple threads try to import the same
+        # module. The overhead is minor since import can only run in serial
+        # due to GIL even in multi-threaded environments.
+        with self._lock:
+            if self._module is None:
+                try:
+                    self._module = importlib.import_module(self._module_name)
+                    if self._set_loggers is not None:
+                        self._set_loggers()
+                except ImportError as e:
+                    if self._import_error_message is not None:
+                        raise ImportError(self._import_error_message) from e
+                    raise
         return self._module

     def __getattr__(self, name: str) -> Any:
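For reference, a minimal standalone sketch of the double-checked lazy-import pattern that the new `load_module` uses; the `_LazyModule` class and the `json` example below are illustrative stand-ins, not SkyPilot code:

import importlib
import threading
from typing import Any, Optional


class _LazyModule:
    """Illustrative proxy that imports the real module on first attribute access."""

    def __init__(self, name: str, import_error_message: Optional[str] = None):
        self._name = name
        self._error = import_error_message
        self._module = None
        self._lock = threading.RLock()

    def _load(self):
        # Double-checked locking: concurrent first accesses import the module
        # only once; later calls just return the cached module.
        with self._lock:
            if self._module is None:
                try:
                    self._module = importlib.import_module(self._name)
                except ImportError as e:
                    if self._error is not None:
                        raise ImportError(self._error) from e
                    raise
        return self._module

    def __getattr__(self, name: str) -> Any:
        return getattr(self._load(), name)


json = _LazyModule('json')         # nothing imported yet
print(json.dumps({'lazy': True}))  # first attribute access triggers the import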
sky/adaptors/do.py
ADDED
@@ -0,0 +1,20 @@
+"""Digital Ocean cloud adaptors"""
+
+# pylint: disable=import-outside-toplevel
+
+from sky.adaptors import common
+
+_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for DO. '
+                         'Try pip install "skypilot[do]"')
+pydo = common.LazyImport('pydo', import_error_message=_IMPORT_ERROR_MESSAGE)
+azure = common.LazyImport('azure', import_error_message=_IMPORT_ERROR_MESSAGE)
+_LAZY_MODULES = (pydo, azure)
+
+
+# `pydo`` inherits Azure exceptions. See:
+# https://github.com/digitalocean/pydo/blob/7b01498d99eb0d3a772366b642e5fab3d6fc6aa2/examples/poc_droplets_volumes_sshkeys.py#L6
+@common.load_lazy_modules(modules=_LAZY_MODULES)
+def exceptions():
+    """Azure exceptions."""
+    from azure.core import exceptions as azure_exceptions
+    return azure_exceptions
sky/adaptors/oci.py
CHANGED
@@ -1,9 +1,11 @@
 """Oracle OCI cloud adaptor"""

+import functools
 import logging
 import os

 from sky.adaptors import common
+from sky.clouds.utils import oci_utils

 # Suppress OCI circuit breaker logging before lazy import, because
 # oci modules prints additional message during imports, i.e., the
@@ -30,10 +32,16 @@ def get_config_file() -> str:

 def get_oci_config(region=None, profile='DEFAULT'):
     conf_file_path = get_config_file()
+    if not profile or profile == 'DEFAULT':
+        config_profile = oci_utils.oci_config.get_profile()
+    else:
+        config_profile = profile
+
     oci_config = oci.config.from_file(file_location=conf_file_path,
-                                      profile_name=
+                                      profile_name=config_profile)
     if region is not None:
         oci_config['region'] = region
+
     return oci_config


@@ -54,6 +62,29 @@ def get_identity_client(region=None, profile='DEFAULT'):
     return oci.identity.IdentityClient(get_oci_config(region, profile))


+def get_object_storage_client(region=None, profile='DEFAULT'):
+    return oci.object_storage.ObjectStorageClient(
+        get_oci_config(region, profile))
+
+
 def service_exception():
     """OCI service exception."""
     return oci.exceptions.ServiceError
+
+
+def with_oci_env(f):
+
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        # pylint: disable=line-too-long
+        enter_env_cmds = [
+            'conda info --envs | grep "sky-oci-cli-env" || conda create -n sky-oci-cli-env python=3.10 -y',
+            '. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true',
+            'conda activate sky-oci-cli-env', 'pip install oci-cli',
+            'export OCI_CLI_SUPPRESS_FILE_PERMISSIONS_WARNING=True'
+        ]
+        operation_cmd = [f(*args, **kwargs)]
+        leave_env_cmds = ['conda deactivate']
+        return ' && '.join(enter_env_cmds + operation_cmd + leave_env_cmds)
+
+    return wrapper
sky/authentication.py
CHANGED
@@ -408,14 +408,26 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     secret = k8s.client.V1Secret(
         metadata=k8s.client.V1ObjectMeta(**secret_metadata),
         string_data={secret_field_name: public_key})
-
-
-
-        secret_name
-
-
-
-
+    try:
+        if kubernetes_utils.check_secret_exists(secret_name, namespace,
+                                                context):
+            logger.debug(f'Key {secret_name} exists in the cluster, '
+                         'patching it...')
+            kubernetes.core_api(context).patch_namespaced_secret(
+                secret_name, namespace, secret)
+        else:
+            logger.debug(f'Key {secret_name} does not exist in the cluster, '
+                         'creating it...')
+            kubernetes.core_api(context).create_namespaced_secret(
+                namespace, secret)
+    except kubernetes.api_exception() as e:
+        if e.status == 409 and e.reason == 'AlreadyExists':
+            logger.debug(f'Key {secret_name} was created concurrently, '
+                         'patching it...')
+            kubernetes.core_api(context).patch_namespaced_secret(
+                secret_name, namespace, secret)
+        else:
+            raise e

     private_key_path, _ = get_or_generate_keys()
     if network_mode == nodeport_mode:
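The new block above follows a create-or-patch pattern for the SSH-key secret: check for the secret, patch if present, create if absent, and if a concurrent writer wins the create race, catch the 409 conflict and patch instead. A condensed sketch of the same idea with the plain `kubernetes` client (function and variable names here are illustrative):

from kubernetes import client
from kubernetes.client.rest import ApiException


def upsert_secret(api: client.CoreV1Api, namespace: str,
                  secret: client.V1Secret) -> None:
    """Create the secret, falling back to patch if it already exists."""
    try:
        api.create_namespaced_secret(namespace, secret)
    except ApiException as e:
        if e.status == 409:
            # 409 Conflict: another writer created it first; converge by patching.
            api.patch_namespaced_secret(secret.metadata.name, namespace, secret)
        else:
            raise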
sky/backends/backend_utils.py
CHANGED
@@ -650,6 +650,42 @@ def _replace_yaml_dicts(
     return common_utils.dump_yaml_str(new_config)


+def get_expirable_clouds(
+        enabled_clouds: Sequence[clouds.Cloud]) -> List[clouds.Cloud]:
+    """Returns a list of clouds that use local credentials and whose credentials can expire.
+
+    This function checks each cloud in the provided sequence to determine if it uses local credentials
+    and if its credentials can expire. If both conditions are met, the cloud is added to the list of
+    expirable clouds.
+
+    Args:
+        enabled_clouds (Sequence[clouds.Cloud]): A sequence of cloud objects to check.
+
+    Returns:
+        list[clouds.Cloud]: A list of cloud objects that use local credentials and whose credentials can expire.
+    """
+    expirable_clouds = []
+    local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
+    for cloud in enabled_clouds:
+        remote_identities = skypilot_config.get_nested(
+            (str(cloud).lower(), 'remote_identity'), None)
+        if remote_identities is None:
+            remote_identities = schemas.get_default_remote_identity(
+                str(cloud).lower())
+
+        local_credential_expiring = cloud.can_credential_expire()
+        if isinstance(remote_identities, str):
+            if remote_identities == local_credentials_value and local_credential_expiring:
+                expirable_clouds.append(cloud)
+        elif isinstance(remote_identities, list):
+            for profile in remote_identities:
+                if list(profile.values(
+                ))[0] == local_credentials_value and local_credential_expiring:
+                    expirable_clouds.append(cloud)
+                    break
+    return expirable_clouds
+
+
 # TODO: too many things happening here - leaky abstraction. Refactor.
 @timeline.event
 def write_cluster_config(
@@ -926,6 +962,13 @@ def write_cluster_config(
             tmp_yaml_path,
             cluster_config_overrides=to_provision.cluster_config_overrides)
         kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
+        yaml_obj = common_utils.read_yaml(tmp_yaml_path)
+        pod_config = yaml_obj['available_node_types']['ray_head_default'][
+            'node_config']
+        valid, message = kubernetes_utils.check_pod_config(pod_config)
+        if not valid:
+            raise exceptions.InvalidCloudConfigs(
+                f'Invalid pod_config. Details: {message}')

     if dryrun:
         # If dryrun, return the unfinished tmp yaml path.
@@ -1000,6 +1043,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
             clouds.Cudo,
             clouds.Paperspace,
             clouds.Azure,
+            clouds.DO,
         )):
         config = auth.configure_ssh_info(config)
     elif isinstance(cloud, clouds.GCP):
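For context on `get_expirable_clouds` above, the `remote_identity` setting it reads can be either a plain string or a list of single-key mappings; the helper flags a cloud as soon as any value is `LOCAL_CREDENTIALS` and the cloud reports `can_credential_expire()`. A small self-contained sketch of that value check (the config values below are made up):

def uses_local_credentials(remote_identity) -> bool:
    """Mirror of the string-vs-list handling in get_expirable_clouds."""
    local = 'LOCAL_CREDENTIALS'
    if isinstance(remote_identity, str):
        return remote_identity == local
    # List form: each entry is a single-key mapping; only the value matters here.
    return any(list(entry.values())[0] == local for entry in remote_identity)


assert uses_local_credentials('LOCAL_CREDENTIALS')
assert uses_local_credentials([{'my-cluster-*': 'LOCAL_CREDENTIALS'},
                               {'*': 'SERVICE_ACCOUNT'}])
assert not uses_local_credentials('SERVICE_ACCOUNT')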
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -26,6 +27,7 @@ import filelock

 import sky
 from sky import backends
+from sky import check as sky_check
 from sky import cloud_stores
 from sky import clouds
 from sky import exceptions
@@ -34,7 +36,6 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -44,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -154,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024

+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+

 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
@@ -178,6 +183,7 @@ def _get_cluster_config_template(cloud):
         clouds.SCP: 'scp-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
+        clouds.DO: 'do-ray.yml.j2',
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
@@ -1995,6 +2001,23 @@ class RetryingVmProvisioner(object):
             skip_unnecessary_provisioning else None)

         failover_history: List[Exception] = list()
+        resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
+        # If the user is using local credentials which may expire, the
+        # controller may leak resources if the credentials expire while a job
+        # is running. Here we check the enabled clouds and expiring credentials
+        # and raise a warning to the user.
+        if task.is_controller_task():
+            enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
+            expirable_clouds = backend_utils.get_expirable_clouds(
+                enabled_clouds)
+
+            if len(expirable_clouds) > 0:
+                warnings = (f'\033[93mWarning: Credentials used for '
+                            f'{expirable_clouds} may expire. Clusters may be '
+                            f'leaked if the credentials expire while jobs '
+                            f'are running. It is recommended to use credentials'
+                            f' that never expire or a service account.\033[0m')
+                logger.warning(warnings)

         # Retrying launchable resources.
         while True:
@@ -2070,6 +2093,8 @@ class RetryingVmProvisioner(object):
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2114,7 +2139,14 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
@@ -2877,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'the `--retry-until-up` flag.')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.ResourcesUnavailableError(
-                        error_message,
+                        error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
         if dryrun:
             record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3309,7 +3341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # even if some of them raise exceptions. We should replace it with
         # multi-process.
         rich_utils.stop_safe_status()
-        subprocess_utils.run_in_parallel(_setup_node, range(num_nodes))
+        subprocess_utils.run_in_parallel(_setup_node, list(range(num_nodes)))

         if detach_setup:
             # Only set this when setup needs to be run outside the self._setup()
@@ -3873,42 +3905,157 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             stdin=subprocess.DEVNULL,
         )

-    def 
-
-
-
+    def sync_down_managed_job_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int] = None,
+            job_name: Optional[str] = None,
+            controller: bool = False,
+            local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
+        """Sync down logs for a managed job.

         Args:
-            handle: The handle to the
-
-
-
-
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
+            handle: The handle to the cluster.
+            job_id: The job ID to sync down logs for.
+            job_name: The job name to sync down logs for.
+            controller: Whether to sync down logs for the controller.
+            local_dir: The local directory to sync down logs to.

-
-
+        Returns:
+            A dictionary mapping job_id to log path.
+        """
+        # if job_name and job_id should not both be specified
+        assert job_name is None or job_id is None, (job_name, job_id)

-
+        if job_id is None:
+            # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
+            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                job_name=job_name)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync down logs.',
+                                               stderr)
+            job_ids = common_utils.decode_payload(job_ids)
+            if not job_ids:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching job found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
+            elif len(job_ids) > 1:
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            'Downloading the latest job logs.'
+                            f'{colorama.Style.RESET_ALL}')
+            # list should aready be in descending order
+            job_id = job_ids[0]
+
+        # get the run_timestamp
+        # the function takes in [job_id]
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
+        returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
-            stream_logs=
-
-
-
-
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        subprocess_utils.handle_returncode(returncode, code,
+                                           'Failed to sync logs.', stderr)
+        # returns with a dict of {job_id: run_timestamp}
+        run_timestamps = common_utils.decode_payload(run_timestamps)
+        if not run_timestamps:
+            logger.info(f'{colorama.Fore.YELLOW}'
+                        'No matching log directories found'
+                        f'{colorama.Style.RESET_ALL}')
+            return {}
+
+        run_timestamp = list(run_timestamps.values())[0]
+        job_id = list(run_timestamps.keys())[0]
+        local_log_dir = ''
+        if controller:  # download controller logs
+            remote_log = os.path.join(
+                managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+                f'{job_id}.log')
+            local_log_dir = os.path.expanduser(
+                os.path.join(local_dir, run_timestamp))
+
+            logger.info(f'{colorama.Fore.CYAN}'
+                        f'Job {job_id} local logs: {local_log_dir}'
+                        f'{colorama.Style.RESET_ALL}')
+
+            runners = handle.get_command_runners()
+
+            def _rsync_down(args) -> None:
+                """Rsync down logs from remote nodes.
+
+                Args:
+                    args: A tuple of (runner, local_log_dir, remote_log_dir)
+                """
+                (runner, local_log_dir, remote_log) = args
+                try:
+                    os.makedirs(local_log_dir, exist_ok=True)
+                    runner.rsync(
+                        source=remote_log,
+                        target=f'{local_log_dir}/controller.log',
+                        up=False,
+                        stream_logs=False,
+                    )
+                except exceptions.CommandError as e:
+                    if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
+                        # Raised by rsync_down. Remote log dir may not exist
+                        # since the job can be run on some part of the nodes.
+                        logger.debug(
+                            f'{runner.node_id} does not have the tasks/*.')
+                    else:
+                        raise
+
+            parallel_args = [
+                (runner, local_log_dir, remote_log) for runner in runners
+            ]
+            subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
+        else:  # download job logs
+            local_log_dir = os.path.expanduser(
+                os.path.join(local_dir, 'managed_jobs', run_timestamp))
+            os.makedirs(os.path.dirname(local_log_dir), exist_ok=True)
+            log_file = os.path.join(local_log_dir, 'run.log')
+
+            code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
+                                                              job_id=job_id,
+                                                              follow=False,
+                                                              controller=False)
+
+            # With the stdin=subprocess.DEVNULL, the ctrl-c will not
+            # kill the process, so we need to handle it manually here.
+            if threading.current_thread() is threading.main_thread():
+                signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+                signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+            # We redirect the output to the log file
+            # and disable the STDOUT and STDERR
+            self.run_on_head(
+                handle,
+                code,
+                log_path=log_file,
+                stream_logs=False,
+                process_stream=False,
+                ssh_mode=command_runner.SshMode.INTERACTIVE,
+                stdin=subprocess.DEVNULL,
+            )
+
+            logger.info(f'{colorama.Fore.CYAN}'
+                        f'Job {job_id} logs: {local_log_dir}'
+                        f'{colorama.Style.RESET_ALL}')
+        return {str(job_id): local_log_dir}

     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
@@ -4198,11 +4345,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         attempts = 0
         while True:
             logger.debug(f'instance statuses attempt {attempts + 1}')
-
-
-
-
-
+            try:
+                node_status_dict = provision_lib.query_instances(
+                    repr(cloud),
+                    cluster_name_on_cloud,
+                    config['provider'],
+                    non_terminated_only=False)
+            except Exception as e:  # pylint: disable=broad-except
+                if purge:
+                    logger.warning(
+                        f'Failed to query instances. Skipping since purge is '
+                        f'set. Details: '
+                        f'{common_utils.format_exception(e, use_bracket=True)}')
+                    break
+                raise

             unexpected_node_state: Optional[Tuple[str, str]] = None
             for node_id, node_status in node_status_dict.items():
@@ -4221,8 +4377,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
             else:
                 (node_id, node_status) = unexpected_node_state
-
-
+                if purge:
+                    logger.warning(f'Instance {node_id} in unexpected '
+                                   f'state {node_status}. Skipping since purge '
+                                   'is set.')
+                    break
+                raise RuntimeError(f'Instance {node_id} in unexpected '
+                                   f'state {node_status}.')

         global_user_state.remove_cluster(handle.cluster_name,
                                          terminate=terminate)
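The failover summary added above renders one row per attempted resource with the last exception seen for it. A standalone sketch of that presentation using `prettytable` directly (whether `log_utils.create_table` is a thin wrapper around prettytable is an assumption here, and the resource/reason strings are invented):

import shutil

import prettytable


def format_failover_table(resource_exceptions: dict) -> str:
    # One row per (resource, reason), capped to the terminal width so the
    # table does not wrap awkwardly in narrow terminals.
    table = prettytable.PrettyTable(['Resource', 'Reason'])
    for resource, exception in resource_exceptions.items():
        table.add_row([resource, exception])
    table.max_table_width = shutil.get_terminal_size().columns
    return table.get_string()


print(format_failover_table({
    'AWS(p4d.24xlarge)': 'ResourcesUnavailableError: no capacity in us-east-1',
    'GCP(a2-ultragpu-8g)': 'QuotaExceeded: A100 quota is 0',
}))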
sky/backends/wheel_utils.py
CHANGED
@@ -153,7 +153,10 @@ def build_sky_wheel() -> Tuple[pathlib.Path, str]:
         if not path.exists():
             return -1.
         try:
-            return max(
+            return max(
+                os.path.getmtime(os.path.join(root, f))
+                for root, dirs, files in os.walk(path)
+                for f in (*dirs, *files))
         except ValueError:
             return -1.

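The wheel_utils change above broadens the freshness check to the newest mtime across every file and directory under the path; including directories matters because deleting or renaming a file bumps only the parent directory's mtime. A tiny standalone sketch of that scan (path handling simplified):

import os


def latest_mtime(path: str) -> float:
    """Newest modification time of anything under `path`, or -1.0 if empty/missing."""
    if not os.path.exists(path):
        return -1.0
    try:
        return max(
            os.path.getmtime(os.path.join(root, name))
            for root, dirs, files in os.walk(path)
            for name in (*dirs, *files))
    except ValueError:  # os.walk yielded no entries at all
        return -1.0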
sky/check.py
CHANGED
@@ -155,7 +155,8 @@ def check(
     # Pretty print for UX.
     if not quiet:
         enabled_clouds_str = '\n :heavy_check_mark: '.join(
-            [''] +
+            [''] +
+            [_format_enabled_cloud(c) for c in sorted(all_enabled_clouds)])
         rich.print('\n[green]:tada: Enabled clouds :tada:'
                    f'{enabled_clouds_str}[/green]')

@@ -222,3 +223,32 @@ def get_cloud_credential_file_mounts(
     r2_credential_mounts = cloudflare.get_credential_file_mounts()
     file_mounts.update(r2_credential_mounts)
     return file_mounts
+
+
+def _format_enabled_cloud(cloud_name: str) -> str:
+    if cloud_name == repr(sky_clouds.Kubernetes()):
+        # Get enabled contexts for Kubernetes
+        existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
+        if not existing_contexts:
+            return cloud_name
+
+        # Check if allowed_contexts is explicitly set in config
+        allowed_contexts = skypilot_config.get_nested(
+            ('kubernetes', 'allowed_contexts'), None)
+
+        # Format the context info with consistent styling
+        if allowed_contexts is not None:
+            contexts_formatted = []
+            for i, context in enumerate(existing_contexts):
+                # TODO: We should use ux_utils.INDENT_SYMBOL and
+                # INDENT_LAST_SYMBOL but, they are formatted for colorama, while
+                # here we are using rich. We should migrate this file to
+                # use colorama as we do in the rest of the codebase.
+                symbol = ('└── ' if i == len(existing_contexts) - 1 else '├── ')
+                contexts_formatted.append(f'\n {symbol}{context}')
+            context_info = f'Allowed contexts:{"".join(contexts_formatted)}'
+        else:
+            context_info = f'Active context: {existing_contexts[0]}'
+
+        return f'{cloud_name}[/green][dim]\n └── {context_info}[/dim][green]'
+    return cloud_name