skypilot-nightly 1.0.0.dev20241203__py3-none-any.whl → 1.0.0.dev20241205__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/backends/backend.py +42 -15
- sky/backends/backend_utils.py +143 -9
- sky/backends/cloud_vm_ray_backend.py +103 -25
- sky/backends/local_docker_backend.py +11 -7
- sky/cli.py +11 -2
- sky/clouds/service_catalog/common.py +2 -2
- sky/core.py +25 -18
- sky/exceptions.py +7 -0
- sky/execution.py +30 -11
- sky/global_user_state.py +23 -10
- sky/jobs/controller.py +28 -8
- sky/jobs/core.py +61 -35
- sky/jobs/recovery_strategy.py +2 -1
- sky/jobs/state.py +33 -1
- sky/jobs/utils.py +16 -2
- sky/setup_files/dependencies.py +141 -0
- sky/setup_files/setup.py +12 -124
- sky/skylet/constants.py +36 -11
- sky/skylet/log_lib.py +3 -1
- sky/skylet/log_lib.pyi +3 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -2
- sky/utils/common_utils.py +19 -0
- sky/utils/controller_utils.py +60 -98
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/METADATA +3 -2
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/RECORD +30 -29
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/top_level.txt +0 -0
sky/core.py
CHANGED
@@ -268,7 +268,8 @@ def _start(
     cluster_status, handle = backend_utils.refresh_cluster_status_handle(
         cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
     if not force and cluster_status == status_lib.ClusterStatus.UP:
         sky_logging.print(f'Cluster {cluster_name!r} is already up.')
         return handle
@@ -359,12 +360,13 @@ def start(
         Useful for upgrading SkyPilot runtime.

     Raises:
-        ValueError: argument values are invalid: (1)
-
-
-
-
-
+        ValueError: argument values are invalid: (1) if ``down`` is set to True
+            but ``idle_minutes_to_autostop`` is None; (2) if the specified
+            cluster is the managed jobs controller, and either
+            ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit
+            them to use the default autostop settings).
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the cluster to restart was
             launched using a non-default backend that does not support this
             operation.
@@ -412,7 +414,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
         related resources.

     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to stop the cluster.
         sky.exceptions.NotSupportedError: if the specified cluster is a spot
             cluster, or a TPU VM Pod cluster, or the managed jobs controller.
@@ -423,7 +426,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
                 f'is not supported.')
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')

     backend = backend_utils.get_backend_from_handle(handle)

@@ -467,14 +471,16 @@ def down(cluster_name: str, purge: bool = False) -> None:
         resources.

     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to tear down the cluster.
         sky.exceptions.NotSupportedError: the specified cluster is the managed
             jobs controller.
     """
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')

     usage_lib.record_cluster_name_for_current_operation(cluster_name)
     backend = backend_utils.get_backend_from_handle(handle)
@@ -521,7 +527,7 @@ def autostop(
         rather than autostop (restartable).

     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend or the cluster is TPU VM Pod.
@@ -615,7 +621,7 @@ def queue(cluster_name: str,
            }
        ]
    raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
        sky.exceptions.ClusterNotUpError: if the cluster is not UP.
        sky.exceptions.NotSupportedError: if the cluster is not based on
            CloudVmRayBackend.
@@ -674,7 +680,8 @@ def cancel(
         worker node is preempted in the spot cluster.

     Raises:
-        ValueError: if arguments are invalid
+        ValueError: if arguments are invalid.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
@@ -750,8 +757,8 @@ def tail_logs(cluster_name: str,
         Please refer to the sky.cli.tail_logs for the document.

     Raises:
-        ValueError: arguments are invalid or the cluster is not supported
-
+        ValueError: if arguments are invalid or the cluster is not supported.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -793,7 +800,7 @@ def download_logs(
     Returns:
         Dict[str, str]: a mapping of job_id to local log path.
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -838,7 +845,7 @@ def job_status(cluster_name: str,
         If job_ids is None and there is no job on the cluster, it will return
         {None: None}.
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
sky/exceptions.py
CHANGED
@@ -132,6 +132,13 @@ class ClusterSetUpError(Exception):
     pass


+class ClusterDoesNotExist(ValueError):
+    """Raise when trying to operate on a cluster that does not exist."""
+    # This extends ValueError for compatibility reasons - we used to throw
+    # ValueError instead of this.
+    pass
+
+
 class NotSupportedError(Exception):
     """Raised when a feature is not supported."""
     pass
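Because ClusterDoesNotExist subclasses ValueError, code written against older nightlies (which raised a plain ValueError for this condition) keeps working unchanged, while new code can catch the more specific type. A minimal caller-side sketch, assuming a hypothetical cluster name 'my-cluster':

    import sky
    from sky import exceptions

    try:
        sky.down('my-cluster')
    except exceptions.ClusterDoesNotExist:
        # Specific exception raised by 1.0.0.dev20241205 and later.
        print('Cluster my-cluster does not exist; nothing to tear down.')
    except ValueError as e:
        # Older nightlies raised a bare ValueError for the same condition.
        print(f'Invalid request: {e}')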
sky/execution.py
CHANGED
@@ -108,6 +108,7 @@ def _execute(
     idle_minutes_to_autostop: Optional[int] = None,
     no_setup: bool = False,
     clone_disk_from: Optional[str] = None,
+    skip_unnecessary_provisioning: bool = False,
     # Internal only:
     # pylint: disable=invalid-name
     _is_launched_by_jobs_controller: bool = False,
@@ -128,8 +129,9 @@ def _execute(
       Note that if errors occur during provisioning/data syncing/setting up,
       the cluster will not be torn down for debugging purposes.
     stream_logs: bool; whether to stream all tasks' outputs to the client.
-    handle: Optional[backends.ResourceHandle]; if provided, execution will
-      an existing backend cluster handle instead of
+    handle: Optional[backends.ResourceHandle]; if provided, execution will
+      attempt to use an existing backend cluster handle instead of
+      provisioning a new one.
     backend: Backend; backend to use for executing the tasks. Defaults to
       CloudVmRayBackend()
     retry_until_up: bool; whether to retry the provisioning until the cluster
@@ -150,6 +152,11 @@ def _execute(
     idle_minutes_to_autostop: int; if provided, the cluster will be set to
       autostop after this many minutes of idleness.
     no_setup: bool; whether to skip setup commands or not when (re-)launching.
+    clone_disk_from: Optional[str]; if set, clone the disk from the specified
+      cluster.
+    skip_unecessary_provisioning: bool; if True, compare the calculated
+      cluster config to the current cluster's config. If they match, shortcut
+      provisioning even if we have Stage.PROVISION.

   Returns:
     job_id: Optional[int]; the job ID of the submitted job. None if the
@@ -288,13 +295,18 @@ def _execute(

     try:
         if Stage.PROVISION in stages:
-
-            handle
-
-
-
-
-
+            assert handle is None or skip_unnecessary_provisioning, (
+                'Provisioning requested, but handle is already set. PROVISION '
+                'should be excluded from stages or '
+                'skip_unecessary_provisioning should be set. ')
+            handle = backend.provision(
+                task,
+                task.best_resources,
+                dryrun=dryrun,
+                stream_logs=stream_logs,
+                cluster_name=cluster_name,
+                retry_until_up=retry_until_up,
+                skip_unnecessary_provisioning=skip_unnecessary_provisioning)

         if handle is None:
             assert dryrun, ('If not dryrun, handle must be set or '
@@ -469,6 +481,7 @@ def launch(

     handle = None
     stages = None
+    skip_unnecessary_provisioning = False
     # Check if cluster exists and we are doing fast provisioning
     if fast and cluster_name is not None:
         cluster_status, maybe_handle = (
@@ -502,12 +515,16 @@ def launch(
         if cluster_status == status_lib.ClusterStatus.UP:
             handle = maybe_handle
             stages = [
+                # Provisioning will be short-circuited if the existing
+                # cluster config hash matches the calculated one.
+                Stage.PROVISION,
                 Stage.SYNC_WORKDIR,
                 Stage.SYNC_FILE_MOUNTS,
                 Stage.PRE_EXEC,
                 Stage.EXEC,
                 Stage.DOWN,
             ]
+            skip_unnecessary_provisioning = True

     return _execute(
         entrypoint=entrypoint,
@@ -525,6 +542,7 @@ def launch(
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
+        skip_unnecessary_provisioning=skip_unnecessary_provisioning,
         _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
         _is_launched_by_sky_serve_controller=
             _is_launched_by_sky_serve_controller,
@@ -581,8 +599,9 @@ def exec(  # pylint: disable=redefined-builtin
         submitted.

     Raises:
-        ValueError: if the specified cluster
-
+        ValueError: if the specified cluster is not in UP status.
+        sky.exceptions.ClusterDoesNotExist: if the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
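The user-visible effect of skip_unnecessary_provisioning is on the fast-launch path: with fast=True and an UP cluster, Stage.PROVISION now stays in the stage list but is short-circuited when the stored cluster config hash still matches the newly calculated one. A rough usage sketch, assuming a hypothetical task and cluster name:

    import sky

    task = sky.Task(run='echo hello')
    task.set_resources(sky.Resources(cpus='2+'))

    # The first call provisions the cluster; later calls with fast=True reuse
    # it and skip provisioning when the calculated config hash is unchanged.
    sky.launch(task, cluster_name='my-cluster', fast=True)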
sky/global_user_state.py
CHANGED
@@ -61,7 +61,8 @@ def create_table(cursor, conn):
         cluster_hash TEXT DEFAULT null,
         storage_mounts_metadata BLOB DEFAULT null,
         cluster_ever_up INTEGER DEFAULT 0,
-        status_updated_at INTEGER DEFAULT null
+        status_updated_at INTEGER DEFAULT null,
+        config_hash TEXT DEFAULT null)""")

     # Table for Cluster History
     # usage_intervals: List[Tuple[int, int]]
@@ -135,6 +136,9 @@ def create_table(cursor, conn):
     db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
                                  'INTEGER DEFAULT null')

+    db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
+                                 'TEXT DEFAULT null')
+
     conn.commit()


@@ -145,7 +149,8 @@ def add_or_update_cluster(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle',
                           requested_resources: Optional[Set[Any]],
                           ready: bool,
-                          is_launch: bool = True
+                          is_launch: bool = True,
+                          config_hash: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.

     Args:
@@ -197,7 +202,8 @@ def add_or_update_cluster(cluster_name: str,
         # specified.
         '(name, launched_at, handle, last_use, status, '
         'autostop, to_down, metadata, owner, cluster_hash, '
-        'storage_mounts_metadata, cluster_ever_up, status_updated_at
+        'storage_mounts_metadata, cluster_ever_up, status_updated_at, '
+        'config_hash) '
         'VALUES ('
         # name
         '?, '
@@ -236,7 +242,9 @@ def add_or_update_cluster(cluster_name: str,
         # cluster_ever_up
         '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
         # status_updated_at
-        '
+        '?,'
+        # config_hash
+        'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?))'
         ')',
         (
             # name
@@ -270,6 +278,9 @@ def add_or_update_cluster(cluster_name: str,
             int(ready),
             # status_updated_at
             status_updated_at,
+            # config_hash
+            config_hash,
+            cluster_name,
         ))

     launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -585,15 +596,15 @@ def get_cluster_from_name(
     rows = _DB.cursor.execute(
         'SELECT name, launched_at, handle, last_use, status, autostop, '
         'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at
-        (cluster_name,)).fetchall()
+        'cluster_ever_up, status_updated_at, config_hash '
+        'FROM clusters WHERE name=(?)', (cluster_name,)).fetchall()
     for row in rows:
         # Explicitly specify the number of fields to unpack, so that
         # we can add new fields to the database in the future without
         # breaking the previous code.
         (name, launched_at, handle, last_use, status, autostop, metadata,
          to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at) = row[:
+         status_updated_at, config_hash) = row[:14]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -610,6 +621,7 @@ def get_cluster_from_name(
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
             'status_updated_at': status_updated_at,
+            'config_hash': config_hash,
         }
         return record
     return None
@@ -619,13 +631,13 @@ def get_clusters() -> List[Dict[str, Any]]:
     rows = _DB.cursor.execute(
         'select name, launched_at, handle, last_use, status, autostop, '
         'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at
-        'order by launched_at desc').fetchall()
+        'cluster_ever_up, status_updated_at, config_hash '
+        'from clusters order by launched_at desc').fetchall()
     records = []
     for row in rows:
         (name, launched_at, handle, last_use, status, autostop, metadata,
          to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at) = row[:
+         status_updated_at, config_hash) = row[:14]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -642,6 +654,7 @@ def get_clusters() -> List[Dict[str, Any]]:
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
             'status_updated_at': status_updated_at,
+            'config_hash': config_hash,
         }

         records.append(record)
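The COALESCE in the upsert above preserves a previously stored config_hash whenever the caller passes None (e.g. non-launch status updates) and overwrites it only when a new hash is supplied. A self-contained sketch of that pattern on a simplified two-column table (not the real clusters schema):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE clusters (name TEXT PRIMARY KEY, config_hash TEXT)')
    conn.execute('INSERT INTO clusters VALUES (?, ?)', ('c1', 'hash-v1'))

    def upsert(name, config_hash):
        # COALESCE keeps the previously stored hash when the caller passes
        # None, mirroring the INSERT OR REPLACE in add_or_update_cluster().
        conn.execute(
            'INSERT OR REPLACE INTO clusters (name, config_hash) VALUES '
            '(?, COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?)))',
            (name, config_hash, name))

    upsert('c1', None)       # non-launch update: existing hash preserved
    print(conn.execute('SELECT config_hash FROM clusters WHERE name=?',
                       ('c1',)).fetchone())  # ('hash-v1',)
    upsert('c1', 'hash-v2')  # launch with a newly computed hash: overwritten
    print(conn.execute('SELECT config_hash FROM clusters WHERE name=?',
                       ('c1',)).fetchone())  # ('hash-v2',)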
sky/jobs/controller.py
CHANGED
@@ -6,7 +6,7 @@ import pathlib
 import time
 import traceback
 import typing
-from typing import Tuple
+from typing import Optional, Tuple

 import filelock

@@ -87,18 +87,28 @@ class JobsController:
         task.update_envs(task_envs)

     def _download_log_and_stream(
-
-
-
+        self, task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+    ) -> None:
+        """Downloads and streams the logs of the current job with given task ID.

         We do not stream the logs from the cluster directly, as the
         donwload and stream should be faster, and more robust against
         preemptions or ssh disconnection during the streaming.
         """
+        if handle is None:
+            logger.info(f'Cluster for job {self._job_id} is not found. '
+                        'Skipping downloading and streaming the logs.')
+            return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'managed_jobs')
-        controller_utils.download_and_stream_latest_job_log(
+        log_file = controller_utils.download_and_stream_latest_job_log(
             self._backend, handle, managed_job_logs_dir)
+        if log_file is not None:
+            # Set the path of the log file for the current task, so it can be
+            # accessed even after the job is finished
+            managed_job_state.set_local_log_file(self._job_id, task_id,
+                                                 log_file)
         logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
@@ -213,7 +223,8 @@ class JobsController:
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 end_time = managed_job_utils.get_job_timestamp(
                     self._backend, cluster_name, get_end_time=True)
-                # The job is done.
+                # The job is done. Set the job to SUCCEEDED first before start
+                # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
                                                 task_id,
                                                 end_time=end_time,
@@ -221,12 +232,21 @@ class JobsController:
                 logger.info(
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
+                clusters = backend_utils.get_clusters(
+                    cluster_names=[cluster_name],
+                    refresh=False,
+                    include_controller=False)
+                if clusters:
+                    assert len(clusters) == 1, (clusters, cluster_name)
+                    handle = clusters[0].get('handle')
+                    # Best effort to download and stream the logs.
+                    self._download_log_and_stream(task_id, handle)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
                 recovery_strategy.terminate_cluster(cluster_name=cluster_name)
                 return True

-            # For single-node jobs,
+            # For single-node jobs, non-terminated job_status indicates a
             # healthy cluster. We can safely continue monitoring.
             # For multi-node jobs, since the job may not be set to FAILED
             # immediately (depending on user program) when only some of the
@@ -278,7 +298,7 @@ class JobsController:
                     'The user job failed. Please check the logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')

-                self._download_log_and_stream(handle)
+                self._download_log_and_stream(task_id, handle)
                 managed_job_status = (
                     managed_job_state.ManagedJobStatus.FAILED)
                 if job_status == job_lib.JobStatus.FAILED_SETUP:
sky/jobs/core.py
CHANGED
@@ -1,6 +1,7 @@
 """SDK functions for managed jobs."""
 import os
 import tempfile
+import typing
 from typing import Any, Dict, List, Optional, Union
 import uuid

@@ -29,6 +30,9 @@ from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils

+if typing.TYPE_CHECKING:
+    from sky.backends import cloud_vm_ray_backend
+

 @timeline.event
 @usage_lib.entrypoint
@@ -225,6 +229,40 @@ def queue_from_kubernetes_pod(
     return jobs


+def _maybe_restart_controller(
+        refresh: bool, stopped_message: str, spinner_message: str
+) -> 'cloud_vm_ray_backend.CloudVmRayResourceHandle':
+    """Restart controller if refresh is True and it is stopped."""
+    jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
+    if refresh:
+        stopped_message = ''
+    try:
+        handle = backend_utils.is_controller_accessible(
+            controller=jobs_controller_type, stopped_message=stopped_message)
+    except exceptions.ClusterNotUpError as e:
+        if not refresh:
+            raise
+        handle = None
+        controller_status = e.cluster_status
+
+    if handle is not None:
+        return handle
+
+    sky_logging.print(f'{colorama.Fore.YELLOW}'
+                      f'Restarting {jobs_controller_type.value.name}...'
+                      f'{colorama.Style.RESET_ALL}')
+
+    rich_utils.force_update_status(
+        ux_utils.spinner_message(f'{spinner_message} - restarting '
+                                 'controller'))
+    handle = sky.start(jobs_controller_type.value.cluster_name)
+    controller_status = status_lib.ClusterStatus.UP
+    rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
+
+    assert handle is not None, (controller_status, refresh)
+    return handle
+
+
 @usage_lib.entrypoint
 def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -252,34 +290,11 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-
-
-
-
-
-        handle = backend_utils.is_controller_accessible(
-            controller=jobs_controller_type, stopped_message=stopped_message)
-    except exceptions.ClusterNotUpError as e:
-        if not refresh:
-            raise
-        handle = None
-        controller_status = e.cluster_status
-
-    if refresh and handle is None:
-        sky_logging.print(f'{colorama.Fore.YELLOW}'
-                          'Restarting controller for latest status...'
-                          f'{colorama.Style.RESET_ALL}')
-
-        rich_utils.force_update_status(
-            ux_utils.spinner_message('Checking managed jobs - restarting '
-                                     'controller'))
-        handle = sky.start(jobs_controller_type.value.cluster_name)
-        controller_status = status_lib.ClusterStatus.UP
-        rich_utils.force_update_status(
-            ux_utils.spinner_message('Checking managed jobs'))
-
-        assert handle is not None, (controller_status, refresh)
-
+    handle = _maybe_restart_controller(refresh,
+                                       stopped_message='No in-progress '
+                                       'managed jobs.',
+                                       spinner_message='Checking '
+                                       'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

@@ -371,7 +386,7 @@ def cancel(name: Optional[str] = None,

 @usage_lib.entrypoint
 def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool) -> None:
+              controller: bool, refresh: bool) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail logs of managed jobs.

@@ -382,15 +397,26 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
     """
     # TODO(zhwu): Automatically restart the jobs controller
+    if name is not None and job_id is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Cannot specify both name and job_id.')
+
     jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
-
-
+    job_name_or_id_str = ''
+    if job_id is not None:
+        job_name_or_id_str = str(job_id)
+    elif name is not None:
+        job_name_or_id_str = f'-n {name}'
+    else:
+        job_name_or_id_str = ''
+    handle = _maybe_restart_controller(
+        refresh,
         stopped_message=(
-            '
-            f'
+            f'{jobs_controller_type.value.name.capitalize()} is stopped. To '
+            f'get the logs, run: {colorama.Style.BRIGHT}sky jobs logs '
+            f'-r {job_name_or_id_str}{colorama.Style.RESET_ALL}'),
+        spinner_message='Retrieving job logs')

-    if name is not None and job_id is not None:
-        raise ValueError('Cannot specify both name and job_id.')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend

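On the caller side, the new refresh flag lets log retrieval transparently restart a stopped jobs controller instead of failing with ClusterNotUpError; the stopped-controller hint above points at `sky jobs logs -r`. A minimal sketch of the Python entrypoint, assuming a managed job with ID 1 exists:

    from sky import jobs as managed_jobs

    # refresh=False keeps the old behavior (raise ClusterNotUpError when the
    # controller is stopped); refresh=True restarts the controller first and
    # then tails the logs.
    managed_jobs.tail_logs(name=None, job_id=1, follow=True, controller=False,
                           refresh=True)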
sky/jobs/recovery_strategy.py
CHANGED
@@ -50,8 +50,9 @@ def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
             usage_lib.messages.usage.set_internal()
             sky.down(cluster_name)
             return
-        except
+        except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
+            logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
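Narrowing the except clause keeps teardown idempotent: a second attempt for a cluster that is already gone is treated as success instead of being retried as an error. A standalone sketch of the same pattern, with a hypothetical cluster name and retry policy:

    import time

    import sky
    from sky import exceptions

    def terminate_idempotently(cluster_name: str, max_retry: int = 3) -> None:
        for attempt in range(max_retry):
            try:
                sky.down(cluster_name)
                return
            except exceptions.ClusterDoesNotExist:
                # Already gone - nothing left to do.
                return
            except Exception:  # pylint: disable=broad-except
                # Transient failure (e.g. cloud API hiccup): back off and retry.
                time.sleep(2 ** attempt)
        raise RuntimeError(f'Failed to terminate cluster {cluster_name!r}.')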
sky/jobs/state.py
CHANGED
@@ -66,7 +66,8 @@ def create_table(cursor, conn):
         spot_job_id INTEGER,
         task_id INTEGER DEFAULT 0,
         task_name TEXT,
-        specs TEXT
+        specs TEXT,
+        local_log_file TEXT DEFAULT NULL)""")
     conn.commit()

     db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
@@ -103,6 +104,8 @@ def create_table(cursor, conn):
         value_to_replace_existing_entries=json.dumps({
             'max_restarts_on_errors': 0,
         }))
+    db_utils.add_column_to_table(cursor, conn, 'spot', 'local_log_file',
+                                 'TEXT DEFAULT NULL')

     # `job_info` contains the mapping from job_id to the job_name.
     # In the future, it may contain more information about each job.
@@ -157,6 +160,7 @@ columns = [
     'task_id',
     'task_name',
     'specs',
+    'local_log_file',
     # columns from the job_info table
     '_job_info_job_id',  # This should be the same as job_id
     'job_name',
@@ -512,6 +516,20 @@ def set_cancelled(job_id: int, callback_func: CallbackType):
         callback_func('CANCELLED')


+def set_local_log_file(job_id: int, task_id: Optional[int],
+                       local_log_file: str):
+    """Set the local log file for a job."""
+    filter_str = 'spot_job_id=(?)'
+    filter_args = [local_log_file, job_id]
+    if task_id is not None:
+        filter_str += ' AND task_id=(?)'
+        filter_args.append(task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            'UPDATE spot SET local_log_file=(?) '
+            f'WHERE {filter_str}', filter_args)
+
+
 # ======== utility functions ========
 def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
     """Get non-terminal job ids by name."""
@@ -662,3 +680,17 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
           WHERE spot_job_id=(?) AND task_id=(?)""",
         (job_id, task_id)).fetchone()
     return json.loads(task_specs[0])
+
+
+def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
+    """Get the local log directory for a job."""
+    filter_str = 'spot_job_id=(?)'
+    filter_args = [job_id]
+    if task_id is not None:
+        filter_str += ' AND task_id=(?)'
+        filter_args.append(task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        local_log_file = cursor.execute(
+            f'SELECT local_log_file FROM spot '
+            f'WHERE {filter_str}', filter_args).fetchone()
+    return local_log_file[-1] if local_log_file else None