skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -6,18 +6,18 @@ ManagedJobCodeGen.
  """
  import collections
  import enum
- import inspect
  import os
  import pathlib
  import shlex
- import shutil
  import textwrap
  import time
+ import traceback
  import typing
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
  import colorama
  import filelock
+ import psutil
  from typing_extensions import Literal
 
  from sky import backends
@@ -26,14 +26,18 @@ from sky import global_user_state
  from sky import sky_logging
  from sky.backends import backend_utils
  from sky.jobs import constants as managed_job_constants
+ from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
  from sky.skylet import constants
  from sky.skylet import job_lib
  from sky.skylet import log_lib
+ from sky.usage import usage_lib
  from sky.utils import common_utils
  from sky.utils import log_utils
+ from sky.utils import message_utils
  from sky.utils import rich_utils
  from sky.utils import subprocess_utils
+ from sky.utils import ux_utils
 
  if typing.TYPE_CHECKING:
  import sky
@@ -41,14 +45,7 @@ if typing.TYPE_CHECKING:
 
  logger = sky_logging.init_logger(__name__)
 
- # Add user hash so that two users don't have the same controller VM on
- # shared-account clouds such as GCP.
- JOB_CONTROLLER_NAME: str = (
- f'sky-jobs-controller-{common_utils.get_user_hash()}')
- LEGACY_JOB_CONTROLLER_NAME: str = (
- f'sky-spot-controller-{common_utils.get_user_hash()}')
  SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
- LEGACY_SIGNAL_FILE_PREFIX = '/tmp/sky_spot_controller_signal_{}'
  # Controller checks its job's status every this many seconds.
  JOB_STATUS_CHECK_GAP_SECONDS = 20
 
@@ -57,17 +54,21 @@ JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
  _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
- _JOB_WAITING_STATUS_MESSAGE = ('[bold cyan]Waiting for the task to start'
- '{status_str}.[/] It may take a few minutes.')
+ _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
+ 'Waiting for task to start[/]'
+ '{status_str}. It may take a few minutes.\n'
+ ' [dim]View controller logs: sky jobs logs --controller {job_id}')
  _JOB_CANCELLED_MESSAGE = (
- '[bold cyan]Waiting for the task status to be updated.'
- '[/] It may take a minute.')
+ ux_utils.spinner_message('Waiting for task status to be updated.') +
+ ' It may take a minute.')
 
  # The maximum time to wait for the managed job status to transition to terminal
  # state, after the job finished. This is a safeguard to avoid the case where
  # the managed job status fails to be updated and keep the `sky jobs logs`
- # blocking for a long time.
- _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
+ # blocking for a long time. This should be significantly longer than the
+ # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
+ # update the state.
+ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
 
 
  class UserSignal(enum.Enum):
@@ -78,11 +79,50 @@ class UserSignal(enum.Enum):
 
 
  # ====== internal functions ======
+ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+ """Terminate the cluster."""
+ from sky import core # pylint: disable=import-outside-toplevel
+ retry_cnt = 0
+ # In some cases, e.g. botocore.exceptions.NoCredentialsError due to AWS
+ # metadata service throttling, the failed sky.down attempt can take 10-11
+ # seconds. In this case, we need the backoff to significantly reduce the
+ # rate of requests - that is, significantly increase the time between
+ # requests. We set the initial backoff to 15 seconds, so that once it grows
+ # exponentially it will quickly dominate the 10-11 seconds that we already
+ # see between requests. We set the max backoff very high, since it's
+ # generally much more important to eventually succeed than to fail fast.
+ backoff = common_utils.Backoff(
+ initial_backoff=15,
+ # 1.6 ** 5 = 10.48576 < 20, so we won't hit this with default max_retry
+ max_backoff_factor=20)
+ while True:
+ try:
+ usage_lib.messages.usage.set_internal()
+ core.down(cluster_name)
+ return
+ except exceptions.ClusterDoesNotExist:
+ # The cluster is already down.
+ logger.debug(f'The cluster {cluster_name} is already down.')
+ return
+ except Exception as e: # pylint: disable=broad-except
+ retry_cnt += 1
+ if retry_cnt >= max_retry:
+ raise RuntimeError(
+ f'Failed to terminate the cluster {cluster_name}.') from e
+ logger.error(
+ f'Failed to terminate the cluster {cluster_name}. Retrying.'
+ f'Details: {common_utils.format_exception(e)}')
+ with ux_utils.enable_traceback():
+ logger.error(f' Traceback: {traceback.format_exc()}')
+ time.sleep(backoff.current_backoff())
+
+
  def get_job_status(backend: 'backends.CloudVmRayBackend',
  cluster_name: str) -> Optional['job_lib.JobStatus']:
  """Check the status of the job running on a managed job cluster.
 
- It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_SETUP or CANCELLED.
+ It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
+ FAILED_SETUP or CANCELLED.
  """
  handle = global_user_state.get_handle_from_cluster_name(cluster_name)
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
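The new terminate_cluster helper above retries sky.down with exponential backoff rather than a fixed delay. A minimal sketch of that retry pattern follows; the Backoff class here is a hypothetical stand-in for common_utils.Backoff (only its constructor arguments and current_backoff() method appear in this diff), assuming a growth factor of roughly 1.6 as the inline comment implies and a cap of initial_backoff * max_backoff_factor.

    import time


    class Backoff:
        """Hypothetical stand-in: the delay grows ~1.6x per call and is
        capped at initial_backoff * max_backoff_factor (assumed semantics)."""

        def __init__(self, initial_backoff: float = 15,
                     max_backoff_factor: int = 20,
                     multiplier: float = 1.6):
            self._delay = initial_backoff
            self._cap = initial_backoff * max_backoff_factor
            self._multiplier = multiplier

        def current_backoff(self) -> float:
            delay = min(self._delay, self._cap)
            self._delay *= self._multiplier
            return delay


    def retry_with_backoff(operation, max_retry: int = 6):
        """Keep calling `operation` until it succeeds or max_retry attempts fail."""
        backoff = Backoff(initial_backoff=15, max_backoff_factor=20)
        for attempt in range(max_retry):
            try:
                return operation()
            except Exception as e:  # pylint: disable=broad-except
                if attempt == max_retry - 1:
                    raise RuntimeError('Operation failed after retries.') from e
                # Sleep longer after each failure so a slow, throttled API is
                # not hammered with rapid retries.
                time.sleep(backoff.current_backoff())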
@@ -101,57 +141,222 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
  return status
 
 
- def update_managed_job_status(job_id: Optional[int] = None):
- """Update managed job status if the controller job failed abnormally.
+ def _controller_process_alive(pid: int, job_id: int) -> bool:
+ """Check if the controller process is alive."""
+ try:
+ process = psutil.Process(pid)
+ # The last two args of the command line should be --job-id <id>
+ job_args = process.cmdline()[-2:]
+ return process.is_running() and job_args == ['--job-id', str(job_id)]
+ except psutil.NoSuchProcess:
+ return False
+
+
+ def update_managed_jobs_statuses(job_id: Optional[int] = None):
+ """Update managed job status if the controller process failed abnormally.
+
+ Check the status of the controller process. If it is not running, it must
+ have exited abnormally, and we should set the job status to
+ FAILED_CONTROLLER. `end_at` will be set to the current timestamp for the job
+ when above happens, which could be not accurate based on the frequency this
+ function is called.
 
- Check the status of the controller job. If it is not running, it must have
- exited abnormally, and we should set the job status to FAILED_CONTROLLER.
- `end_at` will be set to the current timestamp for the job when above
- happens, which could be not accurate based on the frequency this function
- is called.
+ Note: we expect that job_id, if provided, refers to a nonterminal job or a
+ job that has not completed its cleanup (schedule state not DONE).
  """
- if job_id is None:
- job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
- else:
- job_ids = [job_id]
- for job_id_ in job_ids:
- controller_status = job_lib.get_status(job_id_)
+
+ def _cleanup_job_clusters(job_id: int) -> Optional[str]:
+ """Clean up clusters for a job. Returns error message if any.
+
+ This function should not throw any exception. If it fails, it will
+ capture the error message, and log/return it.
+ """
+ error_msg = None
+ tasks = managed_job_state.get_managed_jobs(job_id)
+ for task in tasks:
+ task_name = task['job_name']
+ cluster_name = generate_managed_job_cluster_name(task_name, job_id)
+ handle = global_user_state.get_handle_from_cluster_name(
+ cluster_name)
+ if handle is not None:
+ try:
+ terminate_cluster(cluster_name)
+ except Exception as e: # pylint: disable=broad-except
+ error_msg = (
+ f'Failed to terminate cluster {cluster_name}: '
+ f'{common_utils.format_exception(e, use_bracket=True)}')
+ logger.exception(error_msg, exc_info=e)
+ return error_msg
+
+ # For backwards compatible jobs
+ # TODO(cooperc): Remove before 0.11.0.
+ def _handle_legacy_job(job_id: int):
+ controller_status = job_lib.get_status(job_id)
  if controller_status is None or controller_status.is_terminal():
- logger.error(f'Controller for job {job_id_} has exited abnormally. '
- 'Setting the job status to FAILED_CONTROLLER.')
- tasks = managed_job_state.get_managed_jobs(job_id_)
- for task in tasks:
- task_name = task['job_name']
- # Tear down the abnormal cluster to avoid resource leakage.
- cluster_name = generate_managed_job_cluster_name(
- task_name, job_id_)
- handle = global_user_state.get_handle_from_cluster_name(
- cluster_name)
- if handle is not None:
- backend = backend_utils.get_backend_from_handle(handle)
- max_retry = 3
- for retry_cnt in range(max_retry):
- try:
- backend.teardown(handle, terminate=True)
- break
- except RuntimeError:
- logger.error('Failed to tear down the cluster '
- f'{cluster_name!r}. Retrying '
- f'[{retry_cnt}/{max_retry}].')
-
- # The controller job for this managed job is not running: it must
- # have exited abnormally, and we should set the job status to
- # FAILED_CONTROLLER.
- # The `set_failed` will only update the task's status if the
- # status is non-terminal.
+ logger.error(f'Controller process for legacy job {job_id} is '
+ 'in an unexpected state.')
+
+ cleanup_error = _cleanup_job_clusters(job_id)
+ if cleanup_error:
+ # Unconditionally set the job to failed_controller if the
+ # cleanup fails.
+ managed_job_state.set_failed(
+ job_id,
+ task_id=None,
+ failure_type=managed_job_state.ManagedJobStatus.
+ FAILED_CONTROLLER,
+ failure_reason=
+ 'Legacy controller process has exited abnormally, and '
+ f'cleanup failed: {cleanup_error}. For more details, run: '
+ f'sky jobs logs --controller {job_id}',
+ override_terminal=True)
+ return
+
+ # It's possible for the job to have transitioned to
+ # another terminal state while between when we checked its
+ # state and now. In that case, set_failed won't do
+ # anything, which is fine.
  managed_job_state.set_failed(
- job_id_,
+ job_id,
  task_id=None,
  failure_type=managed_job_state.ManagedJobStatus.
  FAILED_CONTROLLER,
- failure_reason=
- 'Controller process has exited abnormally. For more details,'
- f' run: sky jobs logs --controller {job_id_}')
+ failure_reason=(
+ 'Legacy controller process has exited abnormally. For '
+ f'more details, run: sky jobs logs --controller {job_id}'))
+
+ # Get jobs that need checking (non-terminal or not DONE)
+ job_ids = managed_job_state.get_jobs_to_check_status(job_id)
+ if not job_ids:
+ # job_id is already terminal, or if job_id is None, there are no jobs
+ # that need to be checked.
+ return
+
+ for job_id in job_ids:
+ tasks = managed_job_state.get_managed_jobs(job_id)
+ # Note: controller_pid and schedule_state are in the job_info table
+ # which is joined to the spot table, so all tasks with the same job_id
+ # will have the same value for these columns. This is what lets us just
+ # take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
+ schedule_state = tasks[0]['schedule_state']
+
+ # Backwards compatibility: this job was submitted when ray was still
+ # used for managing the parallelism of job controllers, before #4485.
+ # TODO(cooperc): Remove before 0.11.0.
+ if (schedule_state is
+ managed_job_state.ManagedJobScheduleState.INVALID):
+ _handle_legacy_job(job_id)
+ continue
+
+ # Handle jobs with schedule state (non-legacy jobs):
+ pid = tasks[0]['controller_pid']
+ if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
+ # There are two cases where we could get a job that is DONE.
+ # 1. At query time (get_jobs_to_check_status), the job was not yet
+ # DONE, but since then (before get_managed_jobs is called) it has
+ # hit a terminal status, marked itself done, and exited. This is
+ # fine.
+ # 2. The job is DONE, but in a non-terminal status. This is
+ # unexpected. For instance, the task status is RUNNING, but the
+ # job schedule_state is DONE.
+ if all(task['status'].is_terminal() for task in tasks):
+ # Turns out this job is fine, even though it got pulled by
+ # get_jobs_to_check_status. Probably case #1 above.
+ continue
+
+ logger.error(f'Job {job_id} has DONE schedule state, but some '
+ f'tasks are not terminal. Task statuses: '
+ f'{", ".join(task["status"].value for task in tasks)}')
+ failure_reason = ('Inconsistent internal job state. This is a bug.')
+ elif pid is None:
+ # Non-legacy job and controller process has not yet started.
+ controller_status = job_lib.get_status(job_id)
+ if controller_status == job_lib.JobStatus.FAILED_SETUP:
+ # We should fail the case where the controller status is
+ # FAILED_SETUP, as it is due to the failure of dependency setup
+ # on the controller.
+ # TODO(cooperc): We should also handle the case where controller
+ # status is FAILED_DRIVER or FAILED.
+ logger.error('Failed to setup the cloud dependencies for '
+ 'the managed job.')
+ elif (schedule_state in [
+ managed_job_state.ManagedJobScheduleState.INACTIVE,
+ managed_job_state.ManagedJobScheduleState.WAITING,
+ ]):
+ # It is expected that the controller hasn't been started yet.
+ continue
+ elif (schedule_state ==
+ managed_job_state.ManagedJobScheduleState.LAUNCHING):
+ # This is unlikely but technically possible. There's a brief
+ # period between marking job as scheduled (LAUNCHING) and
+ # actually launching the controller process and writing the pid
+ # back to the table.
+ # TODO(cooperc): Find a way to detect if we get stuck in this
+ # state.
+ logger.info(f'Job {job_id} is in {schedule_state.value} state, '
+ 'but controller process hasn\'t started yet.')
+ continue
+
+ logger.error(f'Expected to find a controller pid for state '
+ f'{schedule_state.value} but found none.')
+ failure_reason = f'No controller pid set for {schedule_state.value}'
+ else:
+ logger.debug(f'Checking controller pid {pid}')
+ if _controller_process_alive(pid, job_id):
+ # The controller is still running, so this job is fine.
+ continue
+
+ # Double check job is not already DONE before marking as failed, to
+ # avoid the race where the controller marked itself as DONE and
+ # exited between the state check and the pid check. Since the job
+ # controller process will mark itself DONE _before_ exiting, if it
+ # has exited and it's still not DONE now, it is abnormal.
+ if (managed_job_state.get_job_schedule_state(job_id) ==
+ managed_job_state.ManagedJobScheduleState.DONE):
+ # Never mind, the job is DONE now. This is fine.
+ continue
+
+ logger.error(f'Controller process for {job_id} seems to be dead.')
+ failure_reason = 'Controller process is dead'
+
+ # At this point, either pid is None or process is dead.
+
+ # The controller process for this managed job is not running: it must
+ # have exited abnormally, and we should set the job status to
+ # FAILED_CONTROLLER.
+ logger.error(f'Controller process for job {job_id} has exited '
+ 'abnormally. Setting the job status to FAILED_CONTROLLER.')
+
+ # Cleanup clusters and capture any errors.
+ cleanup_error = _cleanup_job_clusters(job_id)
+ cleanup_error_msg = ''
+ if cleanup_error:
+ cleanup_error_msg = f'Also, cleanup failed: {cleanup_error}. '
+
+ # Set all tasks to FAILED_CONTROLLER, regardless of current status.
+ # This may change a job from SUCCEEDED or another terminal state to
+ # FAILED_CONTROLLER. This is what we want - we are sure that this
+ # controller process crashed, so we want to capture that even if the
+ # underlying job succeeded.
+ # Note: 2+ invocations of update_managed_jobs_statuses could be running
+ # at the same time, so this could override the FAILED_CONTROLLER status
+ # set by another invocation of update_managed_jobs_statuses. That should
+ # be okay. The only difference could be that one process failed to clean
+ # up the cluster while the other succeeds. No matter which
+ # failure_reason ends up in the database, the outcome is acceptable.
+ # We assume that no other code path outside the controller process will
+ # update the job status.
+ managed_job_state.set_failed(
+ job_id,
+ task_id=None,
+ failure_type=managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
+ failure_reason=
+ f'Controller process has exited abnormally ({failure_reason}). '
+ f'{cleanup_error_msg}'
+ f'For more details, run: sky jobs logs --controller {job_id}',
+ override_terminal=True)
+
+ scheduler.job_done(job_id, idempotent=True)
 
 
  def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
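The _controller_process_alive check introduced above inspects the process command line rather than only the PID, because a PID can be recycled by an unrelated process after the controller exits. A standalone illustration of the same psutil pattern (not part of the package, just a sketch) with the '--job-id <id>' suffix treated as the identifying marker:

    from typing import List

    import psutil


    def pid_belongs_to_job(pid: int, job_id: int) -> bool:
        """Return True only if `pid` is running and its command line ends with
        the expected job arguments; a bare existence check could be fooled by
        a recycled PID belonging to some other process."""
        expected_suffix: List[str] = ['--job-id', str(job_id)]
        try:
            proc = psutil.Process(pid)
            return proc.is_running() and proc.cmdline()[-2:] == expected_suffix
        except psutil.NoSuchProcess:
            return False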
@@ -167,10 +372,32 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
  subprocess_utils.handle_returncode(returncode, code,
  'Failed to get job time.',
  stdout + stderr)
- stdout = common_utils.decode_payload(stdout)
+ stdout = message_utils.decode_payload(stdout)
  return float(stdout)
 
 
+ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
+ cluster_name: str) -> float:
+ """Try to get the end time of the job.
+
+ If the job is preempted or we can't connect to the instance for whatever
+ reason, fall back to the current time.
+ """
+ try:
+ return get_job_timestamp(backend, cluster_name, get_end_time=True)
+ except exceptions.CommandError as e:
+ if e.returncode == 255:
+ # Failed to connect - probably the instance was preempted since the
+ # job completed. We shouldn't crash here, so just log and use the
+ # current time.
+ logger.info(f'Failed to connect to the instance {cluster_name} '
+ 'since the job completed. Assuming the instance '
+ 'was preempted.')
+ return time.time()
+ else:
+ raise
+
+
  def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
  """Run event callback for the task."""
 
@@ -222,19 +449,21 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
  return f'{cluster_name}-{job_id}'
 
 
- def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
+ def cancel_jobs_by_id(job_ids: Optional[List[int]],
+ all_users: bool = False) -> str:
  """Cancel jobs by id.
 
  If job_ids is None, cancel all jobs.
  """
  if job_ids is None:
- job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
+ job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
+ None, all_users)
  job_ids = list(set(job_ids))
- if len(job_ids) == 0:
+ if not job_ids:
  return 'No job to cancel.'
  job_id_str = ', '.join(map(str, job_ids))
  logger.info(f'Cancelling jobs {job_id_str}.')
- cancelled_job_ids = []
+ cancelled_job_ids: List[int] = []
  for job_id in job_ids:
  # Check the status of the managed job status. If it is in
  # terminal state, we can safely skip it.
@@ -247,24 +476,19 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
  f'{job_status.value}. Skipped.')
  continue
 
- update_managed_job_status(job_id)
+ update_managed_jobs_statuses(job_id)
 
  # Send the signal to the jobs controller.
  signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
- legacy_signal_file = pathlib.Path(
- LEGACY_SIGNAL_FILE_PREFIX.format(job_id))
  # Filelock is needed to prevent race condition between signal
  # check/removal and signal writing.
  with filelock.FileLock(str(signal_file) + '.lock'):
  with signal_file.open('w', encoding='utf-8') as f:
  f.write(UserSignal.CANCEL.value)
  f.flush()
- # Backward compatibility for managed jobs launched before #3419. It
- # can be removed in the future 0.8.0 release.
- shutil.copy(str(signal_file), str(legacy_signal_file))
  cancelled_job_ids.append(job_id)
 
- if len(cancelled_job_ids) == 0:
+ if not cancelled_job_ids:
  return 'No job to cancel.'
  identity_str = f'Job with ID {cancelled_job_ids[0]} is'
  if len(cancelled_job_ids) > 1:
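The cancel path above writes a CANCEL signal file while holding a file lock, so the controller's check-and-remove of the signal cannot race with the write. The consumer side is not shown in this diff; a hypothetical sketch of what a lock-protected read-and-clear counterpart could look like, reusing the same lock path convention:

    import pathlib
    from typing import Optional

    import filelock

    SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'


    def read_and_clear_signal(job_id: int) -> Optional[str]:
        """Hypothetical consumer side: atomically read and remove a pending
        signal under the same lock the writer holds, so a concurrent cancel
        cannot slip in between the read and the unlink."""
        signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
        with filelock.FileLock(str(signal_file) + '.lock'):
            if not signal_file.exists():
                return None
            signal = signal_file.read_text(encoding='utf-8').strip()
            signal_file.unlink()
            return signal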
@@ -277,7 +501,7 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
  def cancel_job_by_name(job_name: str) -> str:
  """Cancel a job by name."""
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
- if len(job_ids) == 0:
+ if not job_ids:
  return f'No running job found with name {job_name!r}.'
  if len(job_ids) > 1:
  return (f'{colorama.Fore.RED}Multiple running jobs found '
@@ -289,52 +513,57 @@ def cancel_job_by_name(job_name: str) -> str:
 
  def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  """Stream logs by job id."""
- controller_status = job_lib.get_status(job_id)
- status_msg = ('[bold cyan]Waiting for controller process to be RUNNING'
- '{status_str}[/].')
- status_display = rich_utils.safe_status(status_msg.format(status_str=''))
+
+ def should_keep_logging(status: managed_job_state.ManagedJobStatus) -> bool:
+ # If we see CANCELLING, just exit - we could miss some job logs but the
+ # job will be terminated momentarily anyway so we don't really care.
+ return (not status.is_terminal() and
+ status != managed_job_state.ManagedJobStatus.CANCELLING)
+
+ msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
+ status_display = rich_utils.safe_status(msg)
  num_tasks = managed_job_state.get_num_tasks(job_id)
 
  with status_display:
- prev_msg = None
- while (controller_status != job_lib.JobStatus.RUNNING and
- (controller_status is None or
- not controller_status.is_terminal())):
- status_str = 'None'
- if controller_status is not None:
- status_str = controller_status.value
- msg = status_msg.format(status_str=f' (status: {status_str})')
- if msg != prev_msg:
- status_display.update(msg)
- prev_msg = msg
- time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
- controller_status = job_lib.get_status(job_id)
-
- msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='')
- status_display.update(msg)
  prev_msg = msg
- managed_job_status = managed_job_state.get_status(job_id)
- while managed_job_status is None:
+ while (managed_job_status :=
+ managed_job_state.get_status(job_id)) is None:
  time.sleep(1)
- managed_job_status = managed_job_state.get_status(job_id)
 
- if managed_job_status.is_terminal():
+ if not should_keep_logging(managed_job_status):
  job_msg = ''
  if managed_job_status.is_failed():
  job_msg = ('\nFailure reason: '
  f'{managed_job_state.get_failure_reason(job_id)}')
+ log_file = managed_job_state.get_local_log_file(job_id, None)
+ if log_file is not None:
+ with open(os.path.expanduser(log_file), 'r',
+ encoding='utf-8') as f:
+ # Stream the logs to the console without reading the whole
+ # file into memory.
+ start_streaming = False
+ for line in f:
+ if log_lib.LOG_FILE_START_STREAMING_AT in line:
+ start_streaming = True
+ if start_streaming:
+ print(line, end='', flush=True)
+ return ''
  return (f'{colorama.Fore.YELLOW}'
  f'Job {job_id} is already in terminal state '
- f'{managed_job_status.value}. Logs will not be shown.'
- f'{colorama.Style.RESET_ALL}{job_msg}')
+ f'{managed_job_status.value}. For more details, run: '
+ f'sky jobs logs --controller {job_id}'
+ f'{colorama.Style.RESET_ALL}'
+ f'{job_msg}')
  backend = backends.CloudVmRayBackend()
  task_id, managed_job_status = (
  managed_job_state.get_latest_task_id_status(job_id))
 
- # task_id and managed_job_status can be None if the controller process
- # just started and the managed job status has not set to PENDING yet.
- while (managed_job_status is None or
- not managed_job_status.is_terminal()):
+ # We wait for managed_job_status to be not None above. Once we see that
+ # it's not None, we don't expect it to every become None again.
+ assert managed_job_status is not None, (job_id, task_id,
+ managed_job_status)
+
+ while should_keep_logging(managed_job_status):
  handle = None
  if task_id is not None:
  task_name = managed_job_state.get_task_name(job_id, task_id)
@@ -356,15 +585,19 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  logger.debug(
  f'INFO: The log is not ready yet{status_str}. '
  f'Waiting for {JOB_STATUS_CHECK_GAP_SECONDS} seconds.')
- msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str)
+ msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str=status_str,
+ job_id=job_id)
  if msg != prev_msg:
  status_display.update(msg)
  prev_msg = msg
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
  task_id, managed_job_status = (
  managed_job_state.get_latest_task_id_status(job_id))
+ assert managed_job_status is not None, (job_id, task_id,
+ managed_job_status)
  continue
- assert managed_job_status is not None
+ assert (managed_job_status ==
+ managed_job_state.ManagedJobStatus.RUNNING)
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
  status_display.stop()
  returncode = backend.tail_logs(handle,
@@ -379,29 +612,76 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  job_statuses = backend.get_job_status(handle, stream_logs=False)
  job_status = list(job_statuses.values())[0]
  assert job_status is not None, 'No job found.'
+ assert task_id is not None, job_id
+
  if job_status != job_lib.JobStatus.CANCELLED:
- assert task_id is not None, job_id
- if task_id < num_tasks - 1 and follow:
- # The log for the current job is finished. We need to
- # wait until next job to be started.
- logger.debug(
- f'INFO: Log for the current task ({task_id}) '
- 'is finished. Waiting for the next task\'s log '
- 'to be started.')
- status_display.update('Waiting for the next task: '
- f'{task_id + 1}.')
+ if not follow:
+ break
+
+ # Logs for retrying failed tasks.
+ if (job_status
+ in job_lib.JobStatus.user_code_failure_states()):
+ task_specs = managed_job_state.get_task_specs(
+ job_id, task_id)
+ if task_specs.get('max_restarts_on_errors', 0) == 0:
+ # We don't need to wait for the managed job status
+ # update, as the job is guaranteed to be in terminal
+ # state afterwards.
+ break
+ print()
+ status_display.update(
+ ux_utils.spinner_message(
+ 'Waiting for next restart for the failed task'))
  status_display.start()
- original_task_id = task_id
- while True:
- task_id, managed_job_status = (
- managed_job_state.get_latest_task_id_status(
- job_id))
- if original_task_id != task_id:
- break
+
+ def is_managed_job_status_updated(
+ status: Optional[managed_job_state.ManagedJobStatus]
+ ) -> bool:
+ """Check if local managed job status reflects remote
+ job failure.
+
+ Ensures synchronization between remote cluster
+ failure detection (JobStatus.FAILED) and controller
+ retry logic.
+ """
+ return (status !=
+ managed_job_state.ManagedJobStatus.RUNNING)
+
+ while not is_managed_job_status_updated(
+ managed_job_status :=
+ managed_job_state.get_status(job_id)):
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+ assert managed_job_status is not None, (
+ job_id, managed_job_status)
  continue
- else:
+
+ if task_id == num_tasks - 1:
  break
+
+ # The log for the current job is finished. We need to
+ # wait until next job to be started.
+ logger.debug(
+ f'INFO: Log for the current task ({task_id}) '
+ 'is finished. Waiting for the next task\'s log '
+ 'to be started.')
+ # Add a newline to avoid the status display below
+ # removing the last line of the task output.
+ print()
+ status_display.update(
+ ux_utils.spinner_message(
+ f'Waiting for the next task: {task_id + 1}'))
+ status_display.start()
+ original_task_id = task_id
+ while True:
+ task_id, managed_job_status = (
+ managed_job_state.get_latest_task_id_status(job_id))
+ if original_task_id != task_id:
+ break
+ time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+ assert managed_job_status is not None, (job_id, task_id,
+ managed_job_status)
+ continue
+
  # The job can be cancelled by the user or the controller (when
  # the cluster is partially preempted).
  logger.debug(
@@ -415,7 +695,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  # state.
  managed_job_status = managed_job_state.get_status(job_id)
  assert managed_job_status is not None, job_id
- if managed_job_status.is_terminal():
+ if not should_keep_logging(managed_job_status):
  break
  logger.info(f'{colorama.Fore.YELLOW}The job cluster is preempted '
  f'or failed.{colorama.Style.RESET_ALL}')
@@ -430,6 +710,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  # managed job state is updated.
  time.sleep(3 * JOB_STATUS_CHECK_GAP_SECONDS)
  managed_job_status = managed_job_state.get_status(job_id)
+ assert managed_job_status is not None, (job_id, managed_job_status)
 
  # The managed_job_status may not be in terminal status yet, since the
  # controller has not updated the managed job state yet. We wait for a while,
@@ -437,15 +718,16 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  wait_seconds = 0
  managed_job_status = managed_job_state.get_status(job_id)
  assert managed_job_status is not None, job_id
- while (not managed_job_status.is_terminal() and follow and
+ while (should_keep_logging(managed_job_status) and follow and
  wait_seconds < _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS):
  time.sleep(1)
  wait_seconds += 1
  managed_job_status = managed_job_state.get_status(job_id)
  assert managed_job_status is not None, job_id
 
- logger.info(f'Logs finished for job {job_id} '
- f'(status: {managed_job_status.value}).')
+ logger.info(
+ ux_utils.finishing_message(f'Managed job finished: {job_id} '
+ f'(status: {managed_job_status.value}).'))
  return ''
 
 
@@ -458,6 +740,7 @@ def stream_logs(job_id: Optional[int],
  job_id = managed_job_state.get_latest_job_id()
  if job_id is None:
  return 'No managed job found.'
+
  if controller:
  if job_id is None:
  assert job_name is not None
@@ -465,32 +748,99 @@ def stream_logs(job_id: Optional[int],
  # We manually filter the jobs by name, instead of using
  # get_nonterminal_job_ids_by_name, as with `controller=True`, we
  # should be able to show the logs for jobs in terminal states.
- managed_jobs = list(
- filter(lambda job: job['job_name'] == job_name, managed_jobs))
- if len(managed_jobs) == 0:
+ managed_job_ids: Set[int] = {
+ job['job_id']
+ for job in managed_jobs
+ if job['job_name'] == job_name
+ }
+ if not managed_job_ids:
  return f'No managed job found with name {job_name!r}.'
- if len(managed_jobs) > 1:
- job_ids_str = ', '.join(job['job_id'] for job in managed_jobs)
- raise ValueError(
- f'Multiple managed jobs found with name {job_name!r} (Job '
- f'IDs: {job_ids_str}). Please specify the job_id instead.')
- job_id = managed_jobs[0]['job_id']
+ if len(managed_job_ids) > 1:
+ job_ids_str = ', '.join(
+ str(job_id) for job_id in managed_job_ids)
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(
+ f'Multiple managed jobs found with name {job_name!r} '
+ f'(Job IDs: {job_ids_str}). Please specify the job_id '
+ 'instead.')
+ job_id = managed_job_ids.pop()
  assert job_id is not None, (job_id, job_name)
- # TODO: keep the following code sync with
- # job_lib.JobLibCodeGen.tail_logs, we do not directly call that function
- # as the following code need to be run in the current machine, instead
- # of running remotely.
- run_timestamp = job_lib.get_run_timestamp(job_id)
- if run_timestamp is None:
- return f'No managed job contrller log found with job_id {job_id}.'
- log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp)
- log_lib.tail_logs(job_id=job_id, log_dir=log_dir, follow=follow)
+
+ controller_log_path = os.path.join(
+ os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
+ f'{job_id}.log')
+ job_status = None
+
+ # Wait for the log file to be written
+ while not os.path.exists(controller_log_path):
+ if not follow:
+ # Assume that the log file hasn't been written yet. Since we
+ # aren't following, just return.
+ return ''
+
+ job_status = managed_job_state.get_status(job_id)
+ if job_status is None:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(f'Job {job_id} not found.')
+ if job_status.is_terminal():
+ # Don't keep waiting. If the log file is not created by this
+ # point, it never will be. This job may have been submitted
+ # using an old version that did not create the log file, so this
+ # is not considered an exceptional case.
+ return ''
+
+ time.sleep(log_lib.SKY_LOG_WAITING_GAP_SECONDS)
+
+ # This code is based on log_lib.tail_logs. We can't use that code
+ # exactly because state works differently between managed jobs and
+ # normal jobs.
+ with open(controller_log_path, 'r', newline='', encoding='utf-8') as f:
+ # Note: we do not need to care about start_stream_at here, since
+ # that should be in the job log printed above.
+ for line in f:
+ print(line, end='')
+ # Flush.
+ print(end='', flush=True)
+
+ if follow:
+ while True:
+ # Print all new lines, if there are any.
+ line = f.readline()
+ while line is not None and line != '':
+ print(line, end='')
+ line = f.readline()
+
+ # Flush.
+ print(end='', flush=True)
+
+ # Check if the job if finished.
+ # TODO(cooperc): The controller can still be
+ # cleaning up if job is in a terminal status
+ # (e.g. SUCCEEDED). We want to follow those logs
+ # too. Use DONE instead?
+ job_status = managed_job_state.get_status(job_id)
+ assert job_status is not None, (job_id, job_name)
+ if job_status.is_terminal():
+ break
+
+ time.sleep(log_lib.SKY_LOG_TAILING_GAP_SECONDS)
+
+ # Wait for final logs to be written.
+ time.sleep(1 + log_lib.SKY_LOG_TAILING_GAP_SECONDS)
+
+ # Print any remaining logs including incomplete line.
+ print(f.read(), end='', flush=True)
+
+ if follow:
+ return ux_utils.finishing_message(
+ f'Job finished (status: {job_status}).')
+
  return ''
 
  if job_id is None:
  assert job_name is not None
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
- if len(job_ids) == 0:
+ if not job_ids:
  return f'No running managed job found with name {job_name!r}.'
  if len(job_ids) > 1:
  raise ValueError(
@@ -520,6 +870,7 @@ def dump_managed_job_queue() -> str:
  job_duration = 0
  job['job_duration'] = job_duration
  job['status'] = job['status'].value
+ job['schedule_state'] = job['schedule_state'].value
 
  cluster_name = generate_managed_job_cluster_name(
  job['task_name'], job['job_id'])
@@ -534,12 +885,12 @@ def dump_managed_job_queue() -> str:
  job['cluster_resources'] = '-'
  job['region'] = '-'
 
- return common_utils.encode_payload(jobs)
+ return message_utils.encode_payload(jobs)
 
 
  def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
  """Load job queue from json string."""
- jobs = common_utils.decode_payload(payload)
+ jobs = message_utils.decode_payload(payload)
  for job in jobs:
  job['status'] = managed_job_state.ManagedJobStatus(job['status'])
  return jobs
@@ -568,6 +919,7 @@ def _get_job_status_from_tasks(
  @typing.overload
  def format_job_table(tasks: List[Dict[str, Any]],
  show_all: bool,
+ show_user: bool,
  return_rows: Literal[False] = False,
  max_jobs: Optional[int] = None) -> str:
  ...
@@ -576,6 +928,7 @@ def format_job_table(tasks: List[Dict[str, Any]],
  @typing.overload
  def format_job_table(tasks: List[Dict[str, Any]],
  show_all: bool,
+ show_user: bool,
  return_rows: Literal[True],
  max_jobs: Optional[int] = None) -> List[List[str]]:
  ...
@@ -584,6 +937,7 @@ def format_job_table(tasks: List[Dict[str, Any]],
  def format_job_table(
  tasks: List[Dict[str, Any]],
  show_all: bool,
+ show_user: bool,
  return_rows: bool = False,
  max_jobs: Optional[int] = None) -> Union[str, List[List[str]]]:
  """Returns managed jobs as a formatted string.
@@ -599,11 +953,21 @@ def format_job_table(
  a list of "rows" (each of which is a list of str).
  """
  jobs = collections.defaultdict(list)
+ # Check if the tasks have user information from kubernetes.
+ # This is only used for sky status --kubernetes.
+ tasks_have_k8s_user = any([task.get('user') for task in tasks])
+ if max_jobs and tasks_have_k8s_user:
+ raise ValueError('max_jobs is not supported when tasks have user info.')
+
+ def get_hash(task):
+ if tasks_have_k8s_user:
+ return (task['user'], task['job_id'])
+ return task['job_id']
+
  for task in tasks:
  # The tasks within the same job_id are already sorted
  # by the task_id.
- jobs[task['job_id']].append(task)
- jobs = dict(jobs)
+ jobs[get_hash(task)].append(task)
 
  status_counts: Dict[str, int] = collections.defaultdict(int)
  for job_tasks in jobs.values():
@@ -611,17 +975,29 @@ def format_job_table(
  if not managed_job_status.is_terminal():
  status_counts[managed_job_status.value] += 1
 
- if max_jobs is not None:
- job_ids = sorted(jobs.keys(), reverse=True)
- job_ids = job_ids[:max_jobs]
- jobs = {job_id: jobs[job_id] for job_id in job_ids}
+ user_cols: List[str] = []
+ if show_user:
+ user_cols = ['USER']
+ if show_all:
+ user_cols.append('USER_ID')
 
  columns = [
- 'ID', 'TASK', 'NAME', 'RESOURCES', 'SUBMITTED', 'TOT. DURATION',
- 'JOB DURATION', '#RECOVERIES', 'STATUS'
+ 'ID',
+ 'TASK',
+ 'NAME',
+ *user_cols,
+ 'RESOURCES',
+ 'SUBMITTED',
+ 'TOT. DURATION',
+ 'JOB DURATION',
+ '#RECOVERIES',
+ 'STATUS',
  ]
  if show_all:
- columns += ['STARTED', 'CLUSTER', 'REGION', 'FAILURE']
+ # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
+ columns += ['STARTED', 'CLUSTER', 'REGION', 'SCHED. STATE', 'DETAILS']
+ if tasks_have_k8s_user:
+ columns.insert(0, 'USER')
  job_table = log_utils.create_table(columns)
 
  status_counts: Dict[str, int] = collections.defaultdict(int)
@@ -636,9 +1012,33 @@ def format_job_table(
  for task in all_tasks:
  # The tasks within the same job_id are already sorted
  # by the task_id.
- jobs[task['job_id']].append(task)
+ jobs[get_hash(task)].append(task)
+
+ def generate_details(failure_reason: Optional[str]) -> str:
+ if failure_reason is not None:
+ return f'Failure: {failure_reason}'
+ return '-'
+
+ def get_user_column_values(task: Dict[str, Any]) -> List[str]:
+ user_values: List[str] = []
+ if show_user:
+
+ user_name = '-'
+ user_hash = task.get('user_hash', None)
+ if user_hash:
+ user = global_user_state.get_user(user_hash)
+ user_name = user.name if user.name else '-'
+ user_values = [user_name]
+
+ if show_all:
+ user_values.append(user_hash if user_hash is not None else '-')
+
+ return user_values
+
+ for job_hash, job_tasks in jobs.items():
+ if show_all:
+ schedule_state = job_tasks[0]['schedule_state']
 
- for job_id, job_tasks in jobs.items():
  if len(job_tasks) > 1:
  # Aggregate the tasks into a new row in the table.
  job_name = job_tasks[0]['job_name']
@@ -661,7 +1061,6 @@ def format_job_table(
                     end_at = None
                 recovery_cnt += task['recovery_count']
 
-            failure_reason = job_tasks[current_task_id]['failure_reason']
             job_duration = log_utils.readable_time_duration(0,
                                                             job_duration,
                                                             absolute=True)
@@ -674,10 +1073,14 @@ def format_job_table(
             if not managed_job_status.is_terminal():
                 status_str += f' (task: {current_task_id})'
 
+            user_values = get_user_column_values(job_tasks[0])
+
+            job_id = job_hash[1] if tasks_have_k8s_user else job_hash
             job_values = [
                 job_id,
                 '',
                 job_name,
+                *user_values,
                 '-',
                 submitted,
                 total_duration,
@@ -686,12 +1089,16 @@ def format_job_table(
                 status_str,
             ]
             if show_all:
+                failure_reason = job_tasks[current_task_id]['failure_reason']
                 job_values.extend([
                     '-',
                     '-',
                     '-',
-                    failure_reason if failure_reason is not None else '-',
+                    job_tasks[0]['schedule_state'],
+                    generate_details(failure_reason),
                 ])
+            if tasks_have_k8s_user:
+                job_values.insert(0, job_tasks[0].get('user', '-'))
             job_table.add_row(job_values)
 
         for task in job_tasks:
@@ -700,10 +1107,12 @@ def format_job_table(
             job_duration = log_utils.readable_time_duration(
                 0, task['job_duration'], absolute=True)
             submitted = log_utils.readable_time_duration(task['submitted_at'])
+            user_values = get_user_column_values(task)
             values = [
                 task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
                 task['task_id'] if len(job_tasks) > 1 else '-',
                 task['task_name'],
+                *user_values,
                 task['resources'],
                 # SUBMITTED
                 submitted if submitted != '-' else submitted,
@@ -716,14 +1125,20 @@ def format_job_table(
                 task['status'].colored_str(),
             ]
             if show_all:
+                # schedule_state is only set at the job level, so if we have
+                # more than one task, only display on the aggregated row.
+                schedule_state = (task['schedule_state']
+                                  if len(job_tasks) == 1 else '-')
                 values.extend([
                     # STARTED
                     log_utils.readable_time_duration(task['start_at']),
                     task['cluster_resources'],
                     task['region'],
-                    task['failure_reason']
-                    if task['failure_reason'] is not None else '-',
+                    schedule_state,
+                    generate_details(task['failure_reason']),
                 ])
+            if tasks_have_k8s_user:
+                values.insert(0, task.get('user', '-'))
             job_table.add_row(values)
 
         if len(job_tasks) > 1:
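
With the signature change above, callers of format_job_table now pass show_user explicitly. A minimal usage sketch (tasks stands for the list of task dicts this module already works with; the variable names are illustrative):

    # Formatted string for display:
    table_str = format_job_table(tasks, show_all=False, show_user=True)
    # Raw rows for further processing:
    rows = format_job_table(tasks, show_all=True, show_user=True,
                            return_rows=True)
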
@@ -751,36 +1166,34 @@ class ManagedJobCodeGen:
 
     >> codegen = ManagedJobCodeGen.show_jobs(...)
     """
-    # TODO: the try..except.. block is for backward compatibility. Remove it in
-    # v0.8.0.
     _PREFIX = textwrap.dedent("""\
-        managed_job_version = 0
-        try:
-            from sky.jobs import utils
-            from sky.jobs import constants as managed_job_constants
-            from sky.jobs import state as managed_job_state
-
-            managed_job_version = managed_job_constants.MANAGED_JOBS_VERSION
-        except ImportError:
-            from sky.spot import spot_state as managed_job_state
-            from sky.spot import spot_utils as utils
+        from sky.jobs import utils
+        from sky.jobs import state as managed_job_state
+        from sky.jobs import constants as managed_job_constants
+
+        managed_job_version = managed_job_constants.MANAGED_JOBS_VERSION
         """)
 
     @classmethod
     def get_job_table(cls) -> str:
         code = textwrap.dedent("""\
-        if managed_job_version < 1:
-            job_table = utils.dump_spot_job_queue()
-        else:
-            job_table = utils.dump_managed_job_queue()
+        job_table = utils.dump_managed_job_queue()
         print(job_table, flush=True)
         """)
         return cls._build(code)
 
     @classmethod
-    def cancel_jobs_by_id(cls, job_ids: Optional[List[int]]) -> str:
+    def cancel_jobs_by_id(cls,
+                          job_ids: Optional[List[int]],
+                          all_users: bool = False) -> str:
         code = textwrap.dedent(f"""\
-        msg = utils.cancel_jobs_by_id({job_ids})
+        if managed_job_version < 2:
+            # For backward compatibility, since all_users is not supported
+            # before #4787. Assume th
+            # TODO(cooperc): Remove compatibility before 0.12.0
+            msg = utils.cancel_jobs_by_id({job_ids})
+        else:
+            msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users})
         print(msg, end="", flush=True)
         """)
         return cls._build(code)
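
A usage sketch for the extended codegen (the return value is the shell command string assembled by _build below; the job ids are illustrative):

    # Cancel specific jobs for the calling user:
    cmd = ManagedJobCodeGen.cancel_jobs_by_id([3, 4])
    # Also match jobs owned by other users, honored when the controller's
    # managed_job_version is >= 2 (older controllers fall back to the
    # single-user call, as the generated snippet above shows):
    cmd = ManagedJobCodeGen.cancel_jobs_by_id([3, 4], all_users=True)
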
@@ -793,33 +1206,24 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)
 
+    @classmethod
+    def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
+        code = textwrap.dedent(f"""\
+        from sky.utils import message_utils
+        job_id = managed_job_state.get_all_job_ids_by_name({job_name!r})
+        print(message_utils.encode_payload(job_id), end="", flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def stream_logs(cls,
                     job_name: Optional[str],
                     job_id: Optional[int],
                     follow: bool = True,
                     controller: bool = False) -> str:
-        # We inspect the source code of the function here for backward
-        # compatibility.
-        # TODO: change to utils.stream_logs(job_id, job_name, follow) in v0.8.0.
-        # Import libraries required by `stream_logs`. The try...except... block
-        # should be removed in v0.8.0.
-        code = textwrap.dedent("""\
-            import os
-
-            from sky.skylet import job_lib, log_lib
-            from sky.skylet import constants
-            try:
-                from sky.jobs.utils import stream_logs_by_id
-            except ImportError:
-                from sky.spot.spot_utils import stream_logs_by_id
-            from typing import Optional
-            """)
-        code += inspect.getsource(stream_logs)
-        code += textwrap.dedent(f"""\
-
-        msg = stream_logs({job_id!r}, {job_name!r},
-                          follow={follow}, controller={controller})
+        code = textwrap.dedent(f"""\
+        msg = utils.stream_logs({job_id!r}, {job_name!r},
+                                follow={follow}, controller={controller})
         print(msg, flush=True)
         """)
         return cls._build(code)
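
The new get_all_job_ids_by_name codegen prints an encoded payload instead of a bare value, so the caller has to decode the command's stdout. A rough sketch of the consumer side, assuming the matching message_utils.decode_payload helper is used to parse the output, with run_on_controller standing in for however the command actually gets executed:

    code = ManagedJobCodeGen.get_all_job_ids_by_name('my-job')
    stdout = run_on_controller(code)  # hypothetical executor
    job_ids = message_utils.decode_payload(stdout)
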
@@ -829,13 +1233,13 @@ class ManagedJobCodeGen:
         dag_name = managed_job_dag.name
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
-            managed_job_state.set_job_name({job_id}, {dag_name!r})
+            managed_job_state.set_job_info({job_id}, {dag_name!r})
             """)
         for task_id, task in enumerate(managed_job_dag.tasks):
             resources_str = backend_utils.get_task_resources_str(
                 task, is_managed_job=True)
             code += textwrap.dedent(f"""\
-                managed_job_state.set_pending({job_id}, {task_id},
+                managed_job_state.set_pending({job_id}, {task_id},
                                               {task.name!r}, {resources_str!r})
                 """)
         return cls._build(code)
@@ -843,4 +1247,9 @@ class ManagedJobCodeGen:
     @classmethod
     def _build(cls, code: str) -> str:
         generated_code = cls._PREFIX + '\n' + code
-        return f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(generated_code)}'
+        # Use the local user id to make sure the operation goes to the correct
+        # user.
+        return (
+            f'export {constants.USER_ID_ENV_VAR}='
+            f'"{common_utils.get_user_hash()}"; '
+            f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(generated_code)}')
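
For clarity, the command _build now returns has roughly this shape (shown symbolically; the user hash is whatever common_utils.get_user_hash() returns on the calling machine):

    export <USER_ID_ENV_VAR>="<caller user hash>"; <SKY_PYTHON_CMD> -u -c '<prefix + generated code>'

so every generated snippet runs on the controller under the identity of the user who issued it.
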