skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -16,7 +16,7 @@ import re
 import threading
 import time
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union
 import uuid
 
 import sqlalchemy
@@ -1020,8 +1020,46 @@ async def cluster_event_retention_daemon():
         await asyncio.sleep(sleep_amount)
 
 
-def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
-                       event_type: ClusterEventType) -> List[str]:
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: Literal[False],
+    limit: Optional[int] = ...,
+) -> List[str]:
+    ...
+
+
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: Literal[True],
+    limit: Optional[int] = ...,
+) -> List[Dict[str, Union[str, int]]]:
+    ...
+
+
+@typing.overload
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: bool = ...,
+    limit: Optional[int] = ...,
+) -> Union[List[str], List[Dict[str, Union[str, int]]]]:
+    ...
+
+
+def get_cluster_events(
+    cluster_name: Optional[str],
+    cluster_hash: Optional[str],
+    event_type: ClusterEventType,
+    include_timestamps: bool = False,
+    limit: Optional[int] = None
+) -> Union[List[str], List[Dict[str, Union[str, int]]]]:
     """Returns the cluster events for the cluster.
 
     Args:
@@ -1030,22 +1068,44 @@ def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
         cluster_hash: Hash of the cluster. Cannot be specified if cluster_name
             is specified.
         event_type: Type of the event.
+        include_timestamps: If True, returns list of dicts with 'reason' and
+            'transitioned_at' fields. If False, returns list of reason strings.
+        limit: If specified, returns at most this many events (most recent).
+            If None, returns all events.
+
+    Returns:
+        If include_timestamps is False: List of reason strings.
+        If include_timestamps is True: List of dicts with 'reason' and
+            'transitioned_at' (unix timestamp) fields.
+        Events are ordered from oldest to newest.
     """
     assert _SQLALCHEMY_ENGINE is not None
 
-    if cluster_name is not None and cluster_hash is not None:
-        raise ValueError('Cannot specify both cluster_name and cluster_hash')
-    if cluster_name is None and cluster_hash is None:
-        raise ValueError('Must specify either cluster_name or cluster_hash')
-    if cluster_name is not None:
-        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
-        if cluster_hash is None:
-            raise ValueError(f'Hash for cluster {cluster_name} not found.')
-
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.query(cluster_event_table).filter_by(
-            cluster_hash=cluster_hash, type=event_type.value).order_by(
-                cluster_event_table.c.transitioned_at.asc()).all()
+    cluster_hash = _resolve_cluster_hash(cluster_hash, cluster_name)
+    if cluster_hash is None:
+        raise ValueError(f'Hash for cluster {cluster_name} not found.')
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if limit is not None:
+            # To get the most recent N events in ASC order, we use a subquery:
+            # 1. Get most recent N events (ORDER BY DESC LIMIT N)
+            # 2. Re-order them by ASC
+            subquery = session.query(cluster_event_table).filter_by(
+                cluster_hash=cluster_hash, type=event_type.value).order_by(
+                    cluster_event_table.c.transitioned_at.desc()).limit(
+                        limit).subquery()
+            rows = session.query(subquery).order_by(
+                subquery.c.transitioned_at.asc()).all()
+        else:
+            rows = session.query(cluster_event_table).filter_by(
+                cluster_hash=cluster_hash, type=event_type.value).order_by(
+                    cluster_event_table.c.transitioned_at.asc()).all()
+
+    if include_timestamps:
+        return [{
+            'reason': row.reason,
+            'transitioned_at': row.transitioned_at
+        } for row in rows]
     return [row.reason for row in rows]
 
 
@@ -1537,6 +1597,38 @@ def _get_hash_for_existing_cluster(cluster_name: str) -> Optional[str]:
     return row.cluster_hash
 
 
+def _resolve_cluster_hash(cluster_hash: Optional[str] = None,
+                          cluster_name: Optional[str] = None) -> Optional[str]:
+    """Resolve cluster_hash from either cluster_hash or cluster_name.
+
+    Validates that exactly one of cluster_hash or cluster_name is provided,
+    then resolves cluster_name to cluster_hash if needed.
+
+    Args:
+        cluster_hash: Direct cluster hash, if known.
+        cluster_name: Cluster name to resolve to hash.
+
+    Returns:
+        The cluster_hash string, or None if cluster_name was provided but
+        the cluster doesn't exist.
+
+    Raises:
+        ValueError: If both or neither of cluster_hash/cluster_name are
+            provided.
+    """
+    if cluster_hash is not None and cluster_name is not None:
+        raise ValueError(f'Cannot specify both cluster_hash ({cluster_hash}) '
+                         f'and cluster_name ({cluster_name})')
+
+    if cluster_hash is None and cluster_name is None:
+        raise ValueError('Must specify either cluster_hash or cluster_name')
+
+    if cluster_name is not None:
+        return _get_hash_for_existing_cluster(cluster_name)
+
+    return cluster_hash
+
+
 @_init_db
 @metrics_lib.time_me
 def get_launched_resources_from_cluster_hash(
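
The limit branch above relies on a small but handy SQL pattern: fetch the newest N rows with ORDER BY ... DESC LIMIT N, then re-sort that subquery ascending so callers still receive events oldest-to-newest. Below is a minimal standalone SQLAlchemy sketch of the same idea; the events table, its columns, and the helper name are invented for illustration and are not SkyPilot's actual cluster_event_table schema.

import sqlalchemy as sa

# Hypothetical table for illustration only; not SkyPilot's real schema.
metadata = sa.MetaData()
events = sa.Table(
    'events', metadata,
    sa.Column('id', sa.Integer, primary_key=True),
    sa.Column('reason', sa.Text),
    sa.Column('transitioned_at', sa.Integer),  # unix timestamp
)

engine = sa.create_engine('sqlite:///:memory:')
metadata.create_all(engine)


def most_recent_events_ascending(conn, limit: int):
    # Step 1: newest `limit` rows (ORDER BY DESC + LIMIT), as a subquery.
    newest = (sa.select(events)
              .order_by(events.c.transitioned_at.desc())
              .limit(limit)
              .subquery())
    # Step 2: re-order those rows oldest-to-newest for display.
    stmt = sa.select(newest).order_by(newest.c.transitioned_at.asc())
    return conn.execute(stmt).all()


with engine.begin() as conn:
    conn.execute(events.insert(), [
        {'reason': f'event {i}', 'transitioned_at': i} for i in range(10)
    ])
    rows = most_recent_events_ascending(conn, limit=3)
    print([r.reason for r in rows])  # ['event 7', 'event 8', 'event 9']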
sky/jobs/client/sdk.py CHANGED
@@ -7,6 +7,7 @@ import click
 
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
+from sky.backends import backend_utils
 from sky.client import common as client_common
 from sky.client import sdk
 from sky.schemas.api import responses
@@ -100,9 +101,13 @@ def launch(
             pool_statuses = sdk.get(pool_status_request_id)
             if not pool_statuses:
                 raise click.UsageError(f'Pool {pool!r} not found.')
-            resources = pool_statuses[0]['requested_resources_str']
-            click.secho(f'Use resources from pool {pool!r}: {resources}.',
-                        fg='green')
+            # Show the job's requested resources, not the pool worker
+            # resources
+            job_resources_str = backend_utils.get_task_resources_str(
+                dag.tasks[0], is_managed_job=True)
+            click.secho(
+                f'Use resources from pool {pool!r}: {job_resources_str}.',
+                fg='green')
         if num_jobs is not None:
             job_identity = f'{num_jobs} managed jobs'
             prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
sky/jobs/controller.py CHANGED
@@ -2,6 +2,7 @@
 """
 import asyncio
 import io
+import json
 import os
 import pathlib
 import resource
@@ -11,7 +12,7 @@ import threading
 import time
 import traceback
 import typing
-from typing import Dict, Optional, Set
+from typing import Dict, List, Optional, Set
 
 import dotenv
 
@@ -31,6 +32,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.server import plugins
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -43,11 +45,16 @@ from sky.utils import controller_utils
 from sky.utils import dag_utils
 from sky.utils import status_lib
 from sky.utils import ux_utils
+from sky.utils.plugin_extensions import ExternalClusterFailure
+from sky.utils.plugin_extensions import ExternalFailureSource
 
 if typing.TYPE_CHECKING:
     import psutil
+
+    from sky.schemas.generated import jobsv1_pb2
 else:
     psutil = adaptors_common.LazyImport('psutil')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
 
 logger = sky_logging.init_logger('sky.jobs.controller')
 
@@ -236,6 +243,64 @@ class JobController:
         await context_utils.to_thread(managed_job_utils.terminate_cluster,
                                       cluster_name)
 
+    async def _get_job_exit_codes(
+        self, job_id: Optional[int],
+        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+    ) -> Optional[list]:
+        """Retrieve exit codes from the remote cluster.
+
+        Args:
+            job_id: The job ID on the remote cluster.
+            handle: The handle to the cluster.
+
+        Returns:
+            List of exit codes, or None if not available.
+        """
+        try:
+            use_legacy = not handle.is_grpc_enabled_with_flag
+
+            if not use_legacy:
+                try:
+                    request = jobsv1_pb2.GetJobExitCodesRequest()
+                    if job_id is not None:
+                        request.job_id = job_id
+
+                    response = await context_utils.to_thread(
+                        backend_utils.invoke_skylet_with_retries,
+                        lambda: cloud_vm_ray_backend.SkyletClient(
+                            handle.get_grpc_channel()).get_job_exit_codes(
+                                request))
+
+                    exit_codes = list(
+                        response.exit_codes) if response.exit_codes else None
+                    return exit_codes
+                except exceptions.SkyletMethodNotImplementedError:
+                    # Fall back to legacy if RPC not implemented
+                    use_legacy = True
+
+            if use_legacy:
+                # Use existing SSH-based code generation
+                code = job_lib.JobLibCodeGen.get_job_exit_codes(job_id)
+
+                returncode, stdout, stderr = await context_utils.to_thread(
+                    self._backend.run_on_head,
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+
+                if returncode != 0:
+                    logger.debug(f'Failed to retrieve exit codes: {stderr}')
+                    return None
+
+                exit_codes = json.loads(stdout.strip())
+                return exit_codes
+        except Exception as e:  # pylint: disable=broad-except
+            logger.debug(f'Failed to retrieve job exit codes: {e}')
+            return None
+        return None
+
     async def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.
 
@@ -334,6 +399,12 @@ class JobController:
         resources_str = backend_utils.get_task_resources_str(
             task, is_managed_job=True)
 
+        # Get full_resources_json using get_resource_config which handles
+        # heterogeneous resource configurations (any_of/ordered).
+        full_resources_json = None
+        if task.resources:
+            full_resources_json = task.get_resource_config()
+
         await managed_job_state.set_starting_async(
             self._job_id,
             task_id,
@@ -342,9 +413,12 @@ class JobController:
             resources_str=resources_str,
             specs={
                 'max_restarts_on_errors':
-                    self._strategy_executor.max_restarts_on_errors
+                    self._strategy_executor.max_restarts_on_errors,
+                'recover_on_exit_codes':
+                    self._strategy_executor.recover_on_exit_codes
             },
-            callback_func=callback_func)
+            callback_func=callback_func,
+            full_resources_json=full_resources_json)
         logger.info(f'Submitted managed job {self._job_id} '
                     f'(task: {task_id}, name: {task.name!r}); '
                     f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
@@ -365,9 +439,8 @@ class JobController:
         launch_time = time.time() - launch_start
         logger.info(f'Cluster launch completed in {launch_time:.2f}s')
         assert remote_job_submitted_at is not None, remote_job_submitted_at
-        if self._pool is None:
-            job_id_on_pool_cluster = None
-        else:
+        job_id_on_pool_cluster: Optional[int] = None
+        if self._pool:
             # Update the cluster name when using pool.
             cluster_name, job_id_on_pool_cluster = (
                 await
@@ -411,6 +484,8 @@ class JobController:
         except KeyError:
             pass
 
+        transient_job_check_error_start_time = None
+        job_check_backoff = None
         while True:
            status_check_count += 1
 
@@ -462,19 +537,38 @@ class JobController:
            # recovering, we will set the job status to None, which will force
            # enter the recovering logic.
            job_status = None
+            transient_job_check_error_reason = None
            if not force_transit_to_recovering:
                try:
-                    job_status = await managed_job_utils.get_job_status(
-                        self._backend,
-                        cluster_name,
-                        job_id=job_id_on_pool_cluster,
-                    )
+                    job_status, transient_job_check_error_reason = await (
+                        managed_job_utils.get_job_status(
+                            self._backend,
+                            cluster_name,
+                            job_id=job_id_on_pool_cluster,
+                        ))
                except exceptions.FetchClusterInfoError as fetch_e:
                    logger.info(
                        'Failed to fetch the job status. Start recovery.\n'
                        f'Exception: {common_utils.format_exception(fetch_e)}\n'
                        f'Traceback: {traceback.format_exc()}')
 
+            # When job status check fails, we need to retry to avoid false alarm
+            # for job failure, as it could be a transient error for
+            # communication issue.
+            if transient_job_check_error_reason is not None:
+                logger.info(
+                    'Potential transient error when fetching the job '
+                    f'status. Reason: {transient_job_check_error_reason}.\n'
+                    'Check cluster status to determine if the job is '
+                    'preempted or failed.')
+                if transient_job_check_error_start_time is None:
+                    transient_job_check_error_start_time = time.time()
+                    job_check_backoff = common_utils.Backoff(
+                        initial_backoff=1, max_backoff_factor=5)
+            else:
+                transient_job_check_error_start_time = None
+                job_check_backoff = None
+
            if job_status == job_lib.JobStatus.SUCCEEDED:
                logger.info(f'Task {task_id} succeeded! '
                            'Getting end time and cleaning up')
@@ -550,15 +644,16 @@ class JobController:
 
            # Pull the actual cluster status from the cloud provider to
            # determine whether the cluster is preempted or failed.
-            # TODO(zhwu): For hardware failure, such as GPU failure, it may not
-            # be reflected in the cluster status, depending on the cloud, which
-            # can also cause failure of the job, and we need to recover it
-            # rather than fail immediately.
-            (cluster_status,
-             handle) = backend_utils.refresh_cluster_status_handle(
-                 cluster_name,
-                 force_refresh_statuses=set(status_lib.ClusterStatus))
-
+            # NOTE: Some failures may not be reflected in the cluster status
+            # depending on the cloud, which can also cause failure of the job.
+            # Plugins can report such failures via ExternalFailureSource.
+            # TODO(cooperc): do we need to add this to asyncio thread?
+            (cluster_status, handle) = await context_utils.to_thread(
+                backend_utils.refresh_cluster_status_handle,
+                cluster_name,
+                force_refresh_statuses=set(status_lib.ClusterStatus))
+
+            external_failures: Optional[List[ExternalClusterFailure]] = None
            if cluster_status != status_lib.ClusterStatus.UP:
                # The cluster is (partially) preempted or failed. It can be
                # down, INIT or STOPPED, based on the interruption behavior of
@@ -569,6 +664,15 @@ class JobController:
                logger.info(
                    f'Cluster is preempted or failed{cluster_status_str}. '
                    'Recovering...')
+                if ExternalFailureSource.is_registered():
+                    cluster_failures = await context_utils.to_thread(
+                        ExternalFailureSource.get, cluster_name=cluster_name)
+                    if cluster_failures:
+                        logger.info(
+                            f'Detected cluster failures: {cluster_failures}')
+                        external_failures = (
+                            ExternalClusterFailure.from_failure_list(
+                                cluster_failures))
            else:
                if job_status is not None and not job_status.is_terminal():
                    # The multi-node job is still running, continue monitoring.
@@ -612,18 +716,37 @@ class JobController:
                        'can be caused by the job taking too much memory '
                        'or other resources. Try adding more memory, CPU, '
                        f'or disk in your job definition. {failure_reason}')
+
+                    # Retrieve exit codes from the failed job
+                    exit_codes = await self._get_job_exit_codes(
+                        job_id_on_pool_cluster, handle)
+
                    should_restart_on_failure = (
-                        self._strategy_executor.should_restart_on_failure())
+                        self._strategy_executor.should_restart_on_failure(
+                            exit_codes=exit_codes))
                    if should_restart_on_failure:
                        max_restarts = (
                            self._strategy_executor.max_restarts_on_errors)
-                        logger.info(
-                            f'User program crashed '
-                            f'({managed_job_status.value}). '
-                            f'Retry the job as max_restarts_on_errors is '
-                            f'set to {max_restarts}. '
+                        exit_code_msg = (
+                            '(Retry the job as '
+                            f'max_restarts_on_errors is set to {max_restarts}. '
                            f'[{self._strategy_executor.restart_cnt_on_failure}'
-                            f'/{max_restarts}]')
+                            f'/{max_restarts}])')
+                        if (exit_codes and
+                                self._strategy_executor.recover_on_exit_codes):
+                            recover_codes = (
+                                self._strategy_executor.recover_on_exit_codes)
+                            matching_codes = [
+                                c for c in exit_codes if c in recover_codes
+                            ]
+                            if matching_codes:
+                                exit_code_msg = (
+                                    f'(Exit code(s) {matching_codes} matched '
+                                    'recover_on_exit_codes '
+                                    f'[{recover_codes}])')
+                        logger.info(
+                            'User program crashed '
+                            f'({managed_job_status.value}). {exit_code_msg}')
                    else:
                        logger.info(
                            f'Task {task_id} failed and will not be retried')
@@ -655,9 +778,42 @@ class JobController:
                # job status. Try to recover the job (will not restart the
                # cluster, if the cluster is healthy).
                assert job_status is None, job_status
-                logger.info('Failed to fetch the job status while the '
-                            'cluster is healthy. Try to recover the job '
-                            '(the cluster will not be restarted).')
+                if transient_job_check_error_reason is not None:
+                    assert (transient_job_check_error_start_time
+                            is not None), (
+                                transient_job_check_error_start_time,
+                                transient_job_check_error_reason)
+                    assert job_check_backoff is not None, (
+                        job_check_backoff, transient_job_check_error_reason)
+                    elapsed = time.time(
+                    ) - transient_job_check_error_start_time
+                    if (elapsed < managed_job_utils.
+                            JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS):
+                        remaining_timeout = (
+                            managed_job_utils.
+                            JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS -
+                            elapsed)
+                        backoff_time = min(
+                            job_check_backoff.current_backoff(),
+                            remaining_timeout)
+                        logger.info(
+                            'Failed to fetch the job status while the '
+                            'cluster is healthy. Retrying to avoid false'
+                            'alarm for job failure. Retrying in '
+                            f'{backoff_time:.1f} seconds...')
+                        await asyncio.sleep(backoff_time)
+                        continue
+                    else:
+                        logger.info(
+                            'Failed to fetch the job status after retrying '
+                            f'for {elapsed:.1f} seconds. Try to recover '
+                            'the job by restarting the job/cluster.')
+                else:
+                    logger.info(
+                        'Failed to fetch the job status due to '
+                        'unrecoverable error. Try to recover the job by'
+                        ' restarting the job/cluster.')
+
            # When the handle is None, the cluster should be cleaned up already.
            if handle is not None:
                resources = handle.launched_resources
@@ -688,7 +844,9 @@ class JobController:
                job_id=self._job_id,
                task_id=task_id,
                force_transit_to_recovering=force_transit_to_recovering,
-                callback_func=callback_func)
+                callback_func=callback_func,
+                external_failures=external_failures,
+            )
 
            recovered_time = await self._strategy_executor.recover()
 
@@ -1183,6 +1341,8 @@ async def main(controller_uuid: str):
 
     context_utils.hijack_sys_attrs()
 
+    plugins.load_plugins(plugins.ExtensionContext())
+
     controller = ControllerManager(controller_uuid)
 
     # Will happen multiple times, who cares though
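
The controller change above stops treating a single failed status probe as a job failure: it records when transient errors began, retries with common_utils.Backoff, and only falls through to recovery once managed_job_utils.JOB_STATUS_FETCH_TOTAL_TIMEOUT_SECONDS has elapsed. Below is a standalone sketch of that retry-with-deadline idea; the _Backoff class, the fetch_with_deadline helper, and the 60-second default are assumptions made for illustration and are not SkyPilot's implementation, which runs inside the async monitoring loop shown in the diff.

import random
import time
from typing import Callable, Optional


class _Backoff:
    """Tiny exponential-backoff helper (illustrative, not sky's Backoff)."""

    def __init__(self, initial_backoff: float = 1.0,
                 max_backoff_factor: int = 5):
        self._initial = initial_backoff
        self._cap = initial_backoff * (2 ** max_backoff_factor)
        self._attempts = 0

    def current_backoff(self) -> float:
        backoff = min(self._initial * (2 ** self._attempts), self._cap)
        self._attempts += 1
        # Jitter so many controllers do not retry in lockstep.
        return backoff * random.uniform(0.8, 1.2)


def fetch_with_deadline(fetch: Callable[[], Optional[str]],
                        total_timeout: float = 60.0) -> Optional[str]:
    """Retry a flaky status probe until a total deadline expires.

    A None result is treated as a transient failure and retried; only after
    `total_timeout` seconds of consecutive failures do we give up, mirroring
    the controller's fall-through to recovery.
    """
    start = time.time()
    backoff = _Backoff(initial_backoff=1, max_backoff_factor=5)
    while True:
        status = fetch()
        if status is not None:
            return status
        elapsed = time.time() - start
        if elapsed >= total_timeout:
            return None  # Caller treats this as "start recovery".
        time.sleep(min(backoff.current_backoff(), total_timeout - elapsed))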