skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED

@@ -10,7 +10,7 @@ import logging
 import os
 import traceback
 import typing
-from typing import Optional, Set
+from typing import List, Optional, Set

 from sky import backends
 from sky import dag as dag_lib
@@ -30,6 +30,7 @@ from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import env_options
+from sky.utils import instance_links as instance_links_utils
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -74,6 +75,7 @@ class StrategyExecutor:
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
+        recover_on_exit_codes: Optional[List[int]] = None,
     ) -> None:
         """Initialize the strategy executor.

@@ -87,6 +89,8 @@ class StrategyExecutor:
             starting: Set of job IDs that are currently starting.
             starting_lock: Lock to synchronize starting jobs.
             starting_signal: Condition to signal when a job can start.
+            recover_on_exit_codes: List of exit codes that should trigger
+                recovery regardless of max_restarts_on_errors limit.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
@@ -99,6 +103,7 @@ class StrategyExecutor:
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
+        self.recover_on_exit_codes = recover_on_exit_codes or []
         self.job_id = job_id
         self.task_id = task_id
         self.pool = pool
@@ -123,6 +128,9 @@ class StrategyExecutor:
     ) -> 'StrategyExecutor':
         """Create a strategy from a task."""

+        # TODO(cooperc): Consider defaulting to FAILOVER if using k8s with a
+        # single context, since there are not multiple clouds/regions to
+        # failover through.
         resource_list = list(task.resources)
         job_recovery = resource_list[0].job_recovery
         for resource in resource_list:
@@ -144,16 +152,26 @@ class StrategyExecutor:
             job_recovery_name: Optional[str] = name
             max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
                                                       0)
+            recover_exit_codes = job_recovery.pop('recover_on_exit_codes', None)
+            # Normalize single integer to list
+            recover_on_exit_codes: Optional[List[int]] = None
+            if isinstance(recover_exit_codes, int):
+                recover_on_exit_codes = [recover_exit_codes]
+            elif isinstance(recover_exit_codes, list):
+                recover_on_exit_codes = [
+                    int(code) for code in recover_exit_codes
+                ]
         else:
             job_recovery_name = job_recovery
             max_restarts_on_errors = 0
+            recover_on_exit_codes = None
         job_recovery_strategy = (registry.JOBS_RECOVERY_STRATEGY_REGISTRY.
                                  from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
                                      pool, starting, starting_lock,
-                                     starting_signal)
+                                     starting_signal, recover_on_exit_codes)

     async def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -275,19 +293,25 @@ class StrategyExecutor:
                 break

             try:
-                status =
+                status, transient_error_reason = (
+                    await managed_job_utils.get_job_status(
+                        self.backend,
+                        self.cluster_name,
+                        job_id=self.job_id_on_pool_cluster))
             except Exception as e:  # pylint: disable=broad-except
+                transient_error_reason = common_utils.format_exception(e)
                 # If any unexpected error happens, retry the job checking
                 # loop.
                 # Note: the CommandError is already handled in the
                 # get_job_status, so it should not happen here.
                 # TODO(zhwu): log the unexpected error to usage collection
                 # for future debugging.
-                logger.info(
+                logger.info('Unexpected exception during fetching job status: '
+                            f'{common_utils.format_exception(e)}')
+                continue
+            if transient_error_reason is not None:
+                logger.info('Transient error when fetching the job status: '
+                            f'{transient_error_reason}')
                 continue

             # Check the job status until it is not in initialized status
@@ -444,9 +468,16 @@ class StrategyExecutor:
                     raise
                 logger.info('Managed job cluster launched.')
             else:
+                # Get task resources from DAG for resource-aware
+                # scheduling.
+                task_resources = None
+                if self.dag.tasks:
+                    task = self.dag.tasks[self.task_id]
+                    task_resources = task.resources
+
                 self.cluster_name = await (context_utils.to_thread(
                     serve_utils.get_next_cluster_name, self.pool,
-                    self.job_id))
+                    self.job_id, task_resources))
                 if self.cluster_name is None:
                     raise exceptions.NoClusterLaunchedError(
                         'No cluster name found in the pool.')
@@ -537,6 +568,52 @@ class StrategyExecutor:
                     # At this point, a sky.launch() has succeeded. Cluster
                     # may be UP (no preemption since) or DOWN (newly
                     # preempted).
+                    # Auto-populate instance links if cluster is on a real
+                    # cloud
+                    if self.cluster_name is not None and self.pool is None:
+                        try:
+                            handle = await context_utils.to_thread(
+                                global_user_state.
+                                get_handle_from_cluster_name,
+                                self.cluster_name)
+                            if (handle is not None and hasattr(
+                                    handle, 'cached_cluster_info') and
+                                    handle.cached_cluster_info is not None):
+                                cluster_info = handle.cached_cluster_info
+                                instance_links = (instance_links_utils.
+                                                  generate_instance_links(
+                                                      cluster_info,
+                                                      self.cluster_name))
+                                if instance_links:
+                                    # Store instance links directly in
+                                    # database
+                                    await state.update_links_async(
+                                        self.job_id, self.task_id,
+                                        instance_links)
+                                    logger.debug(
+                                        f'Auto-populated instance links: '
+                                        f'{instance_links}')
+                                else:
+                                    logger.debug('Failed to generate '
+                                                 'instance links')
+                            else:
+                                logger.debug(
+                                    'Cluster handle not found or '
+                                    'cached cluster info is None so'
+                                    'not populating instance links')
+                        except Exception as e:  # pylint: disable=broad-except
+                            # Don't fail the launch if we can't generate
+                            # links
+                            logger.debug(
+                                'Failed to auto-populate instance links: '
+                                f'{e}')
+                    else:
+                        if self.pool:
+                            logger.debug('Not populating instance links '
+                                         'since the cluster is for a pool')
+                        else:
+                            logger.debug('Not populating instance links '
+                                         'since the cluster name is None')
                     job_submitted_at = await (
                         self._wait_until_job_starts_on_cluster())
                     if job_submitted_at is not None:
@@ -589,15 +666,35 @@ class StrategyExecutor:
             # NoClusterLaunchedError.
             assert False, 'Unreachable'

-    def should_restart_on_failure(self
+    def should_restart_on_failure(self,
+                                  exit_codes: Optional[List[int]] = None
+                                  ) -> bool:
         """Increments counter & checks if job should be restarted on a failure.

+        Args:
+            exit_codes: List of exit codes from the failed job. If any exit code
+                matches recover_on_exit_codes, recovery will be triggered
+                regardless of max_restarts_on_errors limit.
+
         Returns:
             True if the job should be restarted, otherwise False.
         """
+        # Check if any exit code matches the configured recover_on_exit_codes
+        # This triggers recovery without incrementing the counter
+        if exit_codes and self.recover_on_exit_codes:
+            for exit_code in exit_codes:
+                if exit_code in self.recover_on_exit_codes:
+                    logger.info(f'Exit code {exit_code} matched '
+                                'recover_on_exit_codes, triggering recovery')
+                    return True
+
+        # Otherwise, check the max_restarts_on_errors counter
         self.restart_cnt_on_failure += 1
        if self.restart_cnt_on_failure > self.max_restarts_on_errors:
             return False
+        logger.info(f'Restart count {self.restart_cnt_on_failure} '
+                    'is less than max_restarts_on_errors, '
+                    'restarting job')
         return True


@@ -620,10 +717,11 @@ class FailoverStrategyExecutor(StrategyExecutor):
         starting: Set[int],
         starting_lock: asyncio.Lock,
         starting_signal: asyncio.Condition,
+        recover_on_exit_codes: Optional[List[int]] = None,
     ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
                          job_id, task_id, pool, starting, starting_lock,
-                         starting_signal)
+                         starting_signal, recover_on_exit_codes)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is

sky/jobs/server/core.py
CHANGED

@@ -25,6 +25,7 @@ from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.catalog import common as service_catalog_common
+from sky.data import data_utils
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state as managed_job_state
@@ -93,6 +94,51 @@ _MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
 ]


+def _warn_file_mounts_rolling_update(dag: 'sky.Dag') -> None:
+    """Warn if local file mounts or workdir may be lost during rolling update.
+
+    When rolling update is enabled with consolidation mode but no jobs bucket
+    is configured, local file mounts and workdirs are stored locally on the API
+    server pod and will be lost during a rolling update.
+    """
+    # If rolling update is not enabled, don't warn.
+    if os.environ.get(skylet_constants.SKYPILOT_ROLLING_UPDATE_ENABLED) is None:
+        return
+
+    # If consolidation mode is not enabled, don't warn.
+    if not managed_job_utils.is_consolidation_mode():
+        return
+
+    # If a jobs bucket is configured, don't warn.
+    if skypilot_config.get_nested(('jobs', 'bucket'), None) is not None:
+        return
+
+    # Check if any task has local file_mounts (not cloud store URLs) or workdir
+    has_local_file_mounts = False
+    has_local_workdir = False
+    for task_ in dag.tasks:
+        if task_.file_mounts:
+            for src in task_.file_mounts.values():
+                if not data_utils.is_cloud_store_url(src):
+                    has_local_file_mounts = True
+                    break
+        if task_.workdir and isinstance(task_.workdir, str):
+            has_local_workdir = True
+            break
+        if has_local_file_mounts:
+            break
+
+    if not has_local_file_mounts and not has_local_workdir:
+        return
+
+    logger.warning(
+        f'{colorama.Fore.YELLOW}WARNING: Local file mounts or workdir detected '
+        'with rolling update enabled for API server. To persist files'
+        ' across API server restarts/update, use buckets, volumes, or git '
+        'for your file mounts; or, configure a bucket in your SkyPilot config '
+        f'under `jobs.bucket`. {colorama.Style.RESET_ALL}')
+
+
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.

@@ -103,14 +149,21 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """
     local_to_controller_file_mounts: Dict[str, str] = {}

-    #
-    #
+    # Check if user has explicitly configured a bucket for jobs.
+    # If so, we should use cloud storage even in consolidation mode to persist
+    # files across rolling updates and pod restarts.
+    has_explicit_bucket = skypilot_config.get_nested(('jobs', 'bucket'),
+                                                     None) is not None
     storage_clouds = (
         storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
     force_disable_cloud_bucket = skypilot_config.get_nested(
         ('jobs', 'force_disable_cloud_bucket'), False)
-
-
+    # Use cloud storage if:
+    # 1. Not in consolidation mode, OR
+    # 2. In consolidation mode BUT user has explicit bucket configured
+    # AND storage clouds are available AND cloud bucket is not force-disabled
+    if ((not managed_job_utils.is_consolidation_mode() or has_explicit_bucket)
+            and storage_clouds and not force_disable_cloud_bucket):
         for task_ in dag.tasks:
             controller_utils.maybe_translate_local_file_mounts_and_sync_up(
                 task_, task_type='jobs')
@@ -346,6 +399,9 @@ def launch(
             f'with:\n\n`sky down {cluster_name} --purge`\n\n'
             f'Reason: {common_utils.format_exception(e)}')

+    # Warn if file mounts may be lost during rolling update
+    _warn_file_mounts_rolling_update(dag)
+
     local_to_controller_file_mounts = _upload_files_to_controller(dag)
     controller = controller_utils.Controllers.JOBS_CONTROLLER
     controller_name = controller.value.cluster_name
@@ -1216,3 +1272,24 @@ def pool_sync_down_logs(
         replica_ids=worker_ids,
         tail=tail,
         pool=True)
+
+
+@usage_lib.entrypoint
+def get_job_events(
+    job_id: int,
+    task_id: Optional[int] = None,
+    limit: Optional[int] = 10,
+) -> List[Dict[str, Any]]:
+    """Get task events for a managed job.
+
+    Args:
+        job_id: The job ID to get task events for.
+        task_id: Optional task ID to filter by.
+        limit: Optional limit on number of task events to return (default 10).
+
+    Returns:
+        List of task event records.
+    """
+    return managed_job_state.get_job_events(job_id=job_id,
+                                            task_id=task_id,
+                                            limit=limit)

sky/jobs/server/server.py
CHANGED

@@ -242,3 +242,17 @@ async def pool_download_logs(
         schedule_type=api_requests.ScheduleType.SHORT,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
+
+
+@router.post('/events')
+async def events(request: fastapi.Request,
+                 body: payloads.GetJobEventsBody) -> None:
+    """Gets task events for a managed job."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.JOBS_EVENTS,
+        request_body=body,
+        func=core.get_job_events,
+        schedule_type=api_requests.ScheduleType.SHORT,
+        request_cluster_name=common.JOB_CONTROLLER_NAME,
+    )