skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
@@ -30,6 +30,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -60,12 +61,13 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""

-    def __init__(self, job_id: int, dag_yaml: str) -> None:
+    def __init__(self, job_id: int, dag_yaml: str, pool: Optional[str]) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
+        self._pool = pool

         # pylint: disable=line-too-long
         # Add a unique identifier to the task environment variables, so that
@@ -99,8 +101,10 @@ class JobsController:
         task.update_envs(task_envs)

     def _download_log_and_stream(
-        self,
-
+        self,
+        task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle],
+        job_id_on_pool_cluster: Optional[int],
     ) -> None:
         """Downloads and streams the logs of the current job with given task ID.

@@ -113,9 +117,14 @@ class JobsController:
                 'Skipping downloading and streaming the logs.')
             return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                            'managed_jobs'
-
-
+                                            'managed_jobs',
+                                            f'job-id-{self._job_id}')
+        log_file = controller_utils.download_and_stream_job_log(
+            self._backend,
+            handle,
+            managed_job_logs_dir,
+            job_ids=[str(job_id_on_pool_cluster)]
+            if job_id_on_pool_cluster is not None else None)
         if log_file is not None:
             # Set the path of the log file for the current task, so it can be
             # accessed even after the job is finished
@@ -123,6 +132,12 @@ class JobsController:
                 log_file)
             logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

+    def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
+        if cluster_name is None:
+            return
+        if self._pool is None:
+            managed_job_utils.terminate_cluster(cluster_name)
+
     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.

@@ -193,10 +208,14 @@ class JobsController:
         usage_lib.messages.usage.update_task_id(task_id)
         task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
         assert task.name is not None, task
+        # Set the cluster name to None if the job is submitted
+        # to a pool. This will be updated when we later calls the `launch`
+        # or `recover` function from the strategy executor.
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, self._job_id)
+            task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id, task_id
+            cluster_name, self._backend, task, self._job_id, task_id,
+            self._pool)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -226,6 +245,13 @@ class JobsController:
         if not is_resume:
             remote_job_submitted_at = self._strategy_executor.launch()
             assert remote_job_submitted_at is not None, remote_job_submitted_at
+        if self._pool is None:
+            job_id_on_pool_cluster = None
+        else:
+            # Update the cluster name when using cluster pool.
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(self._job_id))
+            assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

         if not is_resume:
             managed_job_state.set_started(job_id=self._job_id,
@@ -279,7 +305,9 @@ class JobsController:
             if not force_transit_to_recovering:
                 try:
                     job_status = managed_job_utils.get_job_status(
-                        self._backend,
+                        self._backend,
+                        cluster_name,
+                        job_id=job_id_on_pool_cluster)
                 except exceptions.FetchClusterInfoError as fetch_e:
                     logger.info(
                         'Failed to fetch the job status. Start recovery.\n'
@@ -288,7 +316,7 @@ class JobsController:

             if job_status == job_lib.JobStatus.SUCCEEDED:
                 success_end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 # The job is done. Set the job to SUCCEEDED first before start
                 # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
@@ -299,6 +327,8 @@ class JobsController:
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
+                    logger.info(f'Downloading logs on cluster {cluster_name} '
+                                f'and job id {job_id_on_pool_cluster}.')
                     clusters = backend_utils.get_clusters(
                         cluster_names=[cluster_name],
                         refresh=common.StatusRefreshMode.NONE,
@@ -307,7 +337,8 @@ class JobsController:
                     assert len(clusters) == 1, (clusters, cluster_name)
                     handle = clusters[0].get('handle')
                     # Best effort to download and stream the logs.
-                    self._download_log_and_stream(task_id, handle
+                    self._download_log_and_stream(task_id, handle,
+                                                  job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
                     logger.warning(
@@ -316,7 +347,7 @@ class JobsController:
                         exc_info=True)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-
+                self._cleanup_cluster(cluster_name)
                 return True

             # For single-node jobs, non-terminated job_status indicates a
@@ -364,13 +395,14 @@ class JobsController:
                     job_status == job_lib.JobStatus.FAILED_DRIVER):
                 # The user code has probably crashed, fail immediately.
                 end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 logger.info(
                     f'The user job failed ({job_status}). Please check the '
                     'logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')

-                self._download_log_and_stream(task_id, handle
+                self._download_log_and_stream(task_id, handle,
+                                              job_id_on_pool_cluster)

                 failure_reason = (
                     'To see the details, run: '
@@ -457,7 +489,7 @@ class JobsController:
             # those clusters again may fail.
             logger.info('Cleaning up the preempted or failed cluster'
                         '...')
-
+            self._cleanup_cluster(cluster_name)

             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
@@ -467,6 +499,10 @@ class JobsController:
                 force_transit_to_recovering=force_transit_to_recovering,
                 callback_func=callback_func)
             recovered_time = self._strategy_executor.recover()
+            if self._pool is not None:
+                cluster_name, job_id_on_pool_cluster = (
+                    managed_job_state.get_pool_submit_info(self._job_id))
+                assert cluster_name is not None
             managed_job_state.set_recovered(self._job_id,
                                             task_id,
                                             recovered_time=recovered_time,
@@ -541,11 +577,11 @@ class JobsController:
                 task=self._dag.tasks[task_id]))


-def _run_controller(job_id: int, dag_yaml: str):
+def _run_controller(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Runs the controller in a remote process for interruption."""
     # The controller needs to be instantiated in the remote process, since
     # the controller is not serializable.
-    jobs_controller = JobsController(job_id, dag_yaml)
+    jobs_controller = JobsController(job_id, dag_yaml, pool)
     jobs_controller.run()


@@ -577,7 +613,7 @@ def _handle_signal(job_id):
                 f'User sent {user_signal.value} signal.')


-def _cleanup(job_id: int, dag_yaml: str):
+def _cleanup(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Clean up the cluster(s) and storages.

     (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -595,9 +631,18 @@ def _cleanup(job_id: int, dag_yaml: str):
     dag, _ = _get_dag_and_name(dag_yaml)
     for task in dag.tasks:
         assert task.name is not None, task
-
-
-
+        if pool is None:
+            cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+                task.name, job_id)
+            managed_job_utils.terminate_cluster(cluster_name)
+        else:
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(job_id))
+            if cluster_name is not None:
+                if job_id_on_pool_cluster is not None:
+                    core.cancel(cluster_name=cluster_name,
+                                job_ids=[job_id_on_pool_cluster],
+                                _try_cancel_if_cluster_is_init=True)

     # Clean up Storages with persistent=False.
     # TODO(zhwu): this assumes the specific backend.
@@ -629,7 +674,7 @@ def _cleanup(job_id: int, dag_yaml: str):
                 f'Failed to clean up file mount {file_mount}: {e}')


-def start(job_id, dag_yaml):
+def start(job_id, dag_yaml, pool):
     """Start the controller."""
     controller_process = None
     cancelling = False
@@ -643,7 +688,8 @@ def start(job_id, dag_yaml):
         # So we can only enable daemon after we no longer need to
         # start daemon processes like Ray.
         controller_process = multiprocessing.Process(target=_run_controller,
-                                                      args=(job_id, dag_yaml
+                                                      args=(job_id, dag_yaml,
+                                                            pool))
         controller_process.start()
         while controller_process.is_alive():
             _handle_signal(job_id)
@@ -679,7 +725,7 @@ def start(job_id, dag_yaml):
        # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
        # But anyway, a clean solution is killing the controller process
        # directly, and then cleanup the cluster job_state.
-        _cleanup(job_id, dag_yaml=dag_yaml)
+        _cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
         logger.info(f'Cluster of managed job {job_id} has been cleaned up.')

     if cancelling:
@@ -717,8 +763,13 @@ if __name__ == '__main__':
     parser.add_argument('dag_yaml',
                         type=str,
                         help='The path to the user job yaml file.')
+    parser.add_argument('--pool',
+                        required=False,
+                        default=None,
+                        type=str,
+                        help='The pool to use for the controller job.')
     args = parser.parse_args()
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    start(args.job_id, args.dag_yaml)
+    start(args.job_id, args.dag_yaml, args.pool)
sky/jobs/recovery_strategy.py
CHANGED
@@ -20,6 +20,7 @@ from sky.backends import backend_utils
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
@@ -48,9 +49,9 @@ class StrategyExecutor:

     RETRY_INIT_GAP_SECONDS = 60

-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         """Initialize the strategy executor.

         Args:
@@ -62,17 +63,23 @@ class StrategyExecutor:
                 'Only CloudVMRayBackend is supported.')
         self.dag = sky.Dag()
         self.dag.add(task)
+        # For jobs submitted to a pool, the cluster name might change after each
+        # recovery. Initially this is set to an empty string to indicate that no
+        # cluster is assigned yet, and in `_launch`, it will be set to one of
+        # the cluster names in the pool.
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
         self.task_id = task_id
+        self.pool = pool
         self.restart_cnt_on_failure = 0
+        self.job_id_on_pool_cluster: Optional[int] = None

     @classmethod
-    def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int,
-
+    def make(cls, cluster_name: Optional[str], backend: 'backends.Backend',
+             task: 'task_lib.Task', job_id: int, task_id: int,
+             pool: Optional[str]) -> 'StrategyExecutor':
         """Create a strategy from a task."""

         resource_list = list(task.resources)
@@ -103,7 +110,8 @@ class StrategyExecutor:
             from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id, task_id
+                                     max_restarts_on_errors, job_id, task_id,
+                                     pool)

     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -131,12 +139,14 @@ class StrategyExecutor:
         """
         raise NotImplementedError

-    def
+    def _try_cancel_jobs(self):
         from sky import core  # pylint: disable=import-outside-toplevel

+        if self.cluster_name is None:
+            return
         handle = global_user_state.get_handle_from_cluster_name(
             self.cluster_name)
-        if handle is None:
+        if handle is None or self.pool is not None:
             return
         try:
             usage_lib.messages.usage.set_internal()
@@ -159,8 +169,13 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
+            # Only cancel the corresponding job for worker pool.
+            if self.pool is None:
+                kwargs = dict(all=True)
+            else:
+                kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
             core.cancel(cluster_name=self.cluster_name,
-
+                        **kwargs,
                         _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
@@ -169,7 +184,7 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-
+            self._cleanup_cluster()

     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -178,6 +193,7 @@ class StrategyExecutor:
             The timestamp of when the job is submitted, or None if failed to
             submit.
         """
+        assert self.cluster_name is not None
         status = None
         job_checking_retry_cnt = 0
         while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
@@ -208,7 +224,9 @@ class StrategyExecutor:

             try:
                 status = managed_job_utils.get_job_status(
-                    self.backend,
+                    self.backend,
+                    self.cluster_name,
+                    job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -224,7 +242,10 @@ class StrategyExecutor:
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
                     job_submitted_at = managed_job_utils.get_job_timestamp(
-                        self.backend,
+                        self.backend,
+                        self.cluster_name,
+                        self.job_id_on_pool_cluster,
+                        get_end_time=False)
                     return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
@@ -236,6 +257,12 @@ class StrategyExecutor:
             time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None

+    def _cleanup_cluster(self) -> None:
+        if self.cluster_name is None:
+            return
+        if self.pool is None:
+            managed_job_utils.terminate_cluster(self.cluster_name)
+
     def _launch(self,
                 max_retry: Optional[int] = 3,
                 raise_on_failure: bool = True,
@@ -290,19 +317,35 @@ class StrategyExecutor:
                         recovery)
                 try:
                     usage_lib.messages.usage.set_internal()
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if self.pool is None:
+                        assert self.cluster_name is not None
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as
+                            # the job is finished. However, in case the
+                            # controller dies, set autodown to try and avoid
+                            # a resource leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                    else:
+                        self.cluster_name = (
+                            serve_utils.get_next_cluster_name(
+                                self.pool, self.job_id))
+                        if self.cluster_name is None:
+                            raise exceptions.NoClusterLaunchedError(
+                                'No cluster name found in the pool.')
+                        job_id_on_pool_cluster, _ = execution.exec(
+                            self.dag, cluster_name=self.cluster_name)
+                        assert job_id_on_pool_cluster is not None, (
+                            self.cluster_name, self.job_id)
+                        self.job_id_on_pool_cluster = job_id_on_pool_cluster
+                        state.set_job_id_on_pool_cluster(
+                            self.job_id, job_id_on_pool_cluster)
                     logger.info('Managed job cluster launched.')
                 except (exceptions.InvalidClusterNameError,
                         exceptions.NoCloudAccessError,
@@ -373,7 +416,7 @@ class StrategyExecutor:

                 # If we get here, the launch did not succeed. Tear down the
                 # cluster and retry.
-
+                self._cleanup_cluster()
                 if max_retry is not None and retry_cnt >= max_retry:
                     # Retry forever if max_retry is None.
                     if raise_on_failure:
@@ -398,7 +441,10 @@ class StrategyExecutor:
                 # Update the status to PENDING during backoff.
                 state.set_backoff_pending(self.job_id, self.task_id)
                 # Calculate the backoff time and sleep.
-
+                # We retry immediately for worker pool, since no sky.launch()
+                # is called and the overhead is minimal.
+                gap_seconds = (backoff.current_backoff()
+                               if self.pool is None else 0)
                 logger.info('Retrying to launch the cluster in '
                             f'{gap_seconds:.1f} seconds.')
                 time.sleep(gap_seconds)
@@ -427,11 +473,11 @@ class FailoverStrategyExecutor(StrategyExecutor):

     _MAX_RETRY_CNT = 240  # Retry for 4 hours.

-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id)
+                         job_id, task_id, pool)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -444,7 +490,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 recovery: bool = False) -> Optional[float]:
         job_submitted_at = super()._launch(max_retry, raise_on_failure,
                                            recovery)
-        if job_submitted_at is not None:
+        if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
@@ -464,7 +510,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         # original user specification.

         # Step 1
-        self.
+        self._try_cancel_jobs()

         while True:
             # Add region constraint to the task, to retry on the same region
@@ -488,7 +534,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-
+            self._cleanup_cluster()

             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
@@ -547,7 +593,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):

         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-
+        self._cleanup_cluster()

         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '