skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -67
- sky/check.py +31 -1
- sky/cli.py +11 -34
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/core.py +8 -5
- sky/data/storage.py +66 -14
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +263 -21
- sky/jobs/utils.py +338 -96
- sky/provision/aws/config.py +48 -26
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +76 -18
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +13 -0
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
@@ -16,6 +16,7 @@ from sky import status_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.jobs import recovery_strategy
+from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
@@ -46,12 +47,10 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""
 
-    def __init__(self, job_id: int, dag_yaml: str,
-                 retry_until_up: bool) -> None:
+    def __init__(self, job_id: int, dag_yaml: str) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
-        self._retry_until_up = retry_until_up
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
 
@@ -174,7 +173,7 @@ class JobsController:
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, self._job_id)
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._retry_until_up)
+            cluster_name, self._backend, task, self._job_id)
         managed_job_state.set_submitted(
             self._job_id,
             task_id,
@@ -202,6 +201,7 @@ class JobsController:
             task_id=task_id,
             start_time=remote_job_submitted_at,
             callback_func=callback_func)
+
         while True:
             time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
 
@@ -243,7 +243,7 @@ class JobsController:
                 self._download_log_and_stream(task_id, handle)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-                recovery_strategy.terminate_cluster(cluster_name=cluster_name)
+                managed_job_utils.terminate_cluster(cluster_name=cluster_name)
                 return True
 
             # For single-node jobs, non-terminated job_status indicates a
@@ -256,9 +256,7 @@ class JobsController:
                     task.num_nodes == 1):
                 continue
 
-            if job_status in [
-                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-            ]:
+            if job_status in job_lib.JobStatus.user_code_failure_states():
                 # Add a grace period before the check of preemption to avoid
                 # false alarm for job failure.
                 time.sleep(5)
@@ -288,9 +286,7 @@ class JobsController:
                 if job_status is not None and not job_status.is_terminal():
                     # The multi-node job is still running, continue monitoring.
                     continue
-                elif job_status in [
-                        job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-                ]:
+                elif job_status in job_lib.JobStatus.user_code_failure_states():
                     # The user code has probably crashed, fail immediately.
                     end_time = managed_job_utils.get_job_timestamp(
                         self._backend, cluster_name, get_end_time=True)
@@ -346,7 +342,7 @@ class JobsController:
            # those clusters again may fail.
            logger.info('Cleaning up the preempted or failed cluster'
                        '...')
-           recovery_strategy.terminate_cluster(cluster_name)
+           managed_job_utils.terminate_cluster(cluster_name)
 
            # Try to recover the managed jobs, when the cluster is preempted or
            # failed or the job status is failed to be fetched.
@@ -428,11 +424,11 @@ class JobsController:
                 task=self._dag.tasks[task_id]))
 
 
-def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool):
+def _run_controller(job_id: int, dag_yaml: str):
    """Runs the controller in a remote process for interruption."""
    # The controller needs to be instantiated in the remote process, since
    # the controller is not serializable.
-   jobs_controller = JobsController(job_id, dag_yaml, retry_until_up)
+   jobs_controller = JobsController(job_id, dag_yaml)
    jobs_controller.run()
 
 
@@ -482,17 +478,18 @@ def _cleanup(job_id: int, dag_yaml: str):
        assert task.name is not None, task
        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
            task.name, job_id)
-       recovery_strategy.terminate_cluster(cluster_name)
+       managed_job_utils.terminate_cluster(cluster_name)
        # Clean up Storages with persistent=False.
        # TODO(zhwu): this assumes the specific backend.
        backend = cloud_vm_ray_backend.CloudVmRayBackend()
        backend.teardown_ephemeral_storage(task)
 
 
-def start(job_id, dag_yaml, retry_until_up):
+def start(job_id, dag_yaml):
    """Start the controller."""
    controller_process = None
    cancelling = False
+   task_id = None
    try:
        _handle_signal(job_id)
        # TODO(suquark): In theory, we should make controller process a
@@ -502,8 +499,7 @@ def start(job_id, dag_yaml, retry_until_up):
        # So we can only enable daemon after we no longer need to
        # start daemon processes like Ray.
        controller_process = multiprocessing.Process(target=_run_controller,
-                                                    args=(job_id, dag_yaml,
-                                                          retry_until_up))
+                                                    args=(job_id, dag_yaml))
        controller_process.start()
        while controller_process.is_alive():
            _handle_signal(job_id)
@@ -511,6 +507,7 @@ def start(job_id, dag_yaml, retry_until_up):
    except exceptions.ManagedJobUserCancelledError:
        dag, _ = _get_dag_and_name(dag_yaml)
        task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
+       assert task_id is not None, job_id
        logger.info(
            f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
        managed_job_state.set_cancelling(
@@ -542,6 +539,7 @@ def start(job_id, dag_yaml, retry_until_up):
        logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
 
        if cancelling:
+           assert task_id is not None, job_id  # Since it's set with cancelling
            managed_job_state.set_cancelled(
                job_id=job_id,
                callback_func=managed_job_utils.event_callback_func(
@@ -563,6 +561,8 @@ def start(job_id, dag_yaml, retry_until_up):
                failure_reason=('Unexpected error occurred. For details, '
                                f'run: sky jobs logs --controller {job_id}'))
 
+   scheduler.job_done(job_id)
+
 
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
@@ -570,9 +570,6 @@ if __name__ == '__main__':
                        required=True,
                        type=int,
                        help='Job id for the controller job.')
-   parser.add_argument('--retry-until-up',
-                       action='store_true',
-                       help='Retry until the cluster is up.')
    parser.add_argument('dag_yaml',
                        type=str,
                        help='The path to the user job yaml file.')
@@ -580,4 +577,4 @@
    # We start process with 'spawn', because 'fork' could result in weird
    # behaviors; 'spawn' is also cross-platform.
    multiprocessing.set_start_method('spawn', force=True)
-   start(args.job_id, args.dag_yaml, args.retry_until_up)
+   start(args.job_id, args.dag_yaml)
sky/jobs/core.py
CHANGED
@@ -41,7 +41,6 @@ def launch(
     name: Optional[str] = None,
     stream_logs: bool = True,
     detach_run: bool = False,
-    retry_until_up: bool = False,
     # TODO(cooperc): remove fast arg before 0.8.0
     fast: bool = True,  # pylint: disable=unused-argument for compatibility
 ) -> None:
@@ -115,7 +114,6 @@ def launch(
             'jobs_controller': controller_name,
             # Note: actual cluster name will be <task.name>-<managed job ID>
             'dag_name': dag.name,
-            'retry_until_up': retry_until_up,
             'remote_user_config_path': remote_user_config_path,
             'modified_catalogs':
                 service_catalog_common.get_modified_catalog_file_mounts(),
sky/jobs/recovery_strategy.py
CHANGED
@@ -17,6 +17,7 @@ from sky import global_user_state
 from sky import sky_logging
 from sky import status_lib
 from sky.backends import backend_utils
+from sky.jobs import scheduler
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -42,45 +43,20 @@ MAX_JOB_CHECKING_RETRY = 10
 _AUTODOWN_MINUTES = 5
 
 
-def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
-    """Terminate the cluster."""
-    retry_cnt = 0
-    while True:
-        try:
-            usage_lib.messages.usage.set_internal()
-            sky.down(cluster_name)
-            return
-        except exceptions.ClusterDoesNotExist:
-            # The cluster is already down.
-            logger.debug(f'The cluster {cluster_name} is already down.')
-            return
-        except Exception as e:  # pylint: disable=broad-except
-            retry_cnt += 1
-            if retry_cnt >= max_retry:
-                raise RuntimeError(
-                    f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
-                f'Failed to terminate the cluster {cluster_name}. Retrying.'
-                f'Details: {common_utils.format_exception(e)}')
-            with ux_utils.enable_traceback():
-                logger.error(f'  Traceback: {traceback.format_exc()}')
-
-
 class StrategyExecutor:
     """Handle the launching, recovery and termination of managed job clusters"""
 
     RETRY_INIT_GAP_SECONDS = 60
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task', retry_until_up: bool,
-                 max_restarts_on_errors: int) -> None:
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
             cluster_name: The name of the cluster.
             backend: The backend to use. Only CloudVMRayBackend is supported.
             task: The task to execute.
-            retry_until_up: Whether to retry until the cluster is up.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
@@ -88,8 +64,8 @@ class StrategyExecutor:
         self.dag.add(task)
         self.cluster_name = cluster_name
         self.backend = backend
-        self.retry_until_up = retry_until_up
         self.max_restarts_on_errors = max_restarts_on_errors
+        self.job_id = job_id
         self.restart_cnt_on_failure = 0
 
     def __init_subclass__(cls, name: str, default: bool = False):
@@ -102,7 +78,7 @@ class StrategyExecutor:
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', retry_until_up: bool) -> 'StrategyExecutor':
+             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -127,8 +103,9 @@ class StrategyExecutor:
             job_recovery_name = job_recovery
             max_restarts_on_errors = 0
         return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
-                                                      task, retry_until_up,
-                                                      max_restarts_on_errors)
+                                                      task,
+                                                      max_restarts_on_errors,
+                                                      job_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -142,10 +119,7 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-        if self.retry_until_up:
-            job_submit_at = self._launch(max_retry=None)
-        else:
-            job_submit_at = self._launch()
+        job_submit_at = self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
@@ -195,7 +169,7 @@ class StrategyExecutor:
                 f'{common_utils.format_exception(e)}\n'
                 'Terminating the cluster explicitly to ensure no '
                 'remaining job process interferes with recovery.')
-            terminate_cluster(self.cluster_name)
+            managed_job_utils.terminate_cluster(self.cluster_name)
 
     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -304,89 +278,96 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-            try:
-                usage_lib.messages.usage.set_internal()
-                # Detach setup, so that the setup failure can be detected
-                # by the controller process (job_status -> FAILED_SETUP).
-                sky.launch(
-                    self.dag,
-                    cluster_name=self.cluster_name,
-                    # We expect to tear down the cluster as soon as the job
-                    # is finished. However, in case the controller dies, set
-                    # autodown to try and avoid a resource leak.
-                    idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                    down=True,
-                    detach_setup=True,
-                    detach_run=True,
-                    _is_launched_by_jobs_controller=True)
-                logger.info('Managed job cluster launched.')
-            except (exceptions.InvalidClusterNameError,
-                    exceptions.NoCloudAccessError,
-                    exceptions.ResourcesMismatchError) as e:
-                logger.error('Failure happened before provisioning. '
-                             f'{common_utils.format_exception(e)}')
-                if raise_on_failure:
-                    raise exceptions.ProvisionPrechecksError(reasons=[e])
-                return None
-            except exceptions.ResourcesUnavailableError as e:
-                # This is raised when the launch fails due to prechecks or
-                # after failing over through all the candidates.
-                # Please refer to the docstring of `sky.launch` for more
-                # details of how the exception will be structured.
-                if not any(
-                        isinstance(err, exceptions.ResourcesUnavailableError)
-                        for err in e.failover_history):
-                    # _launch() (this function) should fail/exit directly, if
-                    # none of the failover reasons were because of resource
-                    # unavailability or no failover was attempted (the optimizer
-                    # cannot find feasible resources for requested resources),
-                    # i.e., e.failover_history is empty.
-                    # Failing directly avoids the infinite loop of retrying
-                    # the launch when, e.g., an invalid cluster name is used
-                    # and --retry-until-up is specified.
-                    reasons = (e.failover_history
-                               if e.failover_history else [e])
-                    reasons_str = '; '.join(
-                        common_utils.format_exception(err) for err in reasons)
-                    logger.error(
-                        'Failure happened before provisioning. Failover '
-                        f'reasons: {reasons_str}')
+            with scheduler.scheduled_launch(self.job_id):
+                try:
+                    usage_lib.messages.usage.set_internal()
+                    # Detach setup, so that the setup failure can be detected
+                    # by the controller process (job_status -> FAILED_SETUP).
+                    sky.launch(
+                        self.dag,
+                        cluster_name=self.cluster_name,
+                        # We expect to tear down the cluster as soon as the job
+                        # is finished. However, in case the controller dies, set
+                        # autodown to try and avoid a resource leak.
+                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                        down=True,
+                        detach_setup=True,
+                        detach_run=True,
+                        _is_launched_by_jobs_controller=True)
+                    logger.info('Managed job cluster launched.')
+                except (exceptions.InvalidClusterNameError,
+                        exceptions.NoCloudAccessError,
+                        exceptions.ResourcesMismatchError) as e:
+                    logger.error('Failure happened before provisioning. '
                                 f'{common_utils.format_exception(e)}')
                    if raise_on_failure:
-                        raise exceptions.ProvisionPrechecksError(reasons)
-                    return None
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f'  Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may be
-                # UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors or '
-                    'the cluster being preempted during job submission.')
-
-            terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch clusters '
-                            f'after {max_retry} retries.')
-                else:
+                        raise exceptions.ProvisionPrechecksError(reasons=[e])
                    return None
+                except exceptions.ResourcesUnavailableError as e:
+                    # This is raised when the launch fails due to prechecks or
+                    # after failing over through all the candidates.
+                    # Please refer to the docstring of `sky.launch` for more
+                    # details of how the exception will be structured.
+                    if not any(
+                            isinstance(err,
+                                       exceptions.ResourcesUnavailableError)
+                            for err in e.failover_history):
+                        # _launch() (this function) should fail/exit directly,
+                        # if none of the failover reasons were because of
+                        # resource unavailability or no failover was attempted
+                        # (the optimizer cannot find feasible resources for
+                        # requested resources), i.e., e.failover_history is
+                        # empty. Failing directly avoids the infinite loop of
+                        # retrying the launch when, e.g., an invalid cluster
+                        # name is used and --retry-until-up is specified.
+                        reasons = (e.failover_history
+                                   if e.failover_history else [e])
+                        reasons_str = '; '.join(
+                            common_utils.format_exception(err)
+                            for err in reasons)
+                        logger.error(
+                            'Failure happened before provisioning. Failover '
+                            f'reasons: {reasons_str}')
+                        if raise_on_failure:
+                            raise exceptions.ProvisionPrechecksError(reasons)
+                        return None
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                except Exception as e:  # pylint: disable=broad-except
+                    # If the launch fails, it will be recovered by the following
+                    # code.
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                    with ux_utils.enable_traceback():
+                        logger.info(f'  Traceback: {traceback.format_exc()}')
+                else:  # No exception, the launch succeeds.
+                    # At this point, a sky.launch() has succeeded. Cluster may
+                    # be UP (no preemption since) or DOWN (newly preempted).
+                    job_submitted_at = self._wait_until_job_starts_on_cluster()
+                    if job_submitted_at is not None:
+                        return job_submitted_at
+                    # The job fails to start on the cluster, retry the launch.
+                    # TODO(zhwu): log the unexpected error to usage collection
+                    # for future debugging.
+                    logger.info(
+                        'Failed to successfully submit the job to the '
+                        'launched cluster, due to unexpected submission errors '
+                        'or the cluster being preempted during job submission.')
+
+                # If we get here, the launch did not succeed. Tear down the
+                # cluster and retry.
+                managed_job_utils.terminate_cluster(self.cluster_name)
+                if max_retry is not None and retry_cnt >= max_retry:
+                    # Retry forever if max_retry is None.
+                    if raise_on_failure:
+                        with ux_utils.print_exception_no_traceback():
+                            raise exceptions.ManagedJobReachedMaxRetriesError(
+                                'Resources unavailable: failed to launch '
+                                f'clusters after {max_retry} retries.')
+                    else:
+                        return None
+            # Exit the scheduled_launch context so that the scheulde state is
+            # ALIVE during the backoff. This allows other jobs to launch.
             gap_seconds = backoff.current_backoff()
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
@@ -411,10 +392,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task', retry_until_up: bool,
-                 max_restarts_on_errors: int) -> None:
-        super().__init__(cluster_name, backend, task, retry_until_up,
-                         max_restarts_on_errors)
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
+        super().__init__(cluster_name, backend, task, max_restarts_on_errors,
+                         job_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -468,7 +449,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
            # Step 2
            logger.debug('Terminating unhealthy cluster and reset cloud '
                         'region.')
-           terminate_cluster(self.cluster_name)
+           managed_job_utils.terminate_cluster(self.cluster_name)
 
            # Step 3
            logger.debug('Relaunch the cluster without constraining to prior '
@@ -478,16 +459,11 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
                                            raise_on_failure=False)
            if job_submitted_at is None:
                # Failed to launch the cluster.
-               if self.retry_until_up:
-                   gap_seconds = self.RETRY_INIT_GAP_SECONDS
-                   logger.info('Retrying to recover the cluster in '
-                               f'{gap_seconds:.1f} seconds.')
-                   time.sleep(gap_seconds)
-                   continue
-               with ux_utils.print_exception_no_traceback():
-                   raise exceptions.ResourcesUnavailableError(
-                       f'Failed to recover the cluster after retrying '
-                       f'{self._MAX_RETRY_CNT} times.')
+               gap_seconds = self.RETRY_INIT_GAP_SECONDS
+               logger.info('Retrying to recover the cluster in '
+                           f'{gap_seconds:.1f} seconds.')
+               time.sleep(gap_seconds)
+               continue
 
        return job_submitted_at
 
@@ -531,7 +507,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
 
        # Step 1
        logger.debug('Terminating unhealthy cluster and reset cloud region.')
-       terminate_cluster(self.cluster_name)
+       managed_job_utils.terminate_cluster(self.cluster_name)
 
        # Step 2
        logger.debug('Relaunch the cluster skipping the previously launched '
@@ -566,15 +542,10 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
                                            raise_on_failure=False)
            if job_submitted_at is None:
                # Failed to launch the cluster.
-               if self.retry_until_up:
-                   gap_seconds = self.RETRY_INIT_GAP_SECONDS
-                   logger.info('Retrying to recover the cluster in '
-                               f'{gap_seconds:.1f} seconds.')
-                   time.sleep(gap_seconds)
-                   continue
-               with ux_utils.print_exception_no_traceback():
-                   raise exceptions.ResourcesUnavailableError(
-                       f'Failed to recover the cluster after retrying '
-                       f'{self._MAX_RETRY_CNT} times.')
+               gap_seconds = self.RETRY_INIT_GAP_SECONDS
+               logger.info('Retrying to recover the cluster in '
+                           f'{gap_seconds:.1f} seconds.')
+               time.sleep(gap_seconds)
+               continue
 
        return job_submitted_at