skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -13,50 +13,33 @@ from typing import Optional
 import sky
 from sky import backends
 from sky import exceptions
+from sky import execution
 from sky import global_user_state
 from sky import sky_logging
-from sky import status_lib
 from sky.backends import backend_utils
+from sky.jobs import scheduler
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import registry
+from sky.utils import status_lib
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    from sky import resources
     from sky import task as task_lib
 
 logger = sky_logging.init_logger(__name__)
 
-RECOVERY_STRATEGIES = {}
-DEFAULT_RECOVERY_STRATEGY = None
-
 # Waiting time for job from INIT/PENDING to RUNNING
 # 10 * JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 10 * 5 = 50 seconds
 MAX_JOB_CHECKING_RETRY = 10
 
-
-
-
-
-    while True:
-        try:
-            usage_lib.messages.usage.set_internal()
-            sky.down(cluster_name)
-            return
-        except ValueError:
-            # The cluster is already down.
-            return
-        except Exception as e:  # pylint: disable=broad-except
-            retry_cnt += 1
-            if retry_cnt >= max_retry:
-                raise RuntimeError(
-                    f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
-                f'Failed to terminate the cluster {cluster_name}. Retrying.'
-                f'Details: {common_utils.format_exception(e)}')
-            with ux_utils.enable_traceback():
-                logger.error(f'  Traceback: {traceback.format_exc()}')
+# Minutes to job cluster autodown. This should be significantly larger than
+# managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
+# cluster before its status can be updated by the job controller.
+_AUTODOWN_MINUTES = 5
 
 
 class StrategyExecutor:
@@ -65,14 +48,14 @@ class StrategyExecutor:
     RETRY_INIT_GAP_SECONDS = 60
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task',
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
            cluster_name: The name of the cluster.
            backend: The backend to use. Only CloudVMRayBackend is supported.
            task: The task to execute.
-           retry_until_up: Whether to retry until the cluster is up.
        """
        assert isinstance(backend, backends.CloudVmRayBackend), (
            'Only CloudVMRayBackend is supported.')
@@ -80,19 +63,13 @@ class StrategyExecutor:
         self.dag.add(task)
         self.cluster_name = cluster_name
         self.backend = backend
-        self.
-
-
-        RECOVERY_STRATEGIES[name] = cls
-        if default:
-            global DEFAULT_RECOVERY_STRATEGY
-            assert DEFAULT_RECOVERY_STRATEGY is None, (
-                'Only one strategy can be default.')
-            DEFAULT_RECOVERY_STRATEGY = name
+        self.max_restarts_on_errors = max_restarts_on_errors
+        self.job_id = job_id
+        self.restart_cnt_on_failure = 0
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task',
+             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -108,8 +85,19 @@ class StrategyExecutor:
         # set the new_task_resources to be the same type (list or set) as the
         # original task.resources
         task.set_resources(type(task.resources)(new_resources_list))
-
-
+        if isinstance(job_recovery, dict):
+            job_recovery_name = job_recovery.pop(
+                'strategy', registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default)
+            max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
+                                                      0)
+        else:
+            job_recovery_name = job_recovery
+            max_restarts_on_errors = 0
+        job_recovery_strategy = (registry.JOBS_RECOVERY_STRATEGY_REGISTRY.
+                                 from_str(job_recovery_name))
+        assert job_recovery_strategy is not None, job_recovery_name
+        return job_recovery_strategy(cluster_name, backend, task,
+                                     max_restarts_on_errors, job_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -123,10 +111,7 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-
-            job_submit_at = self._launch(max_retry=None)
-        else:
-            job_submit_at = self._launch()
+        job_submit_at = self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
@@ -141,6 +126,8 @@
         raise NotImplementedError
 
     def _try_cancel_all_jobs(self):
+        from sky import core  # pylint: disable=import-outside-toplevel
+
         handle = global_user_state.get_handle_from_cluster_name(
             self.cluster_name)
         if handle is None:
@@ -166,9 +153,9 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
-
-
-
+            core.cancel(cluster_name=self.cluster_name,
+                        all=True,
+                        _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
                         'might be already down or the head node is preempted.'
@@ -176,7 +163,7 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            terminate_cluster(self.cluster_name)
+            managed_job_utils.terminate_cluster(self.cluster_name)
 
     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -270,8 +257,8 @@ class StrategyExecutor:
               1. The optimizer cannot find a feasible solution.
               2. Precheck errors: invalid cluster name, failure in getting
                 cloud user identity, or unsupported feature.
-            exceptions.
-                all prechecks passed but the maximum number of retries is
+            exceptions.ManagedJobReachedMaxRetriesError: This will be raised
+                when all prechecks passed but the maximum number of retries is
                 reached for `sky.launch`. The failure of `sky.launch` can be
                 due to:
               1. Any of the underlying failover exceptions is due to resources
@@ -285,104 +272,128 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                # after failing over through all the candidates.
-                # Please refer to the docstring of `sky.launch` for more
-                # details of how the exception will be structured.
-                if not any(
-                        isinstance(err, exceptions.ResourcesUnavailableError)
-                        for err in e.failover_history):
-                    # _launch() (this function) should fail/exit directly, if
-                    # none of the failover reasons were because of resource
-                    # unavailability or no failover was attempted (the optimizer
-                    # cannot find feasible resources for requested resources),
-                    # i.e., e.failover_history is empty.
-                    # Failing directly avoids the infinite loop of retrying
-                    # the launch when, e.g., an invalid cluster name is used
-                    # and --retry-until-up is specified.
-                    reasons = (e.failover_history
-                               if e.failover_history else [e])
-                    reasons_str = '; '.join(
-                        common_utils.format_exception(err) for err in reasons)
-                    logger.error(
-                        'Failure happened before provisioning. Failover '
-                        f'reasons: {reasons_str}')
+            with scheduler.scheduled_launch(self.job_id):
+                try:
+                    usage_lib.messages.usage.set_internal()
+                    # Detach setup, so that the setup failure can be detected
+                    # by the controller process (job_status -> FAILED_SETUP).
+                    execution.launch(
+                        self.dag,
+                        cluster_name=self.cluster_name,
+                        # We expect to tear down the cluster as soon as the job
+                        # is finished. However, in case the controller dies, set
+                        # autodown to try and avoid a resource leak.
+                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                        down=True,
+                        _is_launched_by_jobs_controller=True)
+                    logger.info('Managed job cluster launched.')
+                except (exceptions.InvalidClusterNameError,
+                        exceptions.NoCloudAccessError,
+                        exceptions.ResourcesMismatchError) as e:
+                    logger.error('Failure happened before provisioning. '
+                                 f'{common_utils.format_exception(e)}')
                     if raise_on_failure:
-                        raise exceptions.ProvisionPrechecksError(
-                            reasons=reasons)
-                    return None
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f'  Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may be
-                # UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors or '
-                    'the cluster being preempted during job submission.')
-
-            terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch clusters '
-                            f'after {max_retry} retries.')
-                else:
+                        raise exceptions.ProvisionPrechecksError(reasons=[e])
                     return None
+                except exceptions.ResourcesUnavailableError as e:
+                    # This is raised when the launch fails due to prechecks or
+                    # after failing over through all the candidates.
+                    # Please refer to the docstring of `sky.launch` for more
+                    # details of how the exception will be structured.
+                    if not any(
+                            isinstance(err,
+                                       exceptions.ResourcesUnavailableError)
+                            for err in e.failover_history):
+                        # _launch() (this function) should fail/exit directly,
+                        # if none of the failover reasons were because of
+                        # resource unavailability or no failover was attempted
+                        # (the optimizer cannot find feasible resources for
+                        # requested resources), i.e., e.failover_history is
+                        # empty. Failing directly avoids the infinite loop of
+                        # retrying the launch when, e.g., an invalid cluster
+                        # name is used and --retry-until-up is specified.
+                        reasons = (e.failover_history
+                                   if e.failover_history else [e])
+                        reasons_str = '; '.join(
+                            common_utils.format_exception(err)
+                            for err in reasons)
+                        logger.error(
+                            'Failure happened before provisioning. Failover '
+                            f'reasons: {reasons_str}')
+                        if raise_on_failure:
+                            raise exceptions.ProvisionPrechecksError(reasons)
+                        return None
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                except Exception as e:  # pylint: disable=broad-except
+                    # If the launch fails, it will be recovered by the following
+                    # code.
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                    with ux_utils.enable_traceback():
+                        logger.info(f'  Traceback: {traceback.format_exc()}')
+                else:  # No exception, the launch succeeds.
+                    # At this point, a sky.launch() has succeeded. Cluster may
+                    # be UP (no preemption since) or DOWN (newly preempted).
+                    job_submitted_at = self._wait_until_job_starts_on_cluster()
+                    if job_submitted_at is not None:
+                        return job_submitted_at
+                    # The job fails to start on the cluster, retry the launch.
+                    # TODO(zhwu): log the unexpected error to usage collection
+                    # for future debugging.
+                    logger.info(
+                        'Failed to successfully submit the job to the '
+                        'launched cluster, due to unexpected submission errors '
+                        'or the cluster being preempted during job submission.')
+
+                # If we get here, the launch did not succeed. Tear down the
+                # cluster and retry.
+                managed_job_utils.terminate_cluster(self.cluster_name)
+                if max_retry is not None and retry_cnt >= max_retry:
+                    # Retry forever if max_retry is None.
+                    if raise_on_failure:
+                        with ux_utils.print_exception_no_traceback():
+                            raise exceptions.ManagedJobReachedMaxRetriesError(
+                                'Resources unavailable: failed to launch '
+                                f'clusters after {max_retry} retries.')
+                    else:
+                        return None
+            # Exit the scheduled_launch context so that the scheulde state is
+            # ALIVE during the backoff. This allows other jobs to launch.
             gap_seconds = backoff.current_backoff()
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
             time.sleep(gap_seconds)
 
+    def should_restart_on_failure(self) -> bool:
+        """Increments counter & checks if job should be restarted on a failure.
 
-
-
+        Returns:
+            True if the job should be restarted, otherwise False.
+        """
+        self.restart_cnt_on_failure += 1
+        if self.restart_cnt_on_failure > self.max_restarts_on_errors:
+            return False
+        return True
+
+
+@registry.JOBS_RECOVERY_STRATEGY_REGISTRY.type_register(name='FAILOVER',
+                                                        default=False)
+class FailoverStrategyExecutor(StrategyExecutor):
     """Failover strategy: wait in same region and failover after timeout."""
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task',
-
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
+        super().__init__(cluster_name, backend, task, max_restarts_on_errors,
+                         job_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
         # preempted.)
-        self._launched_resources: Optional['
+        self._launched_resources: Optional['resources.Resources'] = None
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
@@ -431,7 +442,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
         # Step 2
         logger.debug('Terminating unhealthy cluster and reset cloud '
                      'region.')
-        terminate_cluster(self.cluster_name)
+        managed_job_utils.terminate_cluster(self.cluster_name)
 
         # Step 3
         logger.debug('Relaunch the cluster without constraining to prior '
@@ -441,23 +452,18 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
                                             raise_on_failure=False)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
-
-
-
-
-
-                    continue
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesUnavailableError(
-                        f'Failed to recover the cluster after retrying '
-                        f'{self._MAX_RETRY_CNT} times.')
+                gap_seconds = self.RETRY_INIT_GAP_SECONDS
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
 
             return job_submitted_at
 
 
-
-
-
+@registry.JOBS_RECOVERY_STRATEGY_REGISTRY.type_register(
+    name='EAGER_NEXT_REGION', default=True)
+class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
     """Eager failover strategy.
 
     This strategy is an extension of the FAILOVER strategy. Instead of waiting
@@ -494,7 +500,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        terminate_cluster(self.cluster_name)
+        managed_job_utils.terminate_cluster(self.cluster_name)
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
@@ -529,15 +535,10 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
                                             raise_on_failure=False)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
-
-
-
-
-
-                    continue
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.ResourcesUnavailableError(
-                        f'Failed to recover the cluster after retrying '
-                        f'{self._MAX_RETRY_CNT} times.')
+                gap_seconds = self.RETRY_INIT_GAP_SECONDS
+                logger.info('Retrying to recover the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
 
             return job_submitted_at