skypilot-nightly 1.0.0.dev20250116__py3-none-any.whl → 1.0.0.dev20250118__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -29
- sky/cli.py +11 -34
- sky/core.py +8 -5
- sky/data/storage.py +16 -7
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +14 -16
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +257 -17
- sky/jobs/utils.py +287 -64
- sky/provision/kubernetes/instance.py +1 -1
- sky/resources.py +1 -1
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +2 -26
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/utils/resources_utils.py +25 -21
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/RECORD +30 -29
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '11861fd35820ff0db76ecce1dc9a644db4ffb8f7'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250116'
+__version__ = '1.0.0.dev20250118'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -44,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -154,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
 
+_RESOURCES_UNAVAILABLE_LOG = (
+'Reasons for provision failures (for details, please check the log above):')
+
 
 def _is_command_length_over_limit(command: str) -> bool:
 """Check if the length of the command exceeds the limit.
@@ -1996,6 +2001,7 @@ class RetryingVmProvisioner(object):
 skip_unnecessary_provisioning else None)
 
 failover_history: List[Exception] = list()
+resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
 # If the user is using local credentials which may expire, the
 # controller may leak resources if the credentials expire while a job
 # is running. Here we check the enabled clouds and expiring credentials
@@ -2087,6 +2093,8 @@ class RetryingVmProvisioner(object):
 # Add failed resources to the blocklist, only when it
 # is in fallback mode.
 _add_to_blocked_resources(self._blocked_resources, to_provision)
+assert len(failover_history) > 0
+resource_exceptions[to_provision] = failover_history[-1]
 else:
 # If we reach here, it means that the existing cluster must have
 # a previous status of INIT, because other statuses (UP,
@@ -2131,7 +2139,14 @@ class RetryingVmProvisioner(object):
 # possible resources or the requested resources is too
 # restrictive. If we reach here, our failover logic finally
 # ends here.
-
+table = log_utils.create_table(['Resource', 'Reason'])
+for (resource, exception) in resource_exceptions.items():
+table.add_row(
+[resources_utils.format_resource(resource), exception])
+table.max_table_width = shutil.get_terminal_size().columns
+raise exceptions.ResourcesUnavailableError(
+_RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+failover_history=failover_history)
 to_provision = task.best_resources
 assert task in self._dag.tasks, 'Internal logic error.'
 assert to_provision is not None, task
@@ -2894,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 'the `--retry-until-up` flag.')
 with ux_utils.print_exception_no_traceback():
 raise exceptions.ResourcesUnavailableError(
-error_message,
+error_message + '\n' + str(e),
 failover_history=e.failover_history) from None
 if dryrun:
 record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3909,40 +3924,45 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 Returns:
 A dictionary mapping job_id to log path.
 """
-# if job_name
+# if job_name and job_id should not both be specified
 assert job_name is None or job_id is None, (job_name, job_id)
-
+
+if job_id is None:
 # generate code to get the job_id
+# if job_name is None, get all job_ids
+# TODO: Only get the latest job_id, since that's the only one we use
 code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
 job_name=job_name)
-returncode,
-
-
-
-
-separate_stderr=True)
+returncode, job_ids, stderr = self.run_on_head(handle,
+code,
+stream_logs=False,
+require_outputs=True,
+separate_stderr=True)
 subprocess_utils.handle_returncode(returncode, code,
 'Failed to sync down logs.',
 stderr)
-job_ids = common_utils.decode_payload(
+job_ids = common_utils.decode_payload(job_ids)
 if not job_ids:
 logger.info(f'{colorama.Fore.YELLOW}'
 'No matching job found'
 f'{colorama.Style.RESET_ALL}')
 return {}
 elif len(job_ids) > 1:
-
-
-
-
-
-
-
-
+name_str = ''
+if job_name is not None:
+name_str = ('Multiple jobs IDs found under the name '
+f'{job_name}. ')
+logger.info(f'{colorama.Fore.YELLOW}'
+f'{name_str}'
+'Downloading the latest job logs.'
+f'{colorama.Style.RESET_ALL}')
+# list should aready be in descending order
+job_id = job_ids[0]
 
 # get the run_timestamp
 # the function takes in [job_id]
-code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+[str(job_id)])
 returncode, run_timestamps, stderr = self.run_on_head(
 handle,
 code,
@@ -3963,13 +3983,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 job_id = list(run_timestamps.keys())[0]
 local_log_dir = ''
 if controller:  # download controller logs
-
-
+remote_log = os.path.join(
+managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+f'{job_id}.log')
 local_log_dir = os.path.expanduser(
 os.path.join(local_dir, run_timestamp))
 
 logger.info(f'{colorama.Fore.CYAN}'
-f'Job {
+f'Job {job_id} local logs: {local_log_dir}'
 f'{colorama.Style.RESET_ALL}')
 
 runners = handle.get_command_runners()
@@ -3980,12 +4001,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 Args:
 args: A tuple of (runner, local_log_dir, remote_log_dir)
 """
-(runner, local_log_dir,
+(runner, local_log_dir, remote_log) = args
 try:
 os.makedirs(local_log_dir, exist_ok=True)
 runner.rsync(
-source=
-target=local_log_dir,
+source=remote_log,
+target=f'{local_log_dir}/controller.log',
 up=False,
 stream_logs=False,
 )
@@ -3998,9 +4019,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 else:
 raise
 
-parallel_args = [
-
-
+parallel_args = [
+(runner, local_log_dir, remote_log) for runner in runners
+]
 subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
 else:  # download job logs
 local_log_dir = os.path.expanduser(
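Note: the new provisioning-failure summary above collects, for each candidate resource, the last exception from its failover history and renders the pairs as a two-column table before raising ResourcesUnavailableError. A minimal sketch of that pattern follows, using prettytable directly; it assumes sky.utils.log_utils.create_table returns a similar table object (the real helper is not shown in this diff), and the resource names and exceptions are hypothetical.

    # Sketch of the failure-summary pattern added above, using prettytable
    # directly. log_utils.create_table is assumed (not confirmed here) to
    # return a comparable table object.
    import shutil

    from prettytable import PrettyTable

    # Hypothetical stand-ins for the Resources objects and exceptions that
    # resource_exceptions accumulates during failover.
    resource_exceptions = {
        'AWS(p4d.24xlarge)': Exception('Insufficient capacity in us-east-1'),
        'GCP(a2-highgpu-8g)': Exception('Quota exceeded for NVIDIA_A100'),
    }

    table = PrettyTable(['Resource', 'Reason'])
    for resource, exception in resource_exceptions.items():
        table.add_row([resource, exception])
    # Keep the table within the current terminal width, as the diff does.
    table.max_table_width = shutil.get_terminal_size().columns

    print('Reasons for provision failures (for details, please check the log above):')
    print(table.get_string())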
sky/cli.py
CHANGED
@@ -3530,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
 if sum([bool(names), all]) != 1:
 raise click.UsageError('Either --all or a name must be specified.')
 if all:
-
-
+# Use '*' to get all storages.
+names = global_user_state.get_glob_storage_name(storage_name='*')
+if not names:
 click.echo('No storage(s) to delete.')
 return
-names = [s['name'] for s in storages]
 else:
 names = _get_glob_storages(names)
 if names:
@@ -3548,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
 abort=True,
 show_default=True)
 
-
+def delete_storage(name: str) -> None:
+try:
+sky.storage_delete(name)
+except Exception as e:  # pylint: disable=broad-except
+click.secho(f'Error deleting storage {name}: {e}', fg='red')
+
+subprocess_utils.run_in_parallel(delete_storage, names)
 
 
 @cli.group(cls=_NaturalOrderGroup)
@@ -3588,18 +3594,6 @@ def jobs():
 is_flag=True,
 help=('If True, as soon as a job is submitted, return from this call '
 'and do not stream execution logs.'))
-@click.option(
-'--retry-until-up/--no-retry-until-up',
-'-r/-no-r',
-default=None,
-is_flag=True,
-required=False,
-help=(
-'(Default: True; this flag is deprecated and will be removed in a '
-'future release.) Whether to retry provisioning infinitely until the '
-'cluster is up, if unavailability errors are encountered. This '  # pylint: disable=bad-docstring-quotes
-'applies to launching all managed jobs (both the initial and '
-'any recovery attempts), not the jobs controller.'))
 @click.option('--yes',
 '-y',
 is_flag=True,
@@ -3636,7 +3630,6 @@ def jobs_launch(
 disk_tier: Optional[str],
 ports: Tuple[str],
 detach_run: bool,
-retry_until_up: Optional[bool],
 yes: bool,
 fast: bool,
 ):
@@ -3680,19 +3673,6 @@ def jobs_launch(
 ports=ports,
 job_recovery=job_recovery,
 )
-# Deprecation. We set the default behavior to be retry until up, and the
-# flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
-if retry_until_up is not None:
-flag_str = '--retry-until-up'
-if not retry_until_up:
-flag_str = '--no-retry-until-up'
-click.secho(
-f'Flag {flag_str} is deprecated and will be removed in a '
-'future release (managed jobs will always be retried). '
-'Please file an issue if this does not work for you.',
-fg='yellow')
-else:
-retry_until_up = True
 
 # Deprecation. The default behavior is fast, and the flag will be removed.
 # The flag was not present in 0.7.x (only nightly), so we will remove before
@@ -3742,10 +3722,7 @@ def jobs_launch(
 
 common_utils.check_cluster_name_is_valid(name)
 
-managed_jobs.launch(dag,
-name,
-detach_run=detach_run,
-retry_until_up=retry_until_up)
+managed_jobs.launch(dag, name, detach_run=detach_run)
 
 
 @jobs.command('queue', cls=_DocumentedCodeCommand)
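Note: `sky storage delete --all` now resolves names with a glob and deletes them in parallel, wrapping each deletion in a per-name error handler so one failure does not abort the rest. Below is a sketch of the same fan-out pattern using only the standard library; it assumes subprocess_utils.run_in_parallel behaves like a map over a worker pool, and the storage names are hypothetical.

    # Per-name error isolation + parallel fan-out, written with the stdlib
    # instead of sky.utils.subprocess_utils.run_in_parallel (assumed to act
    # like pool.map over the given callable and argument list).
    from concurrent.futures import ThreadPoolExecutor


    def delete_storage(name: str) -> None:
        try:
            # The real CLI calls sky.storage_delete(name); a print stands in.
            print(f'Deleting storage {name}')
        except Exception as e:  # pylint: disable=broad-except
            # One failed deletion should not abort the others.
            print(f'Error deleting storage {name}: {e}')


    names = ['bucket-a', 'bucket-b', 'bucket-c']  # hypothetical storage names
    with ThreadPoolExecutor() as pool:
        # list() forces all deletions to finish before returning.
        list(pool.map(delete_storage, names))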
sky/core.py
CHANGED
@@ -915,8 +915,11 @@ def storage_delete(name: str) -> None:
 handle = global_user_state.get_handle_from_storage_name(name)
 if handle is None:
 raise ValueError(f'Storage name {name!r} not found.')
-
-
-
-
-
+
+assert handle.storage_name == name, (
+f'In global_user_state, storage name {name!r} does not match '
+f'handle.storage_name {handle.storage_name!r}')
+storage_object = data.Storage(name=handle.storage_name,
+source=handle.source,
+sync_on_reconstruction=False)
+storage_object.delete()
sky/data/storage.py
CHANGED
@@ -1083,18 +1083,16 @@ class Storage(object):
 if not self.stores:
 logger.info('No backing stores found. Deleting storage.')
 global_user_state.remove_storage(self.name)
-if store_type:
+if store_type is not None:
 store = self.stores[store_type]
-is_sky_managed = store.is_sky_managed
 # We delete a store from the cloud if it's sky managed. Else just
 # remove handle and return
-if is_sky_managed:
+if store.is_sky_managed:
 self.handle.remove_store(store)
 store.delete()
 # Check remaining stores - if none is sky managed, remove
 # the storage from global_user_state.
-delete = all(
-s.is_sky_managed is False for s in self.stores.values())
+delete = all(not s.is_sky_managed for s in self.stores.values())
 if delete:
 global_user_state.remove_storage(self.name)
 else:
@@ -1689,6 +1687,9 @@ class S3Store(AbstractStore):
 
 Returns:
 bool; True if bucket was deleted, False if it was deleted externally.
+
+Raises:
+StorageBucketDeleteError: If deleting the bucket fails.
 """
 # Deleting objects is very slow programatically
 # (i.e. bucket.objects.all().delete() is slow).
@@ -2179,6 +2180,11 @@ class GcsStore(AbstractStore):
 
 Returns:
 bool; True if bucket was deleted, False if it was deleted externally.
+
+Raises:
+StorageBucketDeleteError: If deleting the bucket fails.
+PermissionError: If the bucket is external and the user is not
+allowed to delete it.
 """
 if _bucket_sub_path is not None:
 command_suffix = f'/{_bucket_sub_path}'
@@ -3478,6 +3484,9 @@ class R2Store(AbstractStore):
 
 Returns:
 bool; True if bucket was deleted, False if it was deleted externally.
+
+Raises:
+StorageBucketDeleteError: If deleting the bucket fails.
 """
 # Deleting objects is very slow programatically
 # (i.e. bucket.objects.all().delete() is slow).
@@ -3932,7 +3941,7 @@ class IBMCosStore(AbstractStore):
 
 def _delete_cos_bucket_objects(self,
 bucket: Any,
-prefix: Optional[str] = None):
+prefix: Optional[str] = None) -> None:
 bucket_versioning = self.s3_resource.BucketVersioning(bucket.name)
 if bucket_versioning.status == 'Enabled':
 if prefix is not None:
@@ -3947,7 +3956,7 @@ class IBMCosStore(AbstractStore):
 res = list(bucket.objects.delete())
 logger.debug(f'Deleted bucket\'s content:\n{res}, prefix: {prefix}')
 
-def _delete_cos_bucket(self):
+def _delete_cos_bucket(self) -> None:
 bucket = self.s3_resource.Bucket(self.name)
 try:
 self._delete_cos_bucket_objects(bucket)
sky/global_user_state.py
CHANGED
@@ -827,7 +827,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:
 
 
 def get_storage() -> List[Dict[str, Any]]:
-rows = _DB.cursor.execute('
+rows = _DB.cursor.execute('SELECT * FROM storage')
 records = []
 for name, launched_at, handle, last_use, status in rows:
 # TODO: use namedtuple instead of dict
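Note: get_storage() now issues a plain 'SELECT * FROM storage' and unpacks each row into a record dict. A self-contained sketch of that row-unpacking pattern follows; the schema is a reduced, hypothetical stand-in for SkyPilot's actual storage table.

    # Sketch of the row-unpacking pattern in get_storage(), with a
    # hypothetical in-memory table instead of SkyPilot's state database.
    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE storage '
                 '(name TEXT, launched_at INTEGER, handle BLOB, '
                 'last_use TEXT, status TEXT)')
    conn.execute('INSERT INTO storage VALUES (?, ?, ?, ?, ?)',
                 ('demo-bucket', 1737000000, b'', 'sky launch', 'READY'))

    rows = conn.execute('SELECT * FROM storage')
    records = []
    for name, launched_at, handle, last_use, status in rows:
        # Each tuple becomes one record dict, mirroring the loop in the diff.
        records.append({
            'name': name,
            'launched_at': launched_at,
            'handle': handle,
            'last_use': last_use,
            'status': status,
        })
    print(records)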
sky/jobs/constants.py
CHANGED
@@ -2,18 +2,19 @@
 
 JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
 JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
+JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
 
 JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
 
 # Resources as a dict for the jobs controller.
-# Use
-#
-#
-#
-#
-#
+# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
+# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
+# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
+# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
+# parallelism limit, and memory / 350MB is the limit to concurrently running
+# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
-CONTROLLER_RESOURCES = {'cpus': '
+CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}
 
 # Max length of the cluster name for GCP is 35, the user hash to be attached is
 # 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
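Note: the new comment ties the controller's concurrency limits to its instance size: launch parallelism is 4 x vCPUs and job parallelism is memory / 350 MB, implemented by _get_launch_parallelism and _get_job_parallelism in scheduler.py (not shown in this diff). A back-of-the-envelope sketch of those formulas, using only the values stated in the comment:

    # Back-of-the-envelope check of the limits described in the comment above.
    # The formulas come from the comment; the function names mirror the ones it
    # cites in scheduler.py, but the real implementations are not shown here.

    JOB_MEMORY_MB = 350  # per-job memory budget stated in the comment


    def get_launch_parallelism(cpus: int) -> int:
        return 4 * cpus


    def get_job_parallelism(memory_gb: float) -> int:
        return int(memory_gb * 1024 // JOB_MEMORY_MB)


    # For the r6i.xlarge / Standard_E4s_v5 / n2-highmem-4 class mentioned
    # above (4 vCPUs, 32 GB):
    print(get_launch_parallelism(4))  # 16 concurrent launches
    print(get_job_parallelism(32))    # ~93 concurrently running jobs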
sky/jobs/controller.py
CHANGED
@@ -16,6 +16,7 @@ from sky import status_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.jobs import recovery_strategy
+from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
@@ -46,12 +47,10 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
 """Each jobs controller manages the life cycle of one managed job."""
 
-def __init__(self, job_id: int, dag_yaml: str
-retry_until_up: bool) -> None:
+def __init__(self, job_id: int, dag_yaml: str) -> None:
 self._job_id = job_id
 self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
 logger.info(self._dag)
-self._retry_until_up = retry_until_up
 # TODO(zhwu): this assumes the specific backend.
 self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
 
@@ -174,7 +173,7 @@ class JobsController:
 cluster_name = managed_job_utils.generate_managed_job_cluster_name(
 task.name, self._job_id)
 self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-cluster_name, self._backend, task, self.
+cluster_name, self._backend, task, self._job_id)
 managed_job_state.set_submitted(
 self._job_id,
 task_id,
@@ -202,6 +201,7 @@ class JobsController:
 task_id=task_id,
 start_time=remote_job_submitted_at,
 callback_func=callback_func)
+
 while True:
 time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
 
@@ -243,7 +243,7 @@ class JobsController:
 self._download_log_and_stream(task_id, handle)
 # Only clean up the cluster, not the storages, because tasks may
 # share storages.
-
+managed_job_utils.terminate_cluster(cluster_name=cluster_name)
 return True
 
 # For single-node jobs, non-terminated job_status indicates a
@@ -342,7 +342,7 @@ class JobsController:
 # those clusters again may fail.
 logger.info('Cleaning up the preempted or failed cluster'
 '...')
-
+managed_job_utils.terminate_cluster(cluster_name)
 
 # Try to recover the managed jobs, when the cluster is preempted or
 # failed or the job status is failed to be fetched.
@@ -424,11 +424,11 @@ class JobsController:
 task=self._dag.tasks[task_id]))
 
 
-def _run_controller(job_id: int, dag_yaml: str
+def _run_controller(job_id: int, dag_yaml: str):
 """Runs the controller in a remote process for interruption."""
 # The controller needs to be instantiated in the remote process, since
 # the controller is not serializable.
-jobs_controller = JobsController(job_id, dag_yaml
+jobs_controller = JobsController(job_id, dag_yaml)
 jobs_controller.run()
 
 
@@ -478,14 +478,14 @@ def _cleanup(job_id: int, dag_yaml: str):
 assert task.name is not None, task
 cluster_name = managed_job_utils.generate_managed_job_cluster_name(
 task.name, job_id)
-
+managed_job_utils.terminate_cluster(cluster_name)
 # Clean up Storages with persistent=False.
 # TODO(zhwu): this assumes the specific backend.
 backend = cloud_vm_ray_backend.CloudVmRayBackend()
 backend.teardown_ephemeral_storage(task)
 
 
-def start(job_id, dag_yaml
+def start(job_id, dag_yaml):
 """Start the controller."""
 controller_process = None
 cancelling = False
@@ -499,8 +499,7 @@ def start(job_id, dag_yaml, retry_until_up):
 # So we can only enable daemon after we no longer need to
 # start daemon processes like Ray.
 controller_process = multiprocessing.Process(target=_run_controller,
-args=(job_id, dag_yaml
-retry_until_up))
+args=(job_id, dag_yaml))
 controller_process.start()
 while controller_process.is_alive():
 _handle_signal(job_id)
@@ -562,6 +561,8 @@ def start(job_id, dag_yaml, retry_until_up):
 failure_reason=('Unexpected error occurred. For details, '
 f'run: sky jobs logs --controller {job_id}'))
 
+scheduler.job_done(job_id)
+
 
 if __name__ == '__main__':
 parser = argparse.ArgumentParser()
@@ -569,9 +570,6 @@ if __name__ == '__main__':
 required=True,
 type=int,
 help='Job id for the controller job.')
-parser.add_argument('--retry-until-up',
-action='store_true',
-help='Retry until the cluster is up.')
 parser.add_argument('dag_yaml',
 type=str,
 help='The path to the user job yaml file.')
@@ -579,4 +577,4 @@ if __name__ == '__main__':
 # We start process with 'spawn', because 'fork' could result in weird
 # behaviors; 'spawn' is also cross-platform.
 multiprocessing.set_start_method('spawn', force=True)
-start(args.job_id, args.dag_yaml
+start(args.job_id, args.dag_yaml)
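Note: the controller entry point now starts _run_controller with only (job_id, dag_yaml) and keeps the 'spawn' start method. A minimal, self-contained sketch of that multiprocessing pattern follows; the worker body is a placeholder, not the real sky.jobs.controller logic.

    # Sketch of the controller-process pattern shown above: a worker is
    # started with 'spawn' and only (job_id, dag_yaml), and the parent polls
    # it while it is alive. _run_controller here is a placeholder.
    import multiprocessing
    import time


    def _run_controller(job_id: int, dag_yaml: str) -> None:
        print(f'controller for job {job_id} running {dag_yaml}')
        time.sleep(1)


    if __name__ == '__main__':
        # 'spawn' avoids fork-related issues and works across platforms,
        # matching the comment in the diff.
        multiprocessing.set_start_method('spawn', force=True)
        proc = multiprocessing.Process(target=_run_controller,
                                       args=(1, 'dag.yaml'))
        proc.start()
        while proc.is_alive():
            # The real controller checks for cancellation signals here.
            time.sleep(0.1)
        proc.join()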
sky/jobs/core.py
CHANGED
@@ -41,7 +41,6 @@ def launch(
 name: Optional[str] = None,
 stream_logs: bool = True,
 detach_run: bool = False,
-retry_until_up: bool = False,
 # TODO(cooperc): remove fast arg before 0.8.0
 fast: bool = True,  # pylint: disable=unused-argument for compatibility
 ) -> None:
@@ -115,7 +114,6 @@ def launch(
 'jobs_controller': controller_name,
 # Note: actual cluster name will be <task.name>-<managed job ID>
 'dag_name': dag.name,
-'retry_until_up': retry_until_up,
 'remote_user_config_path': remote_user_config_path,
 'modified_catalogs':
 service_catalog_common.get_modified_catalog_file_mounts(),
|