PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20241029-py3-none-any.whl → 1.0.0.dev20241030-py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20241029-py3-none-any.whl → 1.0.0.dev20241030-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

sky/__init__.py +2 -2
sky/backends/cloud_vm_ray_backend.py +13 -3
sky/execution.py +5 -4
sky/jobs/controller.py +38 -22
sky/jobs/recovery_strategy.py +30 -5
sky/jobs/state.py +33 -5
sky/jobs/utils.py +28 -4
sky/resources.py +25 -8
sky/setup_files/setup.py +4 -3
sky/skylet/job_lib.py +34 -42
sky/utils/dag_utils.py +14 -4
sky/utils/schemas.py +21 -1
{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/METADATA +13 -11
{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/RECORD +18 -18
{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/top_level.txt +0 -0

sky/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '47ebae73e972c65de6e87aa7556220e515f2fc5e'
+_SKYPILOT_COMMIT_SHA = '9d50f192b262d5f6cc74b5b6644f3a9e3ea31f2f'
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241029'
+__version__ = '1.0.0.dev20241030'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/backends/cloud_vm_ray_backend.py CHANGED Viewed

@@ -3175,9 +3175,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
             if returncode == 255:
                 is_message_too_long = False
-                with open(setup_log_path, 'r', encoding='utf-8') as f:
-                    if 'too long' in f.read():
-                        is_message_too_long = True
+                try:
+                    with open(os.path.expanduser(setup_log_path),
+                              'r',
+                              encoding='utf-8') as f:
+                        if 'too long' in f.read():
+                            is_message_too_long = True
+                except Exception as e:  # pylint: disable=broad-except
+                    # We don't crash the setup if we cannot read the log file.
+                    # Instead, we should retry the setup with dumping the script
+                    # to a file to be safe.
+                    logger.debug('Failed to read setup log file '
+                                 f'{setup_log_path}: {e}')
+                    is_message_too_long = True
                 if is_message_too_long:
                     # If the setup script is too long, we retry it with dumping

sky/execution.py CHANGED Viewed

@@ -171,10 +171,11 @@ def _execute(
     task = dag.tasks[0]
     if any(r.job_recovery is not None for r in task.resources):
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(
-                'Job recovery is specified in the task. To launch a '
-                'managed job, please use: sky jobs launch')
+        logger.warning(
+            f'{colorama.Style.DIM}The task has `job_recovery` specified, '
+            'but is launched as an unmanaged job. It will be ignored.'
+            'To enable job recovery, use managed jobs: sky jobs launch.'
+            f'{colorama.Style.RESET_ALL}')
     cluster_exists = False
     if cluster_name is not None:

sky/jobs/controller.py CHANGED Viewed

@@ -160,6 +160,11 @@ class JobsController:
         if task_id == 0:
             submitted_at = backend_utils.get_timestamp_from_run_timestamp(
                 self._backend.run_timestamp)
+        assert task.name is not None, task
+        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+            task.name, self._job_id)
+        self._strategy_executor = recovery_strategy.StrategyExecutor.make(
+            cluster_name, self._backend, task, self._retry_until_up)
         managed_job_state.set_submitted(
             self._job_id,
             task_id,
@@ -167,15 +172,14 @@ class JobsController:
             submitted_at,
             resources_str=backend_utils.get_task_resources_str(
                 task, is_managed_job=True),
+            specs={
+                'max_restarts_on_errors':
+                    self._strategy_executor.max_restarts_on_errors
+            },
             callback_func=callback_func)
         logger.info(
             f'Submitted managed job {self._job_id} (task: {task_id}, name: '
             f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
-        assert task.name is not None, task
-        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, self._job_id)
-        self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._retry_until_up)
         logger.info('Started monitoring.')
         managed_job_state.set_starting(job_id=self._job_id,
@@ -283,23 +287,35 @@ class JobsController:
                     failure_reason = (
                         'To see the details, run: '
                         f'sky jobs logs --controller {self._job_id}')
-                    managed_job_state.set_failed(
-                        self._job_id,
-                        task_id,
-                        failure_type=managed_job_status,
-                        failure_reason=failure_reason,
-                        end_time=end_time,
-                        callback_func=callback_func)
-                    return False
-                # Although the cluster is healthy, we fail to access the
-                # job status. Try to recover the job (will not restart the
-                # cluster, if the cluster is healthy).
-                assert job_status is None, job_status
-                logger.info('Failed to fetch the job status while the '
-                            'cluster is healthy. Try to recover the job '
-                            '(the cluster will not be restarted).')
+                    should_restart_on_failure = (
+                        self._strategy_executor.should_restart_on_failure())
+                    if should_restart_on_failure:
+                        max_restarts = (
+                            self._strategy_executor.max_restarts_on_errors)
+                        logger.info(
+                            f'User program crashed '
+                            f'({managed_job_status.value}). '
+                            f'Retry the job as max_restarts_on_errors is '
+                            f'set to {max_restarts}. '
+                            f'[{self._strategy_executor.restart_cnt_on_failure}'
+                            f'/{max_restarts}]')
+                    else:
+                        managed_job_state.set_failed(
+                            self._job_id,
+                            task_id,
+                            failure_type=managed_job_status,
+                            failure_reason=failure_reason,
+                            end_time=end_time,
+                            callback_func=callback_func)
+                        return False
+                else:
+                    # Although the cluster is healthy, we fail to access the
+                    # job status. Try to recover the job (will not restart the
+                    # cluster, if the cluster is healthy).
+                    assert job_status is None, job_status
+                    logger.info('Failed to fetch the job status while the '
+                                'cluster is healthy. Try to recover the job '
+                                '(the cluster will not be restarted).')
             # When the handle is None, the cluster should be cleaned up already.
             if handle is not None:
                 resources = handle.launched_resources

sky/jobs/recovery_strategy.py CHANGED Viewed

@@ -66,7 +66,8 @@ class StrategyExecutor:
     RETRY_INIT_GAP_SECONDS = 60
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task', retry_until_up: bool) -> None:
+                 task: 'task_lib.Task', retry_until_up: bool,
+                 max_restarts_on_errors: int) -> None:
         """Initialize the strategy executor.
         Args:
@@ -82,6 +83,8 @@ class StrategyExecutor:
         self.cluster_name = cluster_name
         self.backend = backend
         self.retry_until_up = retry_until_up
+        self.max_restarts_on_errors = max_restarts_on_errors
+        self.restart_cnt_on_failure = 0
     def __init_subclass__(cls, name: str, default: bool = False):
         RECOVERY_STRATEGIES[name] = cls
@@ -109,8 +112,17 @@ class StrategyExecutor:
         # set the new_task_resources to be the same type (list or set) as the
         # original task.resources
         task.set_resources(type(task.resources)(new_resources_list))
-        return RECOVERY_STRATEGIES[job_recovery](cluster_name, backend, task,
-                                                 retry_until_up)
+        if isinstance(job_recovery, dict):
+            job_recovery_name = job_recovery.pop('strategy',
+                                                 DEFAULT_RECOVERY_STRATEGY)
+            max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
+                                                      0)
+        else:
+            job_recovery_name = job_recovery
+            max_restarts_on_errors = 0
+        return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
+                                                      task, retry_until_up,
+                                                      max_restarts_on_errors)
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -368,6 +380,17 @@ class StrategyExecutor:
                         f'{gap_seconds:.1f} seconds.')
             time.sleep(gap_seconds)
+    def should_restart_on_failure(self) -> bool:
+        """Increments counter & checks if job should be restarted on a failure.
+        Returns:
+            True if the job should be restarted, otherwise False.
+        """
+        self.restart_cnt_on_failure += 1
+        if self.restart_cnt_on_failure > self.max_restarts_on_errors:
+            return False
+        return True
 class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
                                default=False):
@@ -376,8 +399,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task', retry_until_up: bool) -> None:
-        super().__init__(cluster_name, backend, task, retry_until_up)
+                 task: 'task_lib.Task', retry_until_up: bool,
+                 max_restarts_on_errors: int) -> None:
+        super().__init__(cluster_name, backend, task, retry_until_up,
+                         max_restarts_on_errors)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is

sky/jobs/state.py CHANGED Viewed

@@ -2,6 +2,7 @@
 # TODO(zhwu): maybe use file based status instead of database, so
 # that we can easily switch to a s3-based storage.
 import enum
+import json
 import pathlib
 import sqlite3
 import time
@@ -65,7 +66,8 @@ _CURSOR.execute("""\
     failure_reason TEXT,
     spot_job_id INTEGER,
     task_id INTEGER DEFAULT 0,
-    task_name TEXT)""")
+    task_name TEXT,
+    specs TEXT)""")
 _CONN.commit()
 db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
@@ -92,6 +94,17 @@ db_utils.add_column_to_table(_CURSOR,
                              'TEXT',
                              copy_from='job_name')
+# Specs is some useful information about the task, e.g., the
+# max_restarts_on_errors value. It is stored in JSON format.
+db_utils.add_column_to_table(_CURSOR,
+                             _CONN,
+                             'spot',
+                             'specs',
+                             'TEXT',
+                             value_to_replace_existing_entries=json.dumps({
+                                 'max_restarts_on_errors': 0,
+                             }))
 # `job_info` contains the mapping from job_id to the job_name.
 # In the future, it may contain more information about each job.
 _CURSOR.execute("""\
@@ -130,7 +143,8 @@ columns = [
     'task_name',
     # columns from the job_info table
     '_job_info_job_id',  # This should be the same as job_id
-    'job_name'
+    'job_name',
+    'specs',
 ]
@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
 def set_submitted(job_id: int, task_id: int, run_timestamp: str,
                   submit_time: float, resources_str: str,
-                  callback_func: CallbackType):
+                  specs: Dict[str, Union[str,
+                                         int]], callback_func: CallbackType):
     """Set the task to submitted.
     Args:
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
             determine the log directory of the managed task.
         submit_time: The time when the managed task is submitted.
         resources_str: The resources string of the managed task.
+        specs: The specs of the managed task.
+        callback_func: The callback function.
     """
     # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
     # the log directory and submission time align with each other, so as to
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
             resources=(?),
             submitted_at=(?),
             status=(?),
-            run_timestamp=(?)
+            run_timestamp=(?),
+            specs=(?)
             WHERE spot_job_id=(?) AND
             task_id=(?)""",
             (resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
-             run_timestamp, job_id, task_id))
+             run_timestamp, json.dumps(specs), job_id, task_id))
     callback_func('SUBMITTED')
@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
         for (job_id,) in rows:
             return job_id
         return None
+def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        task_specs = cursor.execute(
+            """\
+            SELECT specs FROM spot
+            WHERE spot_job_id=(?) AND task_id=(?)""",
+            (job_id, task_id)).fetchone()
+        return json.loads(task_specs[0])

sky/jobs/utils.py CHANGED Viewed

@@ -70,7 +70,7 @@ _JOB_CANCELLED_MESSAGE = (
 # state, after the job finished. This is a safeguard to avoid the case where
 # the managed job status fails to be updated and keep the `sky jobs logs`
 # blocking for a long time.
-_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
+_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
 class UserSignal(enum.Enum):
@@ -392,8 +392,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                             f'INFO: Log for the current task ({task_id}) '
                             'is finished. Waiting for the next task\'s log '
                             'to be started.')
-                        status_display.update('Waiting for the next task: '
-                                              f'{task_id + 1}.')
+                        # Add a newline to avoid the status display below
+                        # removing the last line of the task output.
+                        print()
+                        status_display.update(
+                            ux_utils.spinner_message(
+                                f'Waiting for the next task: {task_id + 1}'))
                         status_display.start()
                         original_task_id = task_id
                         while True:
@@ -405,7 +409,27 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                             time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
                         continue
                     else:
-                        break
+                        task_specs = managed_job_state.get_task_specs(
+                            job_id, task_id)
+                        if task_specs.get('max_restarts_on_errors', 0) == 0:
+                            # We don't need to wait for the managed job status
+                            # update, as the job is guaranteed to be in terminal
+                            # state afterwards.
+                            break
+                        print()
+                        status_display.update(
+                            ux_utils.spinner_message(
+                                'Waiting for next restart for the failed task'))
+                        status_display.start()
+                        while True:
+                            _, managed_job_status = (
+                                managed_job_state.get_latest_task_id_status(
+                                    job_id))
+                            if (managed_job_status !=
+                                    managed_job_state.ManagedJobStatus.RUNNING):
+                                break
+                            time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+                        continue
                 # The job can be cancelled by the user or the controller (when
                 # the cluster is partially preempted).
                 logger.debug(

sky/resources.py CHANGED Viewed

@@ -55,7 +55,7 @@ class Resources:
         accelerators: Union[None, str, Dict[str, int]] = None,
         accelerator_args: Optional[Dict[str, str]] = None,
         use_spot: Optional[bool] = None,
-        job_recovery: Optional[str] = None,
+        job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
         region: Optional[str] = None,
         zone: Optional[str] = None,
         image_id: Union[Dict[str, str], str, None] = None,
@@ -111,6 +111,12 @@ class Resources:
             job to recover the cluster from preemption. Refer to
             `recovery_strategy module <https://github.com/skypilot-org/skypilot/blob/master/sky/jobs/recovery_strategy.py>`__ # pylint: disable=line-too-long
             for more details.
+            When a dict is provided, it can have the following fields:
+            - strategy: the recovery strategy to use.
+            - max_restarts_on_errors: the max number of restarts on user code
+              errors.
           region: the region to use.
           zone: the zone to use.
           image_id: the image ID to use. If a str, must be a string
@@ -161,10 +167,20 @@ class Resources:
         self._use_spot_specified = use_spot is not None
         self._use_spot = use_spot if use_spot is not None else False
-        self._job_recovery = None
+        self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
         if job_recovery is not None:
-            if job_recovery.strip().lower() != 'none':
-                self._job_recovery = job_recovery.upper()
+            if isinstance(job_recovery, str):
+                job_recovery = {'strategy': job_recovery}
+            if 'strategy' not in job_recovery:
+                job_recovery['strategy'] = None
+            strategy_name = job_recovery['strategy']
+            if strategy_name == 'none':
+                self._job_recovery = None
+            else:
+                if strategy_name is not None:
+                    job_recovery['strategy'] = strategy_name.upper()
+                self._job_recovery = job_recovery
         if disk_size is not None:
             if round(disk_size) != disk_size:
@@ -419,7 +435,7 @@ class Resources:
         return self._use_spot_specified
     @property
-    def job_recovery(self) -> Optional[str]:
+    def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
         return self._job_recovery
     @property
@@ -814,12 +830,13 @@ class Resources:
         Raises:
             ValueError: if the attributes are invalid.
         """
-        if self._job_recovery is None:
+        if self._job_recovery is None or self._job_recovery['strategy'] is None:
             return
-        if self._job_recovery not in managed_jobs.RECOVERY_STRATEGIES:
+        if (self._job_recovery['strategy']
+                not in managed_jobs.RECOVERY_STRATEGIES):
             with ux_utils.print_exception_no_traceback():
                 raise ValueError(
-                    f'Spot recovery strategy {self._job_recovery} '
+                    f'Spot recovery strategy {self._job_recovery["strategy"]} '
                     'is not supported. The strategy should be among '
                     f'{list(managed_jobs.RECOVERY_STRATEGIES.keys())}')

sky/setup_files/setup.py CHANGED Viewed

@@ -153,7 +153,7 @@ install_requires = [
     'tabulate',
     # Light weight requirement, can be replaced with "typing" once
     # we deprecate Python 3.7 (this will take a while).
-    "typing_extensions",
+    'typing_extensions',
     'filelock >= 3.6.0',
     'packaging',
     'psutil',
@@ -216,8 +216,9 @@ extras_require: Dict[str, List[str]] = {
     # We need azure-identity>=1.13.0 to enable the customization of the
     # timeout of AzureCliCredential.
     'azure': [
-        'azure-cli>=2.31.0', 'azure-core', 'azure-identity>=1.13.0',
-        'azure-mgmt-network', 'azure-storage-blob', 'msgraph-sdk'
+        'azure-cli>=2.65.0', 'azure-core>=1.31.0', 'azure-identity>=1.19.0',
+        'azure-mgmt-network>=27.0.0', 'azure-mgmt-compute>=33.0.0',
+        'azure-storage-blob>=12.23.1', 'msgraph-sdk'
     ] + local_ray,
     # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
     # parameter for stopping instances.

sky/skylet/job_lib.py CHANGED Viewed

@@ -512,16 +512,13 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
     return records
-def _get_pending_jobs():
-    rows = _CURSOR.execute(
-        'SELECT job_id, created_time, submit FROM pending_jobs')
-    rows = list(rows)
-    return {
-        job_id: {
-            'created_time': created_time,
-            'submit': submit
-        } for job_id, created_time, submit in rows
-    }
+def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
+    rows = _CURSOR.execute('SELECT created_time, submit FROM pending_jobs '
+                           f'WHERE job_id={job_id!r}')
+    for row in rows:
+        created_time, submit = row
+        return {'created_time': created_time, 'submit': submit}
+    return None
 def update_job_status(job_ids: List[int],
@@ -535,7 +532,7 @@ def update_job_status(job_ids: List[int],
     during job cancelling, we still need this to handle the staleness problem,
     caused by instance restarting and other corner cases (if any).
-    This function should only be run on the remote instance with ray==2.4.0.
+    This function should only be run on the remote instance with ray>=2.4.0.
     """
     if len(job_ids) == 0:
         return []
@@ -547,50 +544,45 @@ def update_job_status(job_ids: List[int],
     # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
     # which contains the job status (str) and submission_id (str).
+    ray_job_query_time = time.time()
     job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
-    pending_jobs = _get_pending_jobs()
     job_details = {}
     ray_job_ids_set = set(ray_job_ids)
     for job_detail in job_detail_lists:
         if job_detail.submission_id in ray_job_ids_set:
             job_details[job_detail.submission_id] = job_detail
-    job_statuses: List[Optional[JobStatus]] = [None] * len(ray_job_ids)
-    for i, ray_job_id in enumerate(ray_job_ids):
-        job_id = job_ids[i]
-        if ray_job_id in job_details:
-            ray_status = job_details[ray_job_id].status
-            job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status]
-        if job_id in pending_jobs:
-            if pending_jobs[job_id]['created_time'] < psutil.boot_time():
-                logger.info(
-                    f'Job {job_id} is stale, setting to FAILED: '
-                    f'created_time={pending_jobs[job_id]["created_time"]}, '
-                    f'boot_time={psutil.boot_time()}')
-                # The job is stale as it is created before the instance
-                # is booted, e.g. the instance is rebooted.
-                job_statuses[i] = JobStatus.FAILED
-            # Gives a 60 second grace period between job being submit from
-            # the pending table until appearing in ray jobs.
-            if (pending_jobs[job_id]['submit'] > 0 and
-                    pending_jobs[job_id]['submit'] <
-                    time.time() - _PENDING_SUBMIT_GRACE_PERIOD):
-                # For jobs submitted outside of the grace period, we will
-                # consider the ray job status.
-                continue
-            else:
-                # Reset the job status to PENDING even though it may not appear
-                # in the ray jobs, so that it will not be considered as stale.
-                job_statuses[i] = JobStatus.PENDING
-    assert len(job_statuses) == len(job_ids), (job_statuses, job_ids)
     statuses = []
-    for job_id, status in zip(job_ids, job_statuses):
+    for job_id, ray_job_id in zip(job_ids, ray_job_ids):
         # Per-job status lock is required because between the job status
         # query and the job status update, the job status in the databse
         # can be modified by the generated ray program.
         with filelock.FileLock(_get_lock_path(job_id)):
+            status = None
+            if ray_job_id in job_details:
+                ray_status = job_details[ray_job_id].status
+                status = _RAY_TO_JOB_STATUS_MAP[ray_status]
+            pending_job = _get_pending_job(job_id)
+            if pending_job is not None:
+                if pending_job['created_time'] < psutil.boot_time():
+                    logger.info(f'Job {job_id} is stale, setting to FAILED: '
+                                f'created_time={pending_job["created_time"]}, '
+                                f'boot_time={psutil.boot_time()}')
+                    # The job is stale as it is created before the instance
+                    # is booted, e.g. the instance is rebooted.
+                    status = JobStatus.FAILED
+                # Gives a 60 second grace period between job being submit from
+                # the pending table until appearing in ray jobs. For jobs
+                # submitted outside of the grace period, we will consider the
+                # ray job status.
+                if not (pending_job['submit'] > 0 and pending_job['submit'] <
+                        ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
+                    # Reset the job status to PENDING even though it may not
+                    # appear in the ray jobs, so that it will not be considered
+                    # as stale.
+                    status = JobStatus.PENDING
             original_status = get_status_no_lock(job_id)
             assert original_status is not None, (job_id, status)
             if status is None:

sky/utils/dag_utils.py CHANGED Viewed

@@ -143,11 +143,21 @@ def fill_default_config_in_dag_for_job_launch(dag: dag_lib.Dag) -> None:
     for task_ in dag.tasks:
         new_resources_list = []
+        default_strategy = jobs.DEFAULT_RECOVERY_STRATEGY
+        assert default_strategy is not None
         for resources in list(task_.resources):
-            change_default_value: Dict[str, Any] = {}
-            if resources.job_recovery is None:
-                change_default_value[
-                    'job_recovery'] = jobs.DEFAULT_RECOVERY_STRATEGY
+            original_job_recovery = resources.job_recovery
+            job_recovery = {'strategy': default_strategy}
+            if isinstance(original_job_recovery, str):
+                job_recovery['strategy'] = original_job_recovery
+            elif isinstance(original_job_recovery, dict):
+                job_recovery.update(original_job_recovery)
+                strategy = job_recovery.get('strategy')
+                if strategy is None:
+                    job_recovery['strategy'] = default_strategy
+            change_default_value: Dict[str, Any] = {
+                'job_recovery': job_recovery
+            }
             new_resources = resources.copy(**change_default_value)
             new_resources_list.append(new_resources)

sky/utils/schemas.py CHANGED Viewed

@@ -92,7 +92,27 @@ def _get_single_resources_schema():
                 'type': 'string',
             },
             'job_recovery': {
-                'type': 'string',
+                # Either a string or a dict.
+                'anyOf': [{
+                    'type': 'string',
+                }, {
+                    'type': 'object',
+                    'required': [],
+                    'additionalProperties': False,
+                    'properties': {
+                        'strategy': {
+                            'anyOf': [{
+                                'type': 'string',
+                            }, {
+                                'type': 'null',
+                            }],
+                        },
+                        'max_restarts_on_errors': {
+                            'type': 'integer',
+                            'minimum': 0,
+                        },
+                    }
+                }],
             },
             'disk_size': {
                 'type': 'integer',

{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: skypilot-nightly
-Version: 1.0.0.dev20241029
+Version: 1.0.0.dev20241030
 Summary: SkyPilot: An intercloud broker for the clouds
 Author: SkyPilot Team
 License: Apache 2.0
@@ -46,11 +46,12 @@ Requires-Dist: awscli>=1.27.10; extra == "all"
 Requires-Dist: botocore>=1.29.10; extra == "all"
 Requires-Dist: boto3>=1.26.1; extra == "all"
 Requires-Dist: colorama<0.4.5; extra == "all"
-Requires-Dist: azure-cli>=2.31.0; extra == "all"
-Requires-Dist: azure-core; extra == "all"
-Requires-Dist: azure-identity>=1.13.0; extra == "all"
-Requires-Dist: azure-mgmt-network; extra == "all"
-Requires-Dist: azure-storage-blob; extra == "all"
+Requires-Dist: azure-cli>=2.65.0; extra == "all"
+Requires-Dist: azure-core>=1.31.0; extra == "all"
+Requires-Dist: azure-identity>=1.19.0; extra == "all"
+Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
+Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
+Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
 Requires-Dist: msgraph-sdk; extra == "all"
 Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
 Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
@@ -78,11 +79,12 @@ Requires-Dist: botocore>=1.29.10; extra == "aws"
 Requires-Dist: boto3>=1.26.1; extra == "aws"
 Requires-Dist: colorama<0.4.5; extra == "aws"
 Provides-Extra: azure
-Requires-Dist: azure-cli>=2.31.0; extra == "azure"
-Requires-Dist: azure-core; extra == "azure"
-Requires-Dist: azure-identity>=1.13.0; extra == "azure"
-Requires-Dist: azure-mgmt-network; extra == "azure"
-Requires-Dist: azure-storage-blob; extra == "azure"
+Requires-Dist: azure-cli>=2.65.0; extra == "azure"
+Requires-Dist: azure-core>=1.31.0; extra == "azure"
+Requires-Dist: azure-identity>=1.19.0; extra == "azure"
+Requires-Dist: azure-mgmt-network>=27.0.0; extra == "azure"
+Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "azure"
+Requires-Dist: azure-storage-blob>=12.23.1; extra == "azure"
 Requires-Dist: msgraph-sdk; extra == "azure"
 Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "azure"
 Provides-Extra: cloudflare

{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/RECORD RENAMED Viewed

@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=BxGwYNzkF-X3QWAkY2mXeidbCMkcagQQqkJ-gwBlJiI,5882
+sky/__init__.py,sha256=WwnJbF2ubaAJEJkUGPJ7jK5mh3QD1r487evpncErtC8,5882
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
 sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
@@ -7,10 +7,10 @@ sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
 sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
 sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
 sky/exceptions.py,sha256=KBIEJHgrw6OMBL8H65o-Gk6qYQEV1SR9gBwMjnMnxxg,8858
-sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
+sky/execution.py,sha256=tDK6JhF_405cjqxRpbdLbHZyxrKTD5oa0UkKDvPJ_9Q,24751
 sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
 sky/optimizer.py,sha256=tXGrFpc6xNtKH34qjBAMd4jTuWcDZTPnGFwEtuCQFmk,59702
-sky/resources.py,sha256=bm004Ms2qlBqEr0N_TEUybDOXJVhLF8yOwkhoqb1t9c,67478
+sky/resources.py,sha256=7kVpLRfy3DFFgmEji0_Xz6FbrvBDUSXC6K0bsRIK3hA,68290
 sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
 sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
 sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
 sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
 sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
 sky/backends/backend_utils.py,sha256=LmLsaLiPuuUyGebOXykdvwZpUY-8sB7n4o2AnmwNmdQ,121714
-sky/backends/cloud_vm_ray_backend.py,sha256=WX93AnMR_E6e8L0hvXc5eWFdajQo-Sbwfv8Z8lidy9U,232598
+sky/backends/cloud_vm_ray_backend.py,sha256=ZWAzdmKzSf3qalDoKfmLGaO3PywjLtIA5Q3AeeHhvHA,233158
 sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
 sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
 sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -95,11 +95,11 @@ sky/data/storage.py,sha256=x8YYY4zVBdit_5oAR_MXV-TM9qDefV_ZV4z0irv6ZaU,163102
 sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
 sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
 sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
-sky/jobs/controller.py,sha256=zSdawmXg-9SZ91jJg5_OSFVlntu9xupLs-CiPwG1QdQ,26412
+sky/jobs/controller.py,sha256=sirpi730_GfKfPZeZ2PvCXnJWger0r6AyLSOx2sLd6A,27368
 sky/jobs/core.py,sha256=RkBFaKDlovmdzqlOAgQ0xAimZFgo4pXq3qaQkAvGsGk,16908
-sky/jobs/recovery_strategy.py,sha256=UOEaVGSpRbCnCzlD8cgyjhCPIBIeBeCXCutoSic5aiA,25545
-sky/jobs/state.py,sha256=C6R5Yq7ftBqGPa_71tUjflBMKAaJ1FTTdbgjAwmbJsI,23231
-sky/jobs/utils.py,sha256=lYfWkEAPVnYcj2nT6VYdM6PCaWKUH6_AD4TAV_sVCkY,36376
+sky/jobs/recovery_strategy.py,sha256=FpPK6e2PT61cZPDUJqIfo6g53uSRTBh7dOTbfR1DLVE,26672
+sky/jobs/state.py,sha256=TV1G12vEMQJRgwWXsAjb3lmkJqkZmAOUUOja2QQPrg8,24307
+sky/jobs/utils.py,sha256=pF4Kyl4v1M_Bmm2jIRlXGTSdII5BJ3f4qwex_oCFgBk,37742
 sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
 sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
 sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx_WbwhWDsWM,9837
@@ -184,7 +184,7 @@ sky/serve/serve_utils.py,sha256=wqBxChpJylZ_qHWyFmMBJqrG8_7xTIOr9nlOeyHs9P8,3943
 sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
 sky/serve/service_spec.py,sha256=1aS6b-ku7W4CjyekXKDxjZsDdt-O8ygos-jFeXu31cA,13766
 sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
-sky/setup_files/setup.py,sha256=o4IgiwFoTB6Sdn3MmOirUIS0OSkoh6qo_0vrgcmrYA4,12093
+sky/setup_files/setup.py,sha256=G767GNB-jXqyC8MR-IdiojnnI2E6tP4gMYenKU14ZGA,12156
 sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
 sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
 sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
 sky/skylet/constants.py,sha256=OsuJcQp6UgkQ9Yfml6f_raXXbHS7-_h-v4QNv92y0Gw,14642
 sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
-sky/skylet/job_lib.py,sha256=Nfvefaa3N5IwxfhhOz1XE7ps46l3LY-db6VWF2pC3HQ,35335
+sky/skylet/job_lib.py,sha256=jqJ4D3UeG6fNMm8xPtdWclnrVHQb6WiRqb1nrBp8TPg,35200
 sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
 sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
 sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -249,14 +249,14 @@ sky/utils/command_runner.py,sha256=3CDcqRXEmoe3C-t2P58McgcRg6p9m5haUWYj1rOLuqM,3
 sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
 sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
 sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
-sky/utils/dag_utils.py,sha256=gjGZiJj4_GYsraXX67e6ElvbmOByJcyjSfvVgYZiXvs,5588
+sky/utils/dag_utils.py,sha256=pVX3lGDDcYTcGoH_1jEWzl9767Y4mwlIEYIzoyHO6gM,6105
 sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
 sky/utils/env_options.py,sha256=3oAaUPxowL6vI2XmxXrH56V7Myj9IJWsL-MXFmRFVdI,1294
 sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
 sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
 sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
 sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
-sky/utils/schemas.py,sha256=MTjGcxmc4aAz9QzqZY2pO87uNuWhJ3ss1N9rXcCNYGQ,28357
+sky/utils/schemas.py,sha256=mogoStpQ77S936VfChinAW2I1DT4q2c5E7qY_qNiO0w,29094
 sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
 sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
 sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.dev20241029.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
-skypilot_nightly-1.0.0.dev20241029.dist-info/METADATA,sha256=UAIFfOVp0n7QbIlx-vP21aRhzERPIIoEGbE4RcLzR5U,19540
-skypilot_nightly-1.0.0.dev20241029.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-skypilot_nightly-1.0.0.dev20241029.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
-skypilot_nightly-1.0.0.dev20241029.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
-skypilot_nightly-1.0.0.dev20241029.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20241030.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20241030.dist-info/METADATA,sha256=bwgfsg4Zzl63yZYrUfZIBNeMitC8bOcgqKucALPDnbk,19708
+skypilot_nightly-1.0.0.dev20241030.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+skypilot_nightly-1.0.0.dev20241030.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20241030.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20241030.dist-info/RECORD,,

{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/LICENSE RENAMED Viewed

File without changes

{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/WHEEL RENAMED Viewed

File without changes

{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/top_level.txt RENAMED Viewed

File without changes

skypilot-nightly 1.0.0.dev20241029__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl

skypilot-nightly 1.0.0.dev20241029__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl