skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250529__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +13 -3
- sky/client/cli.py +13 -3
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +60 -10
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/HvNkg7hqKM1p0ptAcdDcF/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-90e5498a5b00ec29.js +6 -0
- sky/dashboard/out/_next/static/chunks/303-2c7b0f7af571710b.js +6 -0
- sky/dashboard/out/_next/static/chunks/{856-62b87c68917b08ed.js → 856-59a1760784c9e770.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-7c48919fe030bc43.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-909f1ceb0fcf1b99.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-d4c6875c88771e17.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-6b80e9e0c6aa16a1.js +6 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +1 -1
- sky/jobs/client/sdk.py +1 -0
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +3 -5
- sky/jobs/recovery_strategy.py +148 -102
- sky/jobs/scheduler.py +23 -8
- sky/jobs/server/core.py +16 -0
- sky/jobs/state.py +130 -35
- sky/jobs/utils.py +30 -4
- sky/resources.py +16 -1
- sky/server/common.py +6 -2
- sky/server/html/token_page.html +32 -6
- sky/server/server.py +3 -1
- sky/setup_files/dependencies.py +7 -1
- sky/skylet/constants.py +1 -1
- sky/task.py +26 -0
- sky/templates/jobs-controller.yaml.j2 +2 -1
- sky/utils/schemas.py +12 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/RECORD +53 -49
- sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-41738d1896fc02fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
- /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → HvNkg7hqKM1p0ptAcdDcF}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250529.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -18,6 +18,7 @@ from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
 from sky.jobs import scheduler
+from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -49,7 +50,7 @@ class StrategyExecutor:
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
@@ -65,11 +66,13 @@ class StrategyExecutor:
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
+        self.task_id = task_id
         self.restart_cnt_on_failure = 0
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
+             task: 'task_lib.Task', job_id: int,
+             task_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -100,7 +103,7 @@ class StrategyExecutor:
                 from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id)
+                                     max_restarts_on_errors, job_id, task_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -235,7 +238,8 @@ class StrategyExecutor:
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True) -> Optional[float]:
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -275,98 +279,134 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-    [old lines 278-307: content not captured in this diff view]
-                               exceptions.ResourcesUnavailableError)
-                    for err in e.failover_history):
-                # _launch() (this function) should fail/exit directly,
-                # if none of the failover reasons were because of
-                # resource unavailability or no failover was attempted
-                # (the optimizer cannot find feasible resources for
-                # requested resources), i.e., e.failover_history is
-                # empty. Failing directly avoids the infinite loop of
-                # retrying the launch when, e.g., an invalid cluster
-                # name is used and --retry-until-up is specified.
-                reasons = (e.failover_history
-                           if e.failover_history else [e])
-                reasons_str = '; '.join(
-                    common_utils.format_exception(err)
-                    for err in reasons)
-                logger.error(
-                    'Failure happened before provisioning. Failover '
-                    f'reasons: {reasons_str}')
+            try:
+                with scheduler.scheduled_launch(self.job_id):
+                    # The job state may have been PENDING during backoff -
+                    # update to STARTING or RECOVERING.
+                    # On the first attempt (when retry_cnt is 1), we should
+                    # already be in STARTING or RECOVERING.
+                    if retry_cnt > 1:
+                        state.set_restarting(self.job_id, self.task_id,
+                                             recovery)
+                    try:
+                        usage_lib.messages.usage.set_internal()
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as the
+                            # job is finished. However, in case the controller
+                            # dies, set autodown to try and avoid a resource
+                            # leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                        logger.info('Managed job cluster launched.')
+                    except (exceptions.InvalidClusterNameError,
+                            exceptions.NoCloudAccessError,
+                            exceptions.ResourcesMismatchError) as e:
+                        logger.error('Failure happened before provisioning. '
+                                     f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
-                            raise exceptions.ProvisionPrechecksError(
-    [old line 328: content not captured in this diff view]
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f' Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may
-                # be UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors '
-                    'or the cluster being preempted during job submission.')
-
-            # If we get here, the launch did not succeed. Tear down the
-            # cluster and retry.
-            managed_job_utils.terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch '
-                            f'clusters after {max_retry} retries.')
-                else:
+                            raise exceptions.ProvisionPrechecksError(
+                                reasons=[e])
                         return None
-    [old lines 364-369: content not captured in this diff view]
+                    except exceptions.ResourcesUnavailableError as e:
+                        # This is raised when the launch fails due to prechecks
+                        # or after failing over through all the candidates.
+                        # Please refer to the docstring of `sky.launch` for more
+                        # details of how the exception will be structured.
+                        if not any(
+                                isinstance(err,
+                                           exceptions.ResourcesUnavailableError)
+                                for err in e.failover_history):
+                            # _launch() (this function) should fail/exit
+                            # directly, if none of the failover reasons were
+                            # because of resource unavailability or no failover
+                            # was attempted (the optimizer cannot find feasible
+                            # resources for requested resources), i.e.,
+                            # e.failover_history is empty. Failing directly
+                            # avoids the infinite loop of retrying the launch
+                            # when, e.g., an invalid cluster name is used and
+                            # --retry-until-up is specified.
+                            reasons = (e.failover_history
+                                       if e.failover_history else [e])
+                            reasons_str = '; '.join(
+                                common_utils.format_exception(err)
+                                for err in reasons)
+                            logger.error(
+                                'Failure happened before provisioning. '
+                                f'Failover reasons: {reasons_str}')
+                            if raise_on_failure:
+                                raise exceptions.ProvisionPrechecksError(
+                                    reasons)
+                            return None
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                    except Exception as e:  # pylint: disable=broad-except
+                        # If the launch fails, it will be recovered by the
+                        # following code.
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                        with ux_utils.enable_traceback():
+                            logger.info(
+                                f' Traceback: {traceback.format_exc()}')
+                    else:  # No exception, the launch succeeds.
+                        # At this point, a sky.launch() has succeeded. Cluster
+                        # may be UP (no preemption since) or DOWN (newly
+                        # preempted).
+                        job_submitted_at = (
+                            self._wait_until_job_starts_on_cluster())
+                        if job_submitted_at is not None:
+                            return job_submitted_at
+                        # The job fails to start on the cluster, retry the
+                        # launch.
+                        # TODO(zhwu): log the unexpected error to usage
+                        # collection for future debugging.
+                        logger.info(
+                            'Failed to successfully submit the job to the '
+                            'launched cluster, due to unexpected submission '
+                            'errors or the cluster being preempted during '
+                            'job submission.')
+
+                    # If we get here, the launch did not succeed. Tear down the
+                    # cluster and retry.
+                    managed_job_utils.terminate_cluster(self.cluster_name)
+                    if max_retry is not None and retry_cnt >= max_retry:
+                        # Retry forever if max_retry is None.
+                        if raise_on_failure:
+                            with ux_utils.print_exception_no_traceback():
+                                raise (
+                                    exceptions.ManagedJobReachedMaxRetriesError(
+                                        'Resources unavailable: failed to '
+                                        f'launch clusters after {max_retry} '
+                                        'retries.'))
+                        else:
+                            return None
+
+                    # Raise NoClusterLaunchedError to indicate that the job is
+                    # in retry backoff. This will trigger special handling in
+                    # scheduler.schedule_launched().
+                    # We will exit the scheduled_launch context so that the
+                    # schedule state is ALIVE_BACKOFF during the backoff. This
+                    # allows other jobs to launch.
+                    raise exceptions.NoClusterLaunchedError()
+
+            except exceptions.NoClusterLaunchedError:
+                # Update the status to PENDING during backoff.
+                state.set_backoff_pending(self.job_id, self.task_id)
+                # Calculate the backoff time and sleep.
+                gap_seconds = backoff.current_backoff()
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
+            else:
+                # The inner loop should either return or throw
+                # NoClusterLaunchedError.
+                assert False, 'Unreachable'
 
     def should_restart_on_failure(self) -> bool:
         """Increments counter & checks if job should be restarted on a failure.
@@ -389,9 +429,9 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id)
+                         job_id, task_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -400,8 +440,10 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True) -> Optional[float]:
-    [old line 404: content not captured in this diff view]
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
+        job_submitted_at = super()._launch(max_retry, raise_on_failure,
+                                           recovery)
         if job_submitted_at is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
@@ -436,7 +478,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                     cloud=launched_cloud, region=launched_region, zone=None)
                 task.set_resources({new_resources})
                 # Not using self.launch to avoid the retry until up logic.
-                job_submitted_at = self._launch(raise_on_failure=False)
+                job_submitted_at = self._launch(raise_on_failure=False,
+                                                recovery=True)
                 # Restore the original dag, i.e. reset the region constraint.
                 task.set_resources(original_resources)
                 if job_submitted_at is not None:
@@ -452,7 +495,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False)
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
@@ -524,7 +568,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                     region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False)
+            job_submitted_at = self._launch(raise_on_failure=False,
+                                            recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at
@@ -535,7 +580,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False)
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
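The key behavioral change above is that each launch attempt now runs inside scheduler.scheduled_launch() and signals retry backoff by raising exceptions.NoClusterLaunchedError, so the job releases its launching slot (schedule state ALIVE_BACKOFF, job status PENDING) while it sleeps. The following is a minimal, self-contained sketch of that control flow; scheduled_launch, NoClusterLaunchedError, and the printed states here are simplified stand-ins, not the SkyPilot implementations.

import contextlib
import time


class NoClusterLaunchedError(Exception):
    """Stand-in for sky.exceptions.NoClusterLaunchedError."""


@contextlib.contextmanager
def scheduled_launch(job_id: int):
    # Stand-in for sky.jobs.scheduler.scheduled_launch(): the real context
    # manager moves the job's schedule state to ALIVE_BACKOFF (instead of
    # ALIVE) when NoClusterLaunchedError escapes the body.
    print(f'job {job_id}: schedule state LAUNCHING')
    try:
        yield
    except NoClusterLaunchedError:
        print(f'job {job_id}: schedule state ALIVE_BACKOFF')
        raise
    else:
        print(f'job {job_id}: schedule state ALIVE')


def launch_with_backoff(job_id: int, attempts_needed: int) -> float:
    """Retry loop mirroring the new shape of StrategyExecutor._launch()."""
    retry_cnt = 0
    gap_seconds = 1.0
    while True:
        retry_cnt += 1
        try:
            with scheduled_launch(job_id):
                if retry_cnt < attempts_needed:
                    # Simulated provisioning failure: raise so the
                    # scheduled_launch context is exited and the job does not
                    # hold a launching slot while it backs off.
                    raise NoClusterLaunchedError()
                return time.time()  # stands in for job_submitted_at
        except NoClusterLaunchedError:
            # In the real code the job status is also set back to PENDING here
            # (state.set_backoff_pending).
            print(f'retrying in {gap_seconds:.1f}s')
            time.sleep(gap_seconds)
            gap_seconds *= 2  # exponential backoff


if __name__ == '__main__':
    launch_with_backoff(job_id=1, attempts_needed=3)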
sky/jobs/scheduler.py
CHANGED
@@ -45,6 +45,7 @@ import typing
 
 import filelock
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
@@ -190,7 +191,8 @@ def maybe_schedule_next_jobs() -> None:
         pass
 
 
-def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
+def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str,
+               priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
@@ -202,7 +204,7 @@ def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
     """
     with filelock.FileLock(_get_lock_path()):
        state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
-                                   common_utils.get_user_hash())
+                                   common_utils.get_user_hash(), priority)
    maybe_schedule_next_jobs()
 
 
@@ -240,11 +242,19 @@ def scheduled_launch(job_id: int):
            state.ManagedJobScheduleState.LAUNCHING):
        time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
 
-    [old lines 243-247: content not captured in this diff view]
+    try:
+        yield
+    except exceptions.NoClusterLaunchedError:
+        # NoClusterLaunchedError is indicates that the job is in retry backoff.
+        # We should transition to ALIVE_BACKOFF instead of ALIVE.
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive_backoff(job_id)
+        raise
+    else:
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive(job_id)
+    finally:
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -309,5 +319,10 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument(
+        '--priority',
+        type=int,
+        default=500,
+        help='Job priority (0-1000, lower is higher). Default: 500.')
     args = parser.parse_args()
-    submit_job(args.job_id, args.dag_yaml, args.env_file)
+    submit_job(args.job_id, args.dag_yaml, args.env_file, args.priority)
sky/jobs/server/core.py
CHANGED
@@ -91,6 +91,7 @@ def launch(
     dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
 
     task_names = set()
+    priority = None
     for task_ in dag.tasks:
         if task_.name in task_names:
             with ux_utils.print_exception_no_traceback():
@@ -100,6 +101,20 @@ def launch(
                     'name only and comment out the task names (so that they '
                     'will be auto-generated) .')
         task_names.add(task_.name)
+        if task_.job_priority is not None:
+            if (priority is not None and priority != task_.job_priority):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'Multiple tasks in the DAG have different priorities. '
+                        'Either specify a priority in only one task, or set '
+                        'the same priority for each task.')
+            priority = task_.job_priority
+
+    if priority is None:
+        priority = managed_job_constants.DEFAULT_PRIORITY
+
+    if priority < 0 or priority > 1000:
+        raise ValueError(f'Priority must be between 0 and 1000, got {priority}')
 
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
@@ -186,6 +201,7 @@ def launch(
             service_catalog_common.get_modified_catalog_file_mounts(),
         'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
         'dashboard_user_id': common.SERVER_ID,
+        'priority': priority,
         **controller_utils.shared_controller_vars_to_fill(
             controller,
             remote_user_config_path=remote_user_config_path,
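The new block in jobs.server.core.launch() resolves a single priority for the whole DAG: each task may set job_priority, all set values must agree, a missing value falls back to managed_job_constants.DEFAULT_PRIORITY, and the result must lie in [0, 1000] before being passed to the controller template as 'priority'. Here is a small sketch of that resolution logic in isolation; DEFAULT_PRIORITY = 500 is an assumption (it matches the scheduler's --priority default, but the constant's actual value is not part of this diff).

from typing import Iterable, Optional

# Assumed stand-in for managed_job_constants.DEFAULT_PRIORITY.
DEFAULT_PRIORITY = 500


def resolve_dag_priority(task_priorities: Iterable[Optional[int]]) -> int:
    """Sketch of the priority resolution added to jobs.server.core.launch()."""
    priority: Optional[int] = None
    for task_priority in task_priorities:
        if task_priority is None:
            continue
        if priority is not None and priority != task_priority:
            raise ValueError(
                'Multiple tasks in the DAG have different priorities. '
                'Either specify a priority in only one task, or set '
                'the same priority for each task.')
        priority = task_priority

    if priority is None:
        # No task set a priority: use the default.
        priority = DEFAULT_PRIORITY

    if priority < 0 or priority > 1000:
        raise ValueError(f'Priority must be between 0 and 1000, got {priority}')
    return priority


if __name__ == '__main__':
    print(resolve_dag_priority([None, 300, None]))  # -> 300
    print(resolve_dag_priority([None, None]))       # -> 500 (default)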