skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250530__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +99 -16
  3. sky/authentication.py +54 -7
  4. sky/backends/backend_utils.py +35 -22
  5. sky/backends/cloud_vm_ray_backend.py +30 -15
  6. sky/check.py +1 -1
  7. sky/cli.py +20 -8
  8. sky/client/cli.py +20 -8
  9. sky/client/oauth.py +82 -0
  10. sky/client/sdk.py +60 -10
  11. sky/clouds/nebius.py +55 -14
  12. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
  18. sky/dashboard/out/_next/static/chunks/682-f3f1443ed2fba42f.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
  21. sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +6 -0
  24. sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{config-41738d1896fc02fe.js → config-3c6a2dabf56e8cd6.js} +2 -2
  26. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-342bc15bb78ab2e5.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/infra-7b4b8e7fa9fa0827.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +11 -0
  29. sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
  35. sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +3 -0
  36. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  37. sky/dashboard/out/clusters/[cluster].html +1 -1
  38. sky/dashboard/out/clusters.html +1 -1
  39. sky/dashboard/out/config.html +1 -1
  40. sky/dashboard/out/index.html +1 -1
  41. sky/dashboard/out/infra/[context].html +1 -0
  42. sky/dashboard/out/infra.html +1 -1
  43. sky/dashboard/out/jobs/[job].html +1 -1
  44. sky/dashboard/out/jobs.html +1 -1
  45. sky/dashboard/out/users.html +1 -1
  46. sky/dashboard/out/workspace/new.html +1 -1
  47. sky/dashboard/out/workspaces/[name].html +1 -1
  48. sky/dashboard/out/workspaces.html +1 -1
  49. sky/exceptions.py +11 -1
  50. sky/global_user_state.py +149 -1
  51. sky/jobs/client/sdk.py +1 -0
  52. sky/jobs/constants.py +3 -1
  53. sky/jobs/controller.py +3 -5
  54. sky/jobs/recovery_strategy.py +148 -102
  55. sky/jobs/scheduler.py +23 -8
  56. sky/jobs/server/core.py +16 -0
  57. sky/jobs/state.py +153 -39
  58. sky/jobs/utils.py +33 -5
  59. sky/provision/kubernetes/utils.py +2 -1
  60. sky/provision/provisioner.py +15 -10
  61. sky/resources.py +16 -1
  62. sky/serve/controller.py +10 -7
  63. sky/serve/replica_managers.py +22 -18
  64. sky/serve/service.py +5 -4
  65. sky/server/common.py +11 -4
  66. sky/server/html/token_page.html +32 -6
  67. sky/server/server.py +3 -1
  68. sky/server/stream_utils.py +21 -0
  69. sky/setup_files/dependencies.py +7 -1
  70. sky/skylet/constants.py +1 -1
  71. sky/task.py +26 -0
  72. sky/templates/jobs-controller.yaml.j2 +2 -1
  73. sky/templates/kubernetes-ray.yml.j2 +19 -1
  74. sky/utils/common_utils.py +66 -0
  75. sky/utils/rich_utils.py +5 -0
  76. sky/utils/schemas.py +32 -1
  77. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/METADATA +3 -1
  78. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/RECORD +84 -81
  79. sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
  81. sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
  82. sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
  83. sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
  84. sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
  89. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
  90. sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
  92. sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
  93. /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → Q32Bxr2Pby5tFDW-y5TNg}/_ssgManifest.js +0 -0
  94. /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-f19ea34b91c33950.js} +0 -0
  95. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/WHEEL +0 -0
  96. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/entry_points.txt +0 -0
  97. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/licenses/LICENSE +0 -0
  98. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -22,6 +22,7 @@ from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext import declarative
+import yaml
 
 from sky import models
 from sky import sky_logging
@@ -96,6 +97,12 @@ cluster_table = sqlalchemy.Table(
     sqlalchemy.Column('workspace',
                       sqlalchemy.Text,
                       server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
+    sqlalchemy.Column('last_creation_yaml',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('last_creation_command',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 storage_table = sqlalchemy.Table(
@@ -133,6 +140,21 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
 )
 
+ssh_key_table = sqlalchemy.Table(
+    'ssh_key',
+    Base.metadata,
+    sqlalchemy.Column('user_hash', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('ssh_public_key', sqlalchemy.Text),
+    sqlalchemy.Column('ssh_private_key', sqlalchemy.Text),
+)
+
+cluster_yaml_table = sqlalchemy.Table(
+    'cluster_yaml',
+    Base.metadata,
+    sqlalchemy.Column('cluster_name', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('yaml', sqlalchemy.Text),
+)
+
 
 def _glob_to_similar(glob_pattern):
     """Converts a glob pattern to a PostgreSQL LIKE pattern."""
@@ -270,6 +292,19 @@ def create_table():
             default_statement='DEFAULT \'default\'',
             value_to_replace_existing_entries=constants.
             SKYPILOT_DEFAULT_WORKSPACE)
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'last_creation_yaml',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL',
+        )
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'last_creation_command',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
         session.commit()
 
 
@@ -318,7 +353,8 @@ def add_or_update_cluster(cluster_name: str,
                           requested_resources: Optional[Set[Any]],
                           ready: bool,
                           is_launch: bool = True,
-                          config_hash: Optional[str] = None):
+                          config_hash: Optional[str] = None,
+                          task_config: Optional[Dict[str, Any]] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.
 
     Args:
@@ -329,6 +365,8 @@ def add_or_update_cluster(cluster_name: str,
             be marked as INIT, otherwise it will be marked as UP.
         is_launch: if the cluster is firstly launched. If True, the launched_at
             and last_use will be updated. Otherwise, use the old value.
+        config_hash: Configuration hash for the cluster.
+        task_config: The config of the task being launched.
     """
     # TODO(zhwu): have to be imported here to avoid circular import.
     from sky import skypilot_config  # pylint: disable=import-outside-toplevel
@@ -404,6 +442,13 @@ def add_or_update_cluster(cluster_name: str,
         conditional_values.update({
             'workspace': active_workspace,
         })
+        if (is_launch and not cluster_row or
+                cluster_row.status != status_lib.ClusterStatus.UP.value):
+            conditional_values.update({
+                'last_creation_yaml': common_utils.dump_yaml_str(task_config)
+                                      if task_config else None,
+                'last_creation_command': last_use,
+            })
 
         if (_SQLALCHEMY_ENGINE.dialect.name ==
                 db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -790,6 +835,8 @@ def get_cluster_from_name(
         'user_name': get_user(user_hash).name,
         'config_hash': row.config_hash,
         'workspace': row.workspace,
+        'last_creation_yaml': row.last_creation_yaml,
+        'last_creation_command': row.last_creation_command,
     }
 
     return record
@@ -822,6 +869,8 @@ def get_clusters() -> List[Dict[str, Any]]:
             'user_name': get_user(user_hash).name,
             'config_hash': row.config_hash,
             'workspace': row.workspace,
+            'last_creation_yaml': row.last_creation_yaml,
+            'last_creation_command': row.last_creation_command,
         }
 
         records.append(record)
@@ -1049,3 +1098,102 @@ def get_storage() -> List[Dict[str, Any]]:
             'status': status_lib.StorageStatus[row.status],
         })
     return records
+
+
+def get_ssh_keys(user_hash: str) -> Tuple[str, str, bool]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(ssh_key_table).filter_by(
+            user_hash=user_hash).first()
+        if row:
+            return row.ssh_public_key, row.ssh_private_key, True
+        return '', '', False
+
+
+def set_ssh_keys(user_hash: str, ssh_public_key: str, ssh_private_key: str):
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(ssh_key_table).values(
+            user_hash=user_hash,
+            ssh_public_key=ssh_public_key,
+            ssh_private_key=ssh_private_key)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[ssh_key_table.c.user_hash],
+            set_={
+                ssh_key_table.c.ssh_public_key: ssh_public_key,
+                ssh_key_table.c.ssh_private_key: ssh_private_key
+            })
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
+    """Get the cluster yaml from the database or the local file system.
+    If the cluster yaml is not in the database, check if it exists on the
+    local file system and migrate it to the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    if cluster_yaml_path is None:
+        raise ValueError('Attempted to read a None YAML.')
+    cluster_file_name = os.path.basename(cluster_yaml_path)
+    cluster_name, _ = os.path.splitext(cluster_file_name)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).first()
+        if row is None:
+            # If the cluster yaml is not in the database, check if it exists
+            # on the local file system and migrate it to the database.
+            # TODO(syang): remove this check once we have a way to migrate the
+            # cluster from file to database. Remove on v0.12.0.
+            if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
+                with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
+                    yaml_str = f.read()
+                set_cluster_yaml(cluster_name, yaml_str)
+                return yaml_str
+            return None
+        return row.yaml
+
+
+def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
+    """Get the cluster yaml as a dictionary from the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    yaml_str = get_cluster_yaml_str(cluster_yaml_path)
+    if yaml_str is None:
+        raise ValueError(f'Cluster yaml {cluster_yaml_path} not found.')
+    return yaml.safe_load(yaml_str)
+
+
+def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
+    """Set the cluster yaml in the database."""
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(cluster_yaml_table).values(
+            cluster_name=cluster_name, yaml=yaml_str)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[cluster_yaml_table.c.cluster_name],
+            set_={cluster_yaml_table.c.yaml: yaml_str})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def remove_cluster_yaml(cluster_name: str):
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).delete()
+        session.commit()
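Note: the helpers added above move per-cluster YAML files and per-user SSH keys from the local filesystem into the state database. A minimal usage sketch, assuming an installed skypilot-nightly wheel with an initialized state database; the cluster name, user hash and key strings are placeholders:

    # Sketch only: exercises the new DB-backed helpers shown in the diff above.
    from sky import global_user_state

    # Store a cluster YAML keyed by cluster name, then read it back. The
    # lookup path only supplies the <cluster_name>.yml basename.
    global_user_state.set_cluster_yaml('my-cluster', 'cluster_name: my-cluster\n')
    yaml_str = global_user_state.get_cluster_yaml_str(
        '~/.sky/generated/my-cluster.yml')

    # Per-user SSH keys are persisted in the same database.
    global_user_state.set_ssh_keys('user-hash-123', 'ssh-ed25519 AAAA... user',
                                   '-----BEGIN OPENSSH PRIVATE KEY-----...')
    public_key, private_key, exists = global_user_state.get_ssh_keys('user-hash-123')

    # Drop the stored YAML once the cluster is gone.
    global_user_state.remove_cluster_yaml('my-cluster')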
sky/jobs/client/sdk.py CHANGED
@@ -46,6 +46,7 @@ def launch(
         task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
             managed job.
         name: Name of the managed job.
+        priority: Priority of the managed job.
         _need_confirmation: (Internal only) Whether to show a confirmation
             prompt before launching the job.
 
sky/jobs/constants.py CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION = 4
+MANAGED_JOBS_VERSION = 5
 
 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
@@ -70,3 +70,5 @@ DASHBOARD_SETUP_CMD = (
     f'(nohup {skylet_constants.SKY_PYTHON_CMD} -m sky.jobs.dashboard.dashboard '
     '>> ~/.sky/job-dashboard.log 2>&1 &); '
     'fi')
+
+DEFAULT_PRIORITY = 500
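A sketch of how a caller might fall back to the new DEFAULT_PRIORITY constant; the clamping to the 0-1000 range mirrors the --priority help text added to sky/jobs/scheduler.py below and is an assumption, not code from the wheel:

    # Hypothetical helper, not part of the package.
    from typing import Optional

    from sky.jobs import constants as managed_job_constants


    def resolve_priority(priority: Optional[int]) -> int:
        if priority is None:
            return managed_job_constants.DEFAULT_PRIORITY  # 500
        # 0-1000 range taken from the scheduler's --priority help text.
        return max(0, min(1000, priority))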
sky/jobs/controller.py CHANGED
@@ -179,8 +179,8 @@ class JobsController:
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, self._job_id)
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id)
-        managed_job_state.set_submitted(
+            cluster_name, self._backend, task, self._job_id, task_id)
+        managed_job_state.set_starting(
             self._job_id,
             task_id,
             self._backend.run_timestamp,
@@ -197,9 +197,7 @@ class JobsController:
             f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
 
         logger.info('Started monitoring.')
-        managed_job_state.set_starting(job_id=self._job_id,
-                                       task_id=task_id,
-                                       callback_func=callback_func)
+
         remote_job_submitted_at = self._strategy_executor.launch()
         assert remote_job_submitted_at is not None, remote_job_submitted_at
 
sky/jobs/recovery_strategy.py CHANGED
@@ -18,6 +18,7 @@ from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
 from sky.jobs import scheduler
+from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -49,7 +50,7 @@ class StrategyExecutor:
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
@@ -65,11 +66,13 @@ class StrategyExecutor:
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
+        self.task_id = task_id
         self.restart_cnt_on_failure = 0
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
+             task: 'task_lib.Task', job_id: int,
+             task_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -100,7 +103,7 @@ class StrategyExecutor:
                              from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id)
+                                     max_restarts_on_errors, job_id, task_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -235,7 +238,8 @@ class StrategyExecutor:
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True) -> Optional[float]:
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -275,98 +279,134 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-            with scheduler.scheduled_launch(self.job_id):
-                try:
-                    usage_lib.messages.usage.set_internal()
-                    # Detach setup, so that the setup failure can be detected
-                    # by the controller process (job_status -> FAILED_SETUP).
-                    execution.launch(
-                        self.dag,
-                        cluster_name=self.cluster_name,
-                        # We expect to tear down the cluster as soon as the job
-                        # is finished. However, in case the controller dies, set
-                        # autodown to try and avoid a resource leak.
-                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                        down=True,
-                        _is_launched_by_jobs_controller=True)
-                    logger.info('Managed job cluster launched.')
-                except (exceptions.InvalidClusterNameError,
-                        exceptions.NoCloudAccessError,
-                        exceptions.ResourcesMismatchError) as e:
-                    logger.error('Failure happened before provisioning. '
-                                 f'{common_utils.format_exception(e)}')
-                    if raise_on_failure:
-                        raise exceptions.ProvisionPrechecksError(reasons=[e])
-                    return None
-                except exceptions.ResourcesUnavailableError as e:
-                    # This is raised when the launch fails due to prechecks or
-                    # after failing over through all the candidates.
-                    # Please refer to the docstring of `sky.launch` for more
-                    # details of how the exception will be structured.
-                    if not any(
-                            isinstance(err,
-                                       exceptions.ResourcesUnavailableError)
-                            for err in e.failover_history):
-                        # _launch() (this function) should fail/exit directly,
-                        # if none of the failover reasons were because of
-                        # resource unavailability or no failover was attempted
-                        # (the optimizer cannot find feasible resources for
-                        # requested resources), i.e., e.failover_history is
-                        # empty. Failing directly avoids the infinite loop of
-                        # retrying the launch when, e.g., an invalid cluster
-                        # name is used and --retry-until-up is specified.
-                        reasons = (e.failover_history
-                                   if e.failover_history else [e])
-                        reasons_str = '; '.join(
-                            common_utils.format_exception(err)
-                            for err in reasons)
-                        logger.error(
-                            'Failure happened before provisioning. Failover '
-                            f'reasons: {reasons_str}')
+            try:
+                with scheduler.scheduled_launch(self.job_id):
+                    # The job state may have been PENDING during backoff -
+                    # update to STARTING or RECOVERING.
+                    # On the first attempt (when retry_cnt is 1), we should
+                    # already be in STARTING or RECOVERING.
+                    if retry_cnt > 1:
+                        state.set_restarting(self.job_id, self.task_id,
+                                             recovery)
+                    try:
+                        usage_lib.messages.usage.set_internal()
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as the
+                            # job is finished. However, in case the controller
+                            # dies, set autodown to try and avoid a resource
+                            # leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                        logger.info('Managed job cluster launched.')
+                    except (exceptions.InvalidClusterNameError,
+                            exceptions.NoCloudAccessError,
+                            exceptions.ResourcesMismatchError) as e:
+                        logger.error('Failure happened before provisioning. '
+                                     f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
-                            raise exceptions.ProvisionPrechecksError(reasons)
-                        return None
-                    logger.info('Failed to launch a cluster with error: '
-                                f'{common_utils.format_exception(e)})')
-                except Exception as e:  # pylint: disable=broad-except
-                    # If the launch fails, it will be recovered by the following
-                    # code.
-                    logger.info('Failed to launch a cluster with error: '
-                                f'{common_utils.format_exception(e)})')
-                    with ux_utils.enable_traceback():
-                        logger.info(f'  Traceback: {traceback.format_exc()}')
-                else:  # No exception, the launch succeeds.
-                    # At this point, a sky.launch() has succeeded. Cluster may
-                    # be UP (no preemption since) or DOWN (newly preempted).
-                    job_submitted_at = self._wait_until_job_starts_on_cluster()
-                    if job_submitted_at is not None:
-                        return job_submitted_at
-                    # The job fails to start on the cluster, retry the launch.
-                    # TODO(zhwu): log the unexpected error to usage collection
-                    # for future debugging.
-                    logger.info(
-                        'Failed to successfully submit the job to the '
-                        'launched cluster, due to unexpected submission errors '
-                        'or the cluster being preempted during job submission.')
-
-                # If we get here, the launch did not succeed. Tear down the
-                # cluster and retry.
-                managed_job_utils.terminate_cluster(self.cluster_name)
-                if max_retry is not None and retry_cnt >= max_retry:
-                    # Retry forever if max_retry is None.
-                    if raise_on_failure:
-                        with ux_utils.print_exception_no_traceback():
-                            raise exceptions.ManagedJobReachedMaxRetriesError(
-                                'Resources unavailable: failed to launch '
-                                f'clusters after {max_retry} retries.')
-                    else:
+                            raise exceptions.ProvisionPrechecksError(
+                                reasons=[e])
                         return None
-                # Exit the scheduled_launch context so that the scheulde state is
-                # ALIVE during the backoff. This allows other jobs to launch.
-                gap_seconds = backoff.current_backoff()
-                logger.info('Retrying to launch the cluster in '
-                            f'{gap_seconds:.1f} seconds.')
-                time.sleep(gap_seconds)
+                    except exceptions.ResourcesUnavailableError as e:
+                        # This is raised when the launch fails due to prechecks
+                        # or after failing over through all the candidates.
+                        # Please refer to the docstring of `sky.launch` for more
+                        # details of how the exception will be structured.
+                        if not any(
+                                isinstance(err,
+                                           exceptions.ResourcesUnavailableError)
+                                for err in e.failover_history):
+                            # _launch() (this function) should fail/exit
+                            # directly, if none of the failover reasons were
+                            # because of resource unavailability or no failover
+                            # was attempted (the optimizer cannot find feasible
+                            # resources for requested resources), i.e.,
+                            # e.failover_history is empty. Failing directly
+                            # avoids the infinite loop of retrying the launch
+                            # when, e.g., an invalid cluster name is used and
+                            # --retry-until-up is specified.
+                            reasons = (e.failover_history
+                                       if e.failover_history else [e])
+                            reasons_str = '; '.join(
+                                common_utils.format_exception(err)
+                                for err in reasons)
+                            logger.error(
+                                'Failure happened before provisioning. '
+                                f'Failover reasons: {reasons_str}')
+                            if raise_on_failure:
+                                raise exceptions.ProvisionPrechecksError(
+                                    reasons)
+                            return None
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                    except Exception as e:  # pylint: disable=broad-except
+                        # If the launch fails, it will be recovered by the
+                        # following code.
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                        with ux_utils.enable_traceback():
+                            logger.info(
+                                f'  Traceback: {traceback.format_exc()}')
+                    else:  # No exception, the launch succeeds.
+                        # At this point, a sky.launch() has succeeded. Cluster
+                        # may be UP (no preemption since) or DOWN (newly
+                        # preempted).
+                        job_submitted_at = (
+                            self._wait_until_job_starts_on_cluster())
+                        if job_submitted_at is not None:
+                            return job_submitted_at
+                        # The job fails to start on the cluster, retry the
+                        # launch.
+                        # TODO(zhwu): log the unexpected error to usage
+                        # collection for future debugging.
+                        logger.info(
+                            'Failed to successfully submit the job to the '
+                            'launched cluster, due to unexpected submission '
+                            'errors or the cluster being preempted during '
+                            'job submission.')
+
+                    # If we get here, the launch did not succeed. Tear down the
+                    # cluster and retry.
+                    managed_job_utils.terminate_cluster(self.cluster_name)
+                    if max_retry is not None and retry_cnt >= max_retry:
+                        # Retry forever if max_retry is None.
+                        if raise_on_failure:
+                            with ux_utils.print_exception_no_traceback():
+                                raise (
+                                    exceptions.ManagedJobReachedMaxRetriesError(
+                                        'Resources unavailable: failed to '
+                                        f'launch clusters after {max_retry} '
+                                        'retries.'))
+                        else:
+                            return None
+
+                    # Raise NoClusterLaunchedError to indicate that the job is
+                    # in retry backoff. This will trigger special handling in
+                    # scheduler.schedule_launched().
+                    # We will exit the scheduled_launch context so that the
+                    # schedule state is ALIVE_BACKOFF during the backoff. This
+                    # allows other jobs to launch.
+                    raise exceptions.NoClusterLaunchedError()
+
+            except exceptions.NoClusterLaunchedError:
+                # Update the status to PENDING during backoff.
+                state.set_backoff_pending(self.job_id, self.task_id)
+                # Calculate the backoff time and sleep.
+                gap_seconds = backoff.current_backoff()
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
+            else:
+                # The inner loop should either return or throw
+                # NoClusterLaunchedError.
+                assert False, 'Unreachable'
 
     def should_restart_on_failure(self) -> bool:
         """Increments counter & checks if job should be restarted on a failure.
@@ -389,9 +429,9 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id)
+                         job_id, task_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -400,8 +440,10 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True) -> Optional[float]:
-        job_submitted_at = super()._launch(max_retry, raise_on_failure)
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
+        job_submitted_at = super()._launch(max_retry, raise_on_failure,
+                                           recovery)
         if job_submitted_at is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
@@ -436,7 +478,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 cloud=launched_cloud, region=launched_region, zone=None)
             task.set_resources({new_resources})
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False)
+            job_submitted_at = self._launch(raise_on_failure=False,
+                                            recovery=True)
             # Restore the original dag, i.e. reset the region constraint.
             task.set_resources(original_resources)
             if job_submitted_at is not None:
@@ -452,7 +495,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False)
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
@@ -524,7 +568,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                                region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False)
+            job_submitted_at = self._launch(raise_on_failure=False,
+                                            recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at
@@ -535,7 +580,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False)
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
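The restructured _launch() above funnels every failed attempt through exceptions.NoClusterLaunchedError so that the scheduled_launch() context can park the job in a backoff state while other jobs get a turn. A simplified, self-contained sketch of that control flow; launch_once(), the retry cap and the print statements are stand-ins, not SkyPilot APIs, and the real code instead uses scheduler.scheduled_launch(), state.set_restarting() and state.set_backoff_pending():

    import random
    import time


    class NoClusterLaunchedError(Exception):
        """Signals that this attempt produced no cluster and backoff is needed."""


    def launch_once() -> bool:
        # Placeholder for execution.launch() plus the post-launch job checks.
        return random.random() < 0.3


    def launch_with_backoff(max_retry: int = 5) -> None:
        gap = 1.0
        retry_cnt = 0
        while True:
            retry_cnt += 1
            try:
                # Everything in this block corresponds to holding the
                # scheduler's LAUNCHING slot (the scheduled_launch() context).
                if launch_once():
                    print('cluster launched')
                    return
                if retry_cnt >= max_retry:
                    raise RuntimeError('exhausted retries')
                # Leaving the slot via this exception lets other jobs launch
                # while this one backs off (ALIVE_BACKOFF in the real scheduler).
                raise NoClusterLaunchedError()
            except NoClusterLaunchedError:
                print(f'backing off for {gap:.1f}s')
                time.sleep(gap)
                gap *= 2


    launch_with_backoff()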
sky/jobs/scheduler.py CHANGED
@@ -45,6 +45,7 @@ import typing
 
 import filelock
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
@@ -190,7 +191,8 @@ def maybe_schedule_next_jobs() -> None:
         pass
 
 
-def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
+def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str,
+               priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
@@ -202,7 +204,7 @@ def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
     """
     with filelock.FileLock(_get_lock_path()):
        state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
-                                    common_utils.get_user_hash())
+                                    common_utils.get_user_hash(), priority)
     maybe_schedule_next_jobs()
 
 
@@ -240,11 +242,19 @@ def scheduled_launch(job_id: int):
                state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
 
-    yield
-
-    with filelock.FileLock(_get_lock_path()):
-        state.scheduler_set_alive(job_id)
-    maybe_schedule_next_jobs()
+    try:
+        yield
+    except exceptions.NoClusterLaunchedError:
+        # NoClusterLaunchedError is indicates that the job is in retry backoff.
+        # We should transition to ALIVE_BACKOFF instead of ALIVE.
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive_backoff(job_id)
+        raise
+    else:
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive(job_id)
+    finally:
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -309,5 +319,10 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument(
+        '--priority',
+        type=int,
+        default=500,
+        help='Job priority (0-1000, lower is higher). Default: 500.')
     args = parser.parse_args()
-    submit_job(args.job_id, args.dag_yaml, args.env_file)
+    submit_job(args.job_id, args.dag_yaml, args.env_file, args.priority)
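For controller-side callers, the new priority argument is threaded straight into the schedule state. A hedged sketch of invoking this internal API (the job id and paths are placeholders; submit_job() and DEFAULT_PRIORITY come from the diffs above, and the call normally happens on the jobs controller after the job row has been created):

    from sky.jobs import constants as managed_job_constants
    from sky.jobs import scheduler

    scheduler.submit_job(
        job_id=42,
        dag_yaml_path='/path/to/dag.yaml',
        env_file_path='/path/to/controller.env',
        priority=managed_job_constants.DEFAULT_PRIORITY)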