skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as potentially problematic.
Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py CHANGED
@@ -20,6 +20,7 @@ from sky.backends import backend_utils
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
@@ -48,9 +49,9 @@ class StrategyExecutor:
 
     RETRY_INIT_GAP_SECONDS = 60
 
-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         """Initialize the strategy executor.
 
         Args:
@@ -62,17 +63,23 @@ class StrategyExecutor:
                 'Only CloudVMRayBackend is supported.')
         self.dag = sky.Dag()
         self.dag.add(task)
+        # For jobs submitted to a pool, the cluster name might change after each
+        # recovery. Initially this is set to an empty string to indicate that no
+        # cluster is assigned yet, and in `_launch`, it will be set to one of
+        # the cluster names in the pool.
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
         self.task_id = task_id
+        self.pool = pool
         self.restart_cnt_on_failure = 0
+        self.job_id_on_pool_cluster: Optional[int] = None
 
     @classmethod
-    def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int,
-             task_id: int) -> 'StrategyExecutor':
+    def make(cls, cluster_name: Optional[str], backend: 'backends.Backend',
+             task: 'task_lib.Task', job_id: int, task_id: int,
+             pool: Optional[str]) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
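The signature changes above are the heart of the pool feature: `cluster_name` becomes `Optional[str]` because a pool-backed job has no dedicated cluster until `_launch` picks a worker. A minimal sketch of the two resulting call patterns, assuming hypothetical `backend`/`task` objects and made-up names that are not part of this diff:

```python
# Hypothetical usage sketch; `backend` and `task` stand in for real
# CloudVmRayBackend / sky.Task instances.

# Classic managed job: a dedicated cluster name is known up front.
executor = StrategyExecutor.make(
    cluster_name='sky-managed-42', backend=backend, task=task,
    job_id=42, task_id=0, pool=None)

# Pool-backed job: no cluster yet; _launch() later picks a pool worker
# and records the worker-side job id in job_id_on_pool_cluster.
pool_executor = StrategyExecutor.make(
    cluster_name=None, backend=backend, task=task,
    job_id=43, task_id=0, pool='my-pool')
```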
@@ -103,7 +110,8 @@ class StrategyExecutor:
                                              from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id, task_id)
+                                     max_restarts_on_errors, job_id, task_id,
+                                     pool)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -131,12 +139,14 @@ class StrategyExecutor:
         """
         raise NotImplementedError
 
-    def _try_cancel_all_jobs(self):
+    def _try_cancel_jobs(self):
         from sky import core  # pylint: disable=import-outside-toplevel
 
+        if self.cluster_name is None:
+            return
         handle = global_user_state.get_handle_from_cluster_name(
             self.cluster_name)
-        if handle is None:
+        if handle is None or self.pool is not None:
             return
         try:
             usage_lib.messages.usage.set_internal()
@@ -159,8 +169,13 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
+            # Only cancel the corresponding job for worker pool.
+            if self.pool is None:
+                kwargs = dict(all=True)
+            else:
+                kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
             core.cancel(cluster_name=self.cluster_name,
-                        all=True,
+                        **kwargs,
                         _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
@@ -169,7 +184,7 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()
 
     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -178,6 +193,7 @@ class StrategyExecutor:
             The timestamp of when the job is submitted, or None if failed to
             submit.
         """
+        assert self.cluster_name is not None
         status = None
         job_checking_retry_cnt = 0
         while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
@@ -208,7 +224,9 @@ class StrategyExecutor:
 
             try:
                 status = managed_job_utils.get_job_status(
-                    self.backend, self.cluster_name)
+                    self.backend,
+                    self.cluster_name,
+                    job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -224,7 +242,10 @@ class StrategyExecutor:
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
                     job_submitted_at = managed_job_utils.get_job_timestamp(
-                        self.backend, self.cluster_name, get_end_time=False)
+                        self.backend,
+                        self.cluster_name,
+                        self.job_id_on_pool_cluster,
+                        get_end_time=False)
                     return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
@@ -236,6 +257,12 @@ class StrategyExecutor:
             time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None
 
+    def _cleanup_cluster(self) -> None:
+        if self.cluster_name is None:
+            return
+        if self.pool is None:
+            managed_job_utils.terminate_cluster(self.cluster_name)
+
     def _launch(self,
                 max_retry: Optional[int] = 3,
                 raise_on_failure: bool = True,
@@ -290,19 +317,35 @@ class StrategyExecutor:
                         recovery)
                 try:
                     usage_lib.messages.usage.set_internal()
-                    # Detach setup, so that the setup failure can be
-                    # detected by the controller process (job_status ->
-                    # FAILED_SETUP).
-                    execution.launch(
-                        self.dag,
-                        cluster_name=self.cluster_name,
-                        # We expect to tear down the cluster as soon as the
-                        # job is finished. However, in case the controller
-                        # dies, set autodown to try and avoid a resource
-                        # leak.
-                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                        down=True,
-                        _is_launched_by_jobs_controller=True)
+                    if self.pool is None:
+                        assert self.cluster_name is not None
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as
+                            # the job is finished. However, in case the
+                            # controller dies, set autodown to try and avoid
+                            # a resource leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                    else:
+                        self.cluster_name = (
+                            serve_utils.get_next_cluster_name(
+                                self.pool, self.job_id))
+                        if self.cluster_name is None:
+                            raise exceptions.NoClusterLaunchedError(
+                                'No cluster name found in the pool.')
+                        job_id_on_pool_cluster, _ = execution.exec(
+                            self.dag, cluster_name=self.cluster_name)
+                        assert job_id_on_pool_cluster is not None, (
+                            self.cluster_name, self.job_id)
+                        self.job_id_on_pool_cluster = job_id_on_pool_cluster
+                        state.set_job_id_on_pool_cluster(
+                            self.job_id, job_id_on_pool_cluster)
                     logger.info('Managed job cluster launched.')
                 except (exceptions.InvalidClusterNameError,
                         exceptions.NoCloudAccessError,
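The hunk above is the pivotal change of this release: jobs without a pool still provision a dedicated cluster via execution.launch, while pool jobs are submitted to an existing worker via execution.exec. A self-contained toy of that control flow, with stub functions standing in for sky's execution module and a made-up worker name:

```python
from typing import Optional

# Toy restatement of the branch above; launch/exec are stubs, and
# 'my-pool-worker-0' is an invented worker cluster name.
def launch_cluster(name: str) -> None:
    print(f'sky.launch on dedicated cluster {name} (autodown on)')

def exec_on_worker(name: str) -> int:
    print(f'sky.exec on pool worker {name}')
    return 1  # job id on the worker cluster

def do_launch(pool: Optional[str], cluster_name: Optional[str]) -> None:
    if pool is None:
        assert cluster_name is not None
        launch_cluster(cluster_name)       # provisions a fresh cluster
    else:
        cluster_name = 'my-pool-worker-0'  # serve_utils picks a ready worker
        job_id_on_pool_cluster = exec_on_worker(cluster_name)
        # Recorded so status checks and cancellation can target this
        # exact job on the shared cluster later.
        print(f'job {job_id_on_pool_cluster} on {cluster_name}')

do_launch(None, 'sky-managed-42')
do_launch('my-pool', None)
```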
@@ -373,7 +416,7 @@ class StrategyExecutor:
 
             # If we get here, the launch did not succeed. Tear down the
             # cluster and retry.
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()
             if max_retry is not None and retry_cnt >= max_retry:
                 # Retry forever if max_retry is None.
                 if raise_on_failure:
@@ -398,7 +441,10 @@ class StrategyExecutor:
             # Update the status to PENDING during backoff.
             state.set_backoff_pending(self.job_id, self.task_id)
             # Calculate the backoff time and sleep.
-            gap_seconds = backoff.current_backoff()
+            # We retry immediately for worker pool, since no sky.launch()
+            # is called and the overhead is minimal.
+            gap_seconds = (backoff.current_backoff()
+                           if self.pool is None else 0)
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
             time.sleep(gap_seconds)
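The zero backoff for pool jobs is worth a second look: a failed pool submission costs only a sky.exec attempt, so retrying immediately is cheap. A self-contained toy of the rule above, with a made-up `Backoff` stand-in for sky's internal backoff helper:

```python
from typing import Optional

class Backoff:
    """Stand-in for sky's internal exponential backoff helper."""
    def __init__(self, initial: float = 5.0) -> None:
        self._gap = initial
    def current_backoff(self) -> float:
        self._gap *= 2
        return self._gap

backoff = Backoff()
for pool in (None, 'my-pool'):
    # Mirrors: gap_seconds = backoff.current_backoff() if self.pool is None else 0
    gap_seconds = backoff.current_backoff() if pool is None else 0
    print(f'pool={pool!r}: retrying in {gap_seconds:.1f} seconds')
```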
@@ -427,11 +473,11 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id)
+                         job_id, task_id, pool)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -444,7 +490,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 recovery: bool = False) -> Optional[float]:
         job_submitted_at = super()._launch(max_retry, raise_on_failure,
                                            recovery)
-        if job_submitted_at is not None:
+        if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
@@ -464,7 +510,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         # original user specification.
 
         # Step 1
-        self._try_cancel_all_jobs()
+        self._try_cancel_jobs()
 
         while True:
             # Add region constraint to the task, to retry on the same region
@@ -488,7 +534,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()
 
             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
@@ -547,7 +593,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        managed_job_utils.terminate_cluster(self.cluster_name)
+        self._cleanup_cluster()
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
sky/jobs/scheduler.py CHANGED
@@ -9,9 +9,11 @@ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
 be called from any code running on the managed jobs controller instance to
 trigger scheduling of new jobs if possible. This function should be called
 immediately after any state change that could result in jobs newly being able to
-be scheduled.
+be scheduled. If the job is running in a pool, the scheduler will only schedule
+jobs for the same pool, because the resources limitations are per-pool (see the
+following section for more details).
 
-The scheduling logic limits the number of running jobs according to two limits:
+The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
    once, based on the number of CPUs. (See _get_launch_parallelism.) This the
    most compute-intensive part of the job lifecycle, which is why we have an
@@ -20,6 +22,8 @@ The scheduling logic limits the number of running jobs according to two limits:
    of memory. (See _get_job_parallelism.) Since the job controller is doing very
    little once a job starts (just checking its status periodically), the most
    significant resource it consumes is memory.
+3. The number of jobs that can be running in a pool at any given time, based on
+   the number of ready workers in the pool. (See _can_start_new_job.)
 
 The state of the scheduler is entirely determined by the schedule_state column
 of all the jobs in the job_info table. This column should only be modified via
@@ -43,6 +47,7 @@ import os
 import sys
 import time
 import typing
+from typing import Optional
 
 import filelock
 
@@ -51,6 +56,7 @@ from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
@@ -80,18 +86,21 @@ LAUNCHES_PER_CPU = 4
 
 @lru_cache(maxsize=1)
 def _get_lock_path() -> str:
+    # TODO(tian): Per pool lock.
    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path
 
 
-def _start_controller(job_id: int, dag_yaml_path: str,
-                      env_file_path: str) -> None:
+def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
+                      pool: Optional[str]) -> None:
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     source_environment_cmd = (f'source {env_file_path};'
                               if env_file_path else '')
-    run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
-                          f'{dag_yaml_path} --job-id {job_id};')
+    maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
+    run_controller_cmd = (
+        f'{sys.executable} -u -m sky.jobs.controller '
+        f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
 
     # If the command line here is changed, please also update
     # utils._controller_process_alive. The substring `--job-id X`
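For concreteness, a standalone reproduction of the command construction above, with made-up values; with pool=None the --pool flag disappears entirely, which is why maybe_pool_arg defaults to an empty string:

```python
import sys

# Reproduces run_controller_cmd from _start_controller (values made up).
job_id, dag_yaml_path, pool = 7, '/tmp/managed-job.yaml', 'my-pool'
maybe_pool_arg = (f'--pool {pool}' if pool is not None else '')
run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
                      f'{dag_yaml_path} --job-id {job_id} {maybe_pool_arg};')
print(run_controller_cmd)
# e.g. /usr/bin/python3 -u -m sky.jobs.controller /tmp/managed-job.yaml --job-id 7 --pool my-pool;
```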
@@ -111,7 +120,7 @@ def _start_controller(job_id: int, dag_yaml_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')
 
 
-def maybe_schedule_next_jobs() -> None:
+def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
     Here, "schedule" means to select job that is waiting, and allow it to
@@ -141,6 +150,13 @@ def maybe_schedule_next_jobs() -> None:
     the jobs controller instance. New job controller processes will be detached
     from the current process and there will not be a parent/child relationship.
     See launch_new_process_tree for more.
+
+    After adding the pool support, this function will be called in a per-pool
+    basis. We employ resources limitation for each pool given the number of
+    ready workers in the pool. Each pool will have its own scheduler queue,
+    indicating by the argument `pool`. Finished job in pool 1 will only trigger
+    another jobs in pool 1, but the job in pool 2 will still be waiting. When
+    the `pool` argument is None, it schedules a job regardless of the pool.
     """
     try:
         # We must use a global lock rather than a per-job lock to ensure correct
@@ -149,10 +165,11 @@ def maybe_schedule_next_jobs() -> None:
         # releasing the lock.
         with filelock.FileLock(_get_lock_path(), blocking=False):
             while True:
-                maybe_next_job = state.get_waiting_job()
+                maybe_next_job = state.get_waiting_job(pool)
                 if maybe_next_job is None:
                     # Nothing left to start, break from scheduling loop
                     break
+                actual_pool = maybe_next_job['pool']
 
                 current_state = maybe_next_job['schedule_state']
 
@@ -171,7 +188,17 @@ def maybe_schedule_next_jobs() -> None:
                     # Can't schedule anything, break from scheduling loop.
                     break
                 elif current_state == state.ManagedJobScheduleState.WAITING:
-                    if not _can_start_new_job():
+                    if not _can_start_new_job(actual_pool):
+                        # If there is no job can be scheduled in the pool, we
+                        # try to schedule another job regardless of the pool.
+                        # This is to avoid the case where the pool is scaled
+                        # down at the same time as a job is done. In this case,
+                        # we won't have any job to schedule in the pool, but
+                        # other jobs in other pool (or no pool) can still be
+                        # scheduled.
+                        if pool is not None:
+                            pool = None
+                            continue
                         # Can't schedule anything, break from scheduling loop.
                         break
 
@@ -187,7 +214,8 @@ def maybe_schedule_next_jobs() -> None:
                 dag_yaml_path = maybe_next_job['dag_yaml_path']
                 env_file_path = maybe_next_job['env_file_path']
 
-                _start_controller(job_id, dag_yaml_path, env_file_path)
+                _start_controller(job_id, dag_yaml_path, env_file_path,
+                                  actual_pool)
 
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
@@ -196,7 +224,7 @@ def maybe_schedule_next_jobs() -> None:
 
 
 def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
-               env_file_path: str, priority: int) -> None:
+               env_file_path: str, priority: int, pool: Optional[str]) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
@@ -213,9 +241,9 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
                          common_utils.get_user_hash(),
                          priority)
     if is_resume:
-        _start_controller(job_id, dag_yaml_path, env_file_path)
+        _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 @contextlib.contextmanager
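A hypothetical direct call matching the new submit_job signature (all paths and values invented for illustration); passing pool=None preserves the pre-pool behavior:

```python
# Assumes the sky.jobs.scheduler module context; values are made up.
submit_job(job_id=7,
           dag_yaml_path='/tmp/managed-job.yaml',
           original_user_yaml_path='/tmp/user-task.yaml',
           env_file_path='/tmp/controller.env',
           priority=500,
           pool='my-pool')  # None -> schedule without any pool filter
```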
@@ -251,6 +279,7 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+    pool = state.get_pool_from_job_id(job_id)
 
     try:
         yield
@@ -264,7 +293,7 @@ def scheduled_launch(job_id: int):
         with filelock.FileLock(_get_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -279,17 +308,19 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
+    pool = state.get_pool_from_job_id(job_id)
 
     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs()
+        maybe_schedule_next_jobs(pool)
 
 
 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-        maybe_schedule_next_jobs()
+        pool = state.get_pool_from_job_id(job_id)
+        maybe_schedule_next_jobs(pool)
 
 
 def _get_job_parallelism() -> int:
@@ -305,11 +336,23 @@ def _get_launch_parallelism() -> int:
     return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
 
 
-def _can_start_new_job() -> bool:
+def _can_start_new_job(pool: Optional[str]) -> bool:
     launching_jobs = state.get_num_launching_jobs()
     alive_jobs = state.get_num_alive_jobs()
-    return launching_jobs < _get_launch_parallelism(
-    ) and alive_jobs < _get_job_parallelism()
+
+    # Check basic resource limits
+    if not (launching_jobs < _get_launch_parallelism() and
+            alive_jobs < _get_job_parallelism()):
+        return False
+
+    # Check if there are available replicas in the pool
+    if pool is not None:
+        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
+        if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
+            logger.debug(f'No replicas available in pool {pool}')
+            return False
+
+    return True
 
 
 def _can_lauch_in_alive_job() -> bool:
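The pool check above implements the third limit from the module docstring: at most one alive job per ready pool worker. A self-contained toy of the admission logic, with hard-coded counts standing in for the real state/serve queries:

```python
from typing import Dict, Optional

LAUNCH_LIMIT, JOB_LIMIT = 16, 64          # stand-ins for the CPU/memory limits
launching_jobs, alive_jobs = 3, 40        # made-up current counts
alive_in_pool: Dict[str, int] = {'my-pool': 2}
ready_workers: Dict[str, int] = {'my-pool': 2}  # serve_utils.num_replicas

def can_start_new_job(pool: Optional[str]) -> bool:
    # Limits 1 and 2: global launching/alive caps.
    if not (launching_jobs < LAUNCH_LIMIT and alive_jobs < JOB_LIMIT):
        return False
    # Limit 3: one alive job per ready worker in the pool.
    if pool is not None and alive_in_pool[pool] >= ready_workers[pool]:
        return False
    return True

print(can_start_new_job(None))       # True: global limits not exceeded
print(can_start_new_job('my-pool'))  # False: both pool workers are busy
```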
@@ -332,6 +375,11 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument('--pool',
+                        type=str,
+                        required=False,
+                        default=None,
+                        help='The pool to use for the controller job.')
     parser.add_argument(
         '--priority',
         type=int,
@@ -341,4 +389,4 @@ if __name__ == '__main__':
         f' Default: {constants.DEFAULT_PRIORITY}.')
     args = parser.parse_args()
     submit_job(args.job_id, args.dag_yaml, args.user_yaml_path, args.env_file,
-               args.priority)
+               args.priority, args.pool)