skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +4 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +74 -7
- sky/backends/cloud_vm_ray_backend.py +169 -29
- sky/catalog/cudo_catalog.py +1 -1
- sky/catalog/data_fetchers/fetch_cudo.py +1 -1
- sky/catalog/data_fetchers/fetch_nebius.py +6 -3
- sky/client/cli/command.py +62 -85
- sky/client/common.py +1 -1
- sky/client/sdk.py +69 -19
- sky/client/sdk_async.py +5 -4
- sky/clouds/aws.py +52 -1
- sky/clouds/kubernetes.py +15 -5
- sky/clouds/nebius.py +3 -1
- sky/dag.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +11 -1
- sky/exceptions.py +5 -0
- sky/execution.py +13 -10
- sky/global_user_state.py +191 -8
- sky/jobs/constants.py +1 -1
- sky/jobs/controller.py +0 -1
- sky/jobs/recovery_strategy.py +3 -3
- sky/jobs/scheduler.py +35 -87
- sky/jobs/server/core.py +82 -22
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +7 -5
- sky/jobs/utils.py +167 -8
- sky/provision/__init__.py +1 -0
- sky/provision/aws/config.py +25 -0
- sky/provision/aws/instance.py +37 -13
- sky/provision/azure/instance.py +2 -0
- sky/provision/cudo/cudo_wrapper.py +1 -1
- sky/provision/cudo/instance.py +2 -0
- sky/provision/do/instance.py +2 -0
- sky/provision/fluidstack/instance.py +2 -0
- sky/provision/gcp/instance.py +2 -0
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/kubernetes/instance.py +133 -0
- sky/provision/lambda_cloud/instance.py +2 -0
- sky/provision/nebius/instance.py +2 -0
- sky/provision/nebius/utils.py +101 -86
- sky/provision/oci/instance.py +2 -0
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/provisioner.py +13 -8
- sky/provision/runpod/instance.py +2 -0
- sky/provision/runpod/utils.py +1 -1
- sky/provision/scp/instance.py +2 -0
- sky/provision/vast/instance.py +2 -0
- sky/provision/vsphere/instance.py +2 -0
- sky/resources.py +6 -7
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +70 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/serve/constants.py +3 -7
- sky/serve/replica_managers.py +138 -117
- sky/serve/serve_state.py +42 -0
- sky/serve/serve_utils.py +58 -36
- sky/serve/server/impl.py +15 -19
- sky/serve/service.py +82 -33
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +64 -16
- sky/setup_files/dependencies.py +11 -10
- sky/skylet/autostop_lib.py +38 -5
- sky/skylet/constants.py +3 -1
- sky/skylet/services.py +44 -0
- sky/skylet/skylet.py +49 -4
- sky/task.py +19 -16
- sky/templates/aws-ray.yml.j2 +2 -2
- sky/templates/jobs-controller.yaml.j2 +6 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +20 -0
- sky/utils/config_utils.py +29 -5
- sky/utils/controller_utils.py +86 -0
- sky/utils/db/db_utils.py +17 -0
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +6 -0
- sky/utils/ux_utils.py +36 -5
- sky/volumes/server/core.py +2 -2
- sky/volumes/server/server.py +2 -2
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
- sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
- /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py
CHANGED
@@ -15,13 +15,14 @@ following section for more details).
 
 The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
-   once, based on the number of CPUs.
-
-
+   once, based on the number of CPUs. This the most compute-intensive part of
+   the job lifecycle, which is why we have an additional limit.
+   See sky/utils/controller_utils.py::_get_launch_parallelism.
 2. The number of jobs that can be running at any given time, based on the amount
-   of memory.
-
-
+   of memory. Since the job controller is doing very little once a job starts
+   (just checking its status periodically), the most significant resource it
+   consumes is memory.
+   See sky/utils/controller_utils.py::_get_job_parallelism.
 3. The number of jobs that can be running in a pool at any given time, based on
    the number of ready workers in the pool. (See _can_start_new_job.)
 
@@ -42,55 +43,27 @@ Nomenclature:
 
 from argparse import ArgumentParser
 import contextlib
-from functools import lru_cache
 import os
 import sys
 import time
-import typing
 from typing import Optional
 
 import filelock
 
 from sky import exceptions
 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
-if typing.TYPE_CHECKING:
-    import psutil
-else:
-    psutil = adaptors_common.LazyImport('psutil')
-
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
-# parallelism control or updating the schedule_state of any job.
-# Any code that takes this lock must conclude by calling
-# maybe_schedule_next_jobs.
-_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
-# Based on testing, assume a running job uses 350MB memory.
-JOB_MEMORY_MB = 350
-# Past 2000 simultaneous jobs, we become unstable.
-# See https://github.com/skypilot-org/skypilot/issues/4649.
-MAX_JOB_LIMIT = 2000
-# Number of ongoing launches launches allowed per CPU.
-LAUNCHES_PER_CPU = 4
-
-
-@lru_cache(maxsize=1)
-def _get_lock_path() -> str:
-    # TODO(tian): Per pool lock.
-    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    return path
-
 
 def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
                       pool: Optional[str]) -> None:
@@ -120,7 +93,7 @@ def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')
 
 
-def maybe_schedule_next_jobs(
+def maybe_schedule_next_jobs() -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
     Here, "schedule" means to select job that is waiting, and allow it to
@@ -163,9 +136,10 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     # parallelism control. If we cannot obtain the lock, exit immediately.
     # The current lock holder is expected to launch any jobs it can before
     # releasing the lock.
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path(),
+                           blocking=False):
         while True:
-            maybe_next_job = state.get_waiting_job(
+            maybe_next_job = state.get_waiting_job()
             if maybe_next_job is None:
                 # Nothing left to start, break from scheduling loop
                 break
@@ -184,21 +158,11 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
             # an ALIVE_WAITING job, but we would be able to launch a WAITING
            # job.
             if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                if not
+                if not controller_utils.can_provision():
                     # Can't schedule anything, break from scheduling loop.
                     break
             elif current_state == state.ManagedJobScheduleState.WAITING:
                 if not _can_start_new_job(actual_pool):
-                    # If there is no job can be scheduled in the pool, we
-                    # try to schedule another job regardless of the pool.
-                    # This is to avoid the case where the pool is scaled
-                    # down at the same time as a job is done. In this case,
-                    # we won't have any job to schedule in the pool, but
-                    # other jobs in other pool (or no pool) can still be
-                    # scheduled.
-                    if pool is not None:
-                        pool = None
-                        continue
                     # Can't schedule anything, break from scheduling loop.
                     break
 
@@ -234,7 +198,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
                                                 original_user_yaml_path,
                                                 env_file_path,
@@ -243,7 +207,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
     if is_resume:
         _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 @contextlib.contextmanager
@@ -268,6 +232,13 @@ def scheduled_launch(job_id: int):
     multiple uses of this context are nested, behavior is undefined. Don't do
     that.
     """
+    pool = state.get_pool_from_job_id(job_id)
+    # For pool, since there is no execution.launch, we don't need to have all
+    # the ALIVE_WAITING state. The state transition will be
+    # WAITING -> ALIVE -> DONE without any intermediate transitions.
+    if pool is not None:
+        yield
+        return
 
     # If we're already in LAUNCHING schedule_state, we don't need to wait.
     # This may be the case for the first launch of a job.
@@ -279,21 +250,20 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
-    pool = state.get_pool_from_job_id(job_id)
 
     try:
         yield
     except exceptions.NoClusterLaunchedError:
         # NoClusterLaunchedError is indicates that the job is in retry backoff.
         # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive_backoff(job_id)
         raise
     else:
-        with filelock.FileLock(
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -308,58 +278,36 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
-    pool = state.get_pool_from_job_id(job_id)
 
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()
 
 
 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-
-        maybe_schedule_next_jobs(pool)
-
-
-def _get_job_parallelism() -> int:
-    job_memory = JOB_MEMORY_MB * 1024 * 1024
-
-    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
-    return max(job_limit, 1)
-
-
-def _get_launch_parallelism() -> int:
-    cpus = os.cpu_count()
-    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
+        maybe_schedule_next_jobs()
 
 
 def _can_start_new_job(pool: Optional[str]) -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    alive_jobs = state.get_num_alive_jobs()
-
     # Check basic resource limits
-
-
+    # Pool jobs don't need to provision resources, so we skip the check.
+    if not ((controller_utils.can_provision() or pool is not None) and
+            controller_utils.can_start_new_process()):
        return False
 
-    # Check if there are available
+    # Check if there are available workers in the pool
     if pool is not None:
         alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= serve_utils.
-            logger.debug(f'No
+        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
+            logger.debug(f'No READY workers available in pool {pool}')
            return False
 
     return True
 
 
-def _can_lauch_in_alive_job() -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    return launching_jobs < _get_launch_parallelism()
-
-
 if __name__ == '__main__':
     parser = ArgumentParser()
     parser.add_argument('dag_yaml',
sky/jobs/server/core.py
CHANGED
@@ -93,8 +93,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     return local_to_controller_file_mounts
 
 
-def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
-                              num_jobs:
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+                              num_jobs: int) -> Optional[List[int]]:
     """Submit the managed job locally if in consolidation mode.
 
     In normal mode the managed job submission is done in the ray job submission.
@@ -109,12 +109,13 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
     # Create local directory for the managed job.
     pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
     job_ids = []
+    pool = dag.pool
     pool_hash = None
     if pool is not None:
         pool_hash = serve_state.get_service_hash(pool)
         # Already checked in the sdk.
         assert pool_hash is not None, f'Pool {pool} not found'
-    for _ in range(num_jobs
+    for _ in range(num_jobs):
         # TODO(tian): We should have a separate name for each job when
         # submitting multiple jobs. Current blocker is that we are sharing
         # the same dag object for all jobs. Maybe we can do copy.copy() for
@@ -172,9 +173,6 @@ def launch(
         handle: Optional[backends.ResourceHandle]; handle to the controller VM.
             None if dryrun.
     """
-    if pool is not None and not managed_job_utils.is_consolidation_mode():
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError('pool is only supported in consolidation mode.')
     entrypoint = task
     # using hasattr instead of isinstance to avoid importing sky
     if hasattr(task, 'metadata'):
@@ -295,8 +293,13 @@ def launch(
         controller=controller,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+    num_jobs = num_jobs if num_jobs is not None else 1
+    # We do this assignment after applying the admin policy, so that we don't
+    # need to serialize the pool name in the dag. The dag object will be
+    # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+    dag.pool = pool
     consolidation_mode_job_ids = _maybe_submit_job_locally(
-        prefix, dag,
+        prefix, dag, num_jobs)
 
     # This is only needed for non-consolidation mode. For consolidation
     # mode, the controller uses the same catalog as API server.
@@ -373,8 +376,8 @@ def launch(
         controller_task._metadata = metadata
 
         job_identity = ''
-        if
-            job_identity = f' (
+        if job_rank is not None:
+            job_identity = f' (rank: {job_rank})'
         logger.info(f'{colorama.Fore.YELLOW}'
                     f'Launching managed job {dag.name!r}{job_identity} '
                     f'from jobs controller...{colorama.Style.RESET_ALL}')
@@ -428,14 +431,17 @@ def launch(
         backend.run_on_head(local_handle, run_script)
         return consolidation_mode_job_id, local_handle
 
-    if consolidation_mode_job_ids is None:
-        return _submit_one()
     if pool is None:
+        if consolidation_mode_job_ids is None:
+            return _submit_one()
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
+
     ids = []
     all_handle = None
-    for job_rank
+    for job_rank in range(num_jobs):
+        job_id = (consolidation_mode_job_ids[job_rank]
+                  if consolidation_mode_job_ids is not None else None)
         jid, handle = _submit_one(job_id, job_rank)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
@@ -491,7 +497,8 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -507,7 +514,14 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -562,10 +576,18 @@ def _maybe_restart_controller(
 
 
 @usage_lib.entrypoint
-def queue(
-
-
+def queue(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.
 
@@ -595,6 +617,17 @@ def queue(refresh: bool,
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
     handle = _maybe_restart_controller(refresh,
                                        stopped_message='No in-progress '
                                        'managed jobs.',
@@ -603,7 +636,22 @@ def queue(refresh: bool,
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
-
+    user_hashes: Optional[List[Optional[str]]] = None
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -616,8 +664,14 @@ def queue(refresh: bool,
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
 
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if not all_users:
 
         def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
@@ -630,7 +684,6 @@ def queue(refresh: bool,
 
         jobs = list(filter(user_hash_matches_or_missing, jobs))
 
-    accessible_workspaces = workspaces_core.get_workspaces()
     jobs = list(
         filter(
             lambda job: job.get('workspace', skylet_constants.
@@ -649,7 +702,14 @@ def queue(refresh: bool,
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]
 
-    return jobs
+    return managed_job_utils.filter_jobs(jobs,
+                                         workspace_match,
+                                         name_match,
+                                         pool_match,
+                                         page=page,
+                                         limit=limit,
+                                         user_match=user_match,
+                                         enable_user_match=True)
 
 
 @usage_lib.entrypoint
sky/jobs/server/utils.py
CHANGED
@@ -62,7 +62,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version
 
     # Load and filter jobs locally using existing method
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0
 
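
The call sites above (core.py and utils.py) now unpack a three-element result from managed_job_utils.load_managed_job_queue(): (jobs, total, result_type). The sketch below shows how a caller can branch on the result type; only ManagedJobQueueResultType.DICT is confirmed by this diff, while the LEGACY_LIST member name and the helper itself are assumptions for illustration.

import enum
from typing import Any, Callable, Dict, List, Tuple


class ManagedJobQueueResultType(enum.Enum):
    DICT = 'dict'          # New controllers: server-side filtering done.
    LEGACY_LIST = 'list'   # Assumed: old controllers, filter client-side.


def consume_queue_result(
    jobs: List[Dict[str, Any]],
    total: int,
    result_type: ManagedJobQueueResultType,
    client_side_filter: Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]],
) -> Tuple[List[Dict[str, Any]], int]:
    if result_type == ManagedJobQueueResultType.DICT:
        # Jobs and total are already scoped (user/workspace/page) on the
        # controller; return them as-is.
        return jobs, total
    # Backward-compatibility path (to be removed after 0.12.0 per the TODOs
    # in the diff): apply the same filters locally, as queue() does.
    filtered = client_side_filter(jobs)
    return filtered, len(filtered)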
sky/jobs/state.py
CHANGED
@@ -441,7 +441,8 @@ class ManagedJobScheduleState(enum.Enum):
 
 # === Status transition functions ===
 @_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str
+def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+                 pool: Optional[str], pool_hash: Optional[str]):
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -457,7 +458,10 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
                 name=name,
                 schedule_state=ManagedJobScheduleState.INACTIVE.value,
                 workspace=workspace,
-                entrypoint=entrypoint
+                entrypoint=entrypoint,
+                pool=pool,
+                pool_hash=pool_hash,
+            )
            session.execute(insert_stmt)
            session.commit()
 
@@ -1524,7 +1528,7 @@ def get_nonterminal_job_ids_by_pool(pool: str,
 
 
 @_init_db
-def get_waiting_job(
+def get_waiting_job() -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.
 
     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1555,8 +1559,6 @@ def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
         job_info_table.c.priority >= sqlalchemy.func.coalesce(
             max_priority_subquery, 0),
     ]
-    if pool is not None:
-        select_conds.append(job_info_table.c.pool == pool)
     query = sqlalchemy.select(
         job_info_table.c.spot_job_id,
         job_info_table.c.schedule_state,