skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -11,6 +11,7 @@ import json
 import os
 import pickle
 import re
+import threading
 import time
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple
@@ -47,6 +48,7 @@ _ENABLED_CLOUDS_KEY_PREFIX = 'enabled_clouds_'
 _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_LOCK = threading.Lock()
 
 Base = declarative.declarative_base()
 
@@ -241,21 +243,29 @@ def create_table(engine: sqlalchemy.engine.Engine):
                          migration_utils.GLOBAL_USER_STATE_VERSION)
 
 
+# We wrap the sqlalchemy engine initialization in a thread
+# lock to ensure that multiple threads do not initialize the
+# engine which could result in a rare race condition where
+# a session has already been created with _SQLALCHEMY_ENGINE = e1,
+# and then another thread overwrites _SQLALCHEMY_ENGINE = e2
+# which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE
 
     if _SQLALCHEMY_ENGINE is not None:
        return _SQLALCHEMY_ENGINE
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE is not None:
+            return _SQLALCHEMY_ENGINE
+        # get an engine to the db
+        engine = migration_utils.get_engine('state')
 
-    # get an engine to the db
-    engine = migration_utils.get_engine('state')
+        # run migrations if needed
+        create_table(engine)
 
-    # run migrations if needed
-    create_table(engine)
-
-    # return engine
-    _SQLALCHEMY_ENGINE = engine
-    return _SQLALCHEMY_ENGINE
+        # return engine
+        _SQLALCHEMY_ENGINE = engine
+        return _SQLALCHEMY_ENGINE
 
 
 def _init_db(func):
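
The hunk above converts initialize_and_get_db() to double-checked locking: a lock-free fast path returns the cached engine, and a second check under _SQLALCHEMY_ENGINE_LOCK ensures only one thread ever creates and caches it. A minimal standalone sketch of the same pattern, for illustration only (the resource name and factory function below are placeholders, not SkyPilot code):

import threading
from typing import Optional

_RESOURCE: Optional[object] = None
_RESOURCE_LOCK = threading.Lock()


def _create_resource() -> object:
    # Stand-in for an expensive one-time setup, e.g. building a DB engine
    # and running migrations.
    return object()


def get_resource() -> object:
    global _RESOURCE
    # Fast path: once initialized, no lock is taken.
    if _RESOURCE is not None:
        return _RESOURCE
    with _RESOURCE_LOCK:
        # Re-check under the lock: another thread may have initialized the
        # resource between our first check and acquiring the lock.
        if _RESOURCE is None:
            _RESOURCE = _create_resource()
        return _RESOURCE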
sky/jobs/__init__.py CHANGED
@@ -5,6 +5,9 @@ from sky.jobs.client.sdk import cancel
 from sky.jobs.client.sdk import dashboard
 from sky.jobs.client.sdk import download_logs
 from sky.jobs.client.sdk import launch
+from sky.jobs.client.sdk import pool_apply
+from sky.jobs.client.sdk import pool_down
+from sky.jobs.client.sdk import pool_status
 from sky.jobs.client.sdk import queue
 from sky.jobs.client.sdk import tail_logs
 from sky.jobs.constants import JOBS_CLUSTER_NAME_PREFIX_LENGTH
sky/jobs/client/sdk.py CHANGED
@@ -9,8 +9,10 @@ import click
 from sky import sky_logging
 from sky.client import common as client_common
 from sky.client import sdk
+from sky.serve.client import impl
 from sky.server import common as server_common
 from sky.server import rest
+from sky.server import versions
 from sky.server.requests import payloads
 from sky.skylet import constants
 from sky.usage import usage_lib
@@ -23,6 +25,7 @@ if typing.TYPE_CHECKING:
     import io
 
     import sky
+    from sky.serve import serve_utils
 
 logger = sky_logging.init_logger(__name__)
 
@@ -33,6 +36,8 @@ logger = sky_logging.init_logger(__name__)
 def launch(
     task: Union['sky.Task', 'sky.Dag'],
     name: Optional[str] = None,
+    pool: Optional[str] = None,
+    num_jobs: Optional[int] = None,
     # Internal only:
     # pylint: disable=invalid-name
     _need_confirmation: bool = False,
@@ -61,15 +66,35 @@ def launch(
            chain dag.
        sky.exceptions.NotSupportedError: the feature is not supported.
    """
+    remote_api_version = versions.get_remote_api_version()
+    if (pool is not None and
+            (remote_api_version is None or remote_api_version < 12)):
+        raise click.UsageError('Pools are not supported in your API server. '
+                               'Please upgrade to a newer API server to use '
+                               'pools.')
+    if pool is None and num_jobs is not None:
+        raise click.UsageError('Cannot specify num_jobs without pool.')
 
     dag = dag_utils.convert_entrypoint_to_dag(task)
     with admin_policy_utils.apply_and_use_config_in_current_request(
             dag, at_client_side=True) as dag:
         sdk.validate(dag)
         if _need_confirmation:
-            request_id = sdk.optimize(dag)
-            sdk.stream_and_get(request_id)
-            prompt = f'Launching a managed job {dag.name!r}. Proceed?'
+            job_identity = 'a managed job'
+            if pool is None:
+                request_id = sdk.optimize(dag)
+                sdk.stream_and_get(request_id)
+            else:
+                request_id = pool_status(pool)
+                pool_statuses = sdk.get(request_id)
+                if not pool_statuses:
+                    raise click.UsageError(f'Pool {pool!r} not found.')
+                resources = pool_statuses[0]['requested_resources_str']
+                click.secho(f'Use resources from pool {pool!r}: {resources}.',
+                            fg='green')
+            if num_jobs is not None:
+                job_identity = f'{num_jobs} managed jobs'
+            prompt = f'Launching {job_identity} {dag.name!r}. Proceed?'
             if prompt is not None:
                 click.confirm(prompt,
                               default=True,
@@ -81,6 +106,8 @@ def launch(
         body = payloads.JobsLaunchBody(
             task=dag_str,
             name=name,
+            pool=pool,
+            num_jobs=num_jobs,
         )
         response = server_common.make_authenticated_request(
             'POST',
@@ -158,6 +185,7 @@ def cancel(
     job_ids: Optional[List[int]] = None,
     all: bool = False,  # pylint: disable=redefined-builtin
     all_users: bool = False,
+    pool: Optional[str] = None,
 ) -> server_common.RequestId:
     """Cancels managed jobs.
 
@@ -168,6 +196,7 @@ def cancel(
        job_ids: IDs of the managed jobs to cancel.
        all: Whether to cancel all managed jobs.
        all_users: Whether to cancel all managed jobs from all users.
+       pool: Pool name to cancel.
 
    Returns:
        The request ID of the cancel request.
@@ -176,11 +205,18 @@ def cancel(
        sky.exceptions.ClusterNotUpError: the jobs controller is not up.
        RuntimeError: failed to cancel the job.
    """
+    remote_api_version = versions.get_remote_api_version()
+    if (pool is not None and
+            (remote_api_version is None or remote_api_version < 12)):
+        raise click.UsageError('Pools are not supported in your API server. '
+                               'Please upgrade to a newer API server to use '
+                               'pools.')
     body = payloads.JobsCancelBody(
         name=name,
         job_ids=job_ids,
         all=all,
         all_users=all_users,
+        pool=pool,
     )
     response = server_common.make_authenticated_request(
         'POST',
@@ -327,3 +363,44 @@ def dashboard() -> None:
     url = f'{api_server_url}/jobs/dashboard?{params}'
     logger.info(f'Opening dashboard in browser: {url}')
     webbrowser.open(url)
+
+
+@context.contextual
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_apply(
+    task: Union['sky.Task', 'sky.Dag'],
+    pool_name: str,
+    mode: 'serve_utils.UpdateMode',
+    # Internal only:
+    # pylint: disable=invalid-name
+    _need_confirmation: bool = False
+) -> server_common.RequestId:
+    """Apply a config to a pool."""
+    return impl.apply(task,
+                      pool_name,
+                      mode,
+                      pool=True,
+                      _need_confirmation=_need_confirmation)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_down(
+    pool_names: Optional[Union[str, List[str]]],
+    all: bool = False,  # pylint: disable=redefined-builtin
+    purge: bool = False,
+) -> server_common.RequestId:
+    """Delete a pool."""
+    return impl.down(pool_names, all, purge, pool=True)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@versions.minimal_api_version(12)
+def pool_status(
+    pool_names: Optional[Union[str, List[str]]],) -> server_common.RequestId:
+    """Query a pool."""
+    return impl.status(pool_names, pool=True)
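
For orientation, a hedged sketch of how these new pool entrypoints could be exercised from the Python SDK. The pool name, YAML path, UpdateMode value, and the use of sky.get() / sky.stream_and_get() to resolve the returned request IDs are assumptions modeled on the existing jobs SDK calls above, not documented guarantees:

import sky
from sky import jobs
from sky.serve import serve_utils

# Hypothetical task and pool name, for illustration only.
pool_task = sky.Task.from_yaml('pool.yaml')

# Create or update the pool; like the other SDK calls, this returns a
# request ID that can be streamed or resolved.
request_id = jobs.pool_apply(pool_task, 'my-pool',
                             mode=serve_utils.UpdateMode.ROLLING)
sky.stream_and_get(request_id)

# Inspect the pool, then launch several managed jobs into it.
statuses = sky.get(jobs.pool_status('my-pool'))
print(statuses)
job_task = sky.Task(run='echo hello from the pool')
sky.stream_and_get(jobs.launch(job_task, pool='my-pool', num_jobs=4))

# Tear the pool down when finished.
sky.get(jobs.pool_down('my-pool'))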
sky/jobs/controller.py CHANGED
@@ -30,6 +30,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -60,12 +61,13 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""
 
-    def __init__(self, job_id: int, dag_yaml: str) -> None:
+    def __init__(self, job_id: int, dag_yaml: str, pool: Optional[str]) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
+        self._pool = pool
 
         # pylint: disable=line-too-long
         # Add a unique identifier to the task environment variables, so that
@@ -99,8 +101,10 @@ class JobsController:
         task.update_envs(task_envs)
 
     def _download_log_and_stream(
-        self, task_id: Optional[int],
-        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+        self,
+        task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle],
+        job_id_on_pool_cluster: Optional[int],
     ) -> None:
         """Downloads and streams the logs of the current job with given task ID.
 
@@ -113,9 +117,14 @@ class JobsController:
                 'Skipping downloading and streaming the logs.')
             return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                            'managed_jobs')
-        log_file = controller_utils.download_and_stream_latest_job_log(
-            self._backend, handle, managed_job_logs_dir)
+                                            'managed_jobs',
+                                            f'job-id-{self._job_id}')
+        log_file = controller_utils.download_and_stream_job_log(
+            self._backend,
+            handle,
+            managed_job_logs_dir,
+            job_ids=[str(job_id_on_pool_cluster)]
+            if job_id_on_pool_cluster is not None else None)
         if log_file is not None:
             # Set the path of the log file for the current task, so it can be
             # accessed even after the job is finished
@@ -123,6 +132,12 @@ class JobsController:
                 log_file)
         logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
 
+    def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
+        if cluster_name is None:
+            return
+        if self._pool is None:
+            managed_job_utils.terminate_cluster(cluster_name)
+
     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.
 
@@ -193,10 +208,14 @@ class JobsController:
         usage_lib.messages.usage.update_task_id(task_id)
         task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
         assert task.name is not None, task
+        # Set the cluster name to None if the job is submitted
+        # to a pool. This will be updated when we later calls the `launch`
+        # or `recover` function from the strategy executor.
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, self._job_id)
+            task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id, task_id)
+            cluster_name, self._backend, task, self._job_id, task_id,
+            self._pool)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -226,6 +245,13 @@ class JobsController:
         if not is_resume:
             remote_job_submitted_at = self._strategy_executor.launch()
             assert remote_job_submitted_at is not None, remote_job_submitted_at
+        if self._pool is None:
+            job_id_on_pool_cluster = None
+        else:
+            # Update the cluster name when using cluster pool.
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(self._job_id))
+        assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)
 
         if not is_resume:
             managed_job_state.set_started(job_id=self._job_id,
@@ -279,7 +305,9 @@ class JobsController:
             if not force_transit_to_recovering:
                 try:
                     job_status = managed_job_utils.get_job_status(
-                        self._backend, cluster_name)
+                        self._backend,
+                        cluster_name,
+                        job_id=job_id_on_pool_cluster)
                 except exceptions.FetchClusterInfoError as fetch_e:
                     logger.info(
                         'Failed to fetch the job status. Start recovery.\n'
@@ -288,7 +316,7 @@ class JobsController:
 
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 success_end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 # The job is done. Set the job to SUCCEEDED first before start
                 # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
@@ -299,6 +327,8 @@ class JobsController:
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
+                    logger.info(f'Downloading logs on cluster {cluster_name} '
+                                f'and job id {job_id_on_pool_cluster}.')
                     clusters = backend_utils.get_clusters(
                         cluster_names=[cluster_name],
                         refresh=common.StatusRefreshMode.NONE,
@@ -307,7 +337,8 @@ class JobsController:
                     assert len(clusters) == 1, (clusters, cluster_name)
                     handle = clusters[0].get('handle')
                     # Best effort to download and stream the logs.
-                    self._download_log_and_stream(task_id, handle)
+                    self._download_log_and_stream(task_id, handle,
+                                                  job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
                     logger.warning(
@@ -316,7 +347,7 @@ class JobsController:
                         exc_info=True)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-                managed_job_utils.terminate_cluster(cluster_name=cluster_name)
+                self._cleanup_cluster(cluster_name)
                 return True
 
             # For single-node jobs, non-terminated job_status indicates a
@@ -364,13 +395,14 @@ class JobsController:
                     job_status == job_lib.JobStatus.FAILED_DRIVER):
                 # The user code has probably crashed, fail immediately.
                 end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 logger.info(
                     f'The user job failed ({job_status}). Please check the '
                     'logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')
 
-                self._download_log_and_stream(task_id, handle)
+                self._download_log_and_stream(task_id, handle,
+                                              job_id_on_pool_cluster)
 
                 failure_reason = (
                     'To see the details, run: '
@@ -457,7 +489,7 @@ class JobsController:
             # those clusters again may fail.
             logger.info('Cleaning up the preempted or failed cluster'
                         '...')
-            managed_job_utils.terminate_cluster(cluster_name)
+            self._cleanup_cluster(cluster_name)
 
             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
@@ -467,6 +499,10 @@ class JobsController:
                 force_transit_to_recovering=force_transit_to_recovering,
                 callback_func=callback_func)
             recovered_time = self._strategy_executor.recover()
+            if self._pool is not None:
+                cluster_name, job_id_on_pool_cluster = (
+                    managed_job_state.get_pool_submit_info(self._job_id))
+                assert cluster_name is not None
             managed_job_state.set_recovered(self._job_id,
                                             task_id,
                                             recovered_time=recovered_time,
@@ -541,11 +577,11 @@ class JobsController:
                     task=self._dag.tasks[task_id]))
 
 
-def _run_controller(job_id: int, dag_yaml: str):
+def _run_controller(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Runs the controller in a remote process for interruption."""
     # The controller needs to be instantiated in the remote process, since
     # the controller is not serializable.
-    jobs_controller = JobsController(job_id, dag_yaml)
+    jobs_controller = JobsController(job_id, dag_yaml, pool)
     jobs_controller.run()
 
 
@@ -577,7 +613,7 @@ def _handle_signal(job_id):
                 f'User sent {user_signal.value} signal.')
 
 
-def _cleanup(job_id: int, dag_yaml: str):
+def _cleanup(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Clean up the cluster(s) and storages.
 
     (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -595,9 +631,18 @@ def _cleanup(job_id: int, dag_yaml: str):
     dag, _ = _get_dag_and_name(dag_yaml)
     for task in dag.tasks:
         assert task.name is not None, task
-        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, job_id)
-        managed_job_utils.terminate_cluster(cluster_name)
+        if pool is None:
+            cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+                task.name, job_id)
+            managed_job_utils.terminate_cluster(cluster_name)
+        else:
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(job_id))
+            if cluster_name is not None:
+                if job_id_on_pool_cluster is not None:
+                    core.cancel(cluster_name=cluster_name,
+                                job_ids=[job_id_on_pool_cluster],
+                                _try_cancel_if_cluster_is_init=True)
 
         # Clean up Storages with persistent=False.
         # TODO(zhwu): this assumes the specific backend.
@@ -629,7 +674,7 @@ def _cleanup(job_id: int, dag_yaml: str):
                 f'Failed to clean up file mount {file_mount}: {e}')
 
 
-def start(job_id, dag_yaml):
+def start(job_id, dag_yaml, pool):
     """Start the controller."""
     controller_process = None
     cancelling = False
@@ -643,7 +688,8 @@ def start(job_id, dag_yaml):
         # So we can only enable daemon after we no longer need to
         # start daemon processes like Ray.
         controller_process = multiprocessing.Process(target=_run_controller,
-                                                     args=(job_id, dag_yaml))
+                                                     args=(job_id, dag_yaml,
+                                                           pool))
         controller_process.start()
         while controller_process.is_alive():
             _handle_signal(job_id)
@@ -679,7 +725,7 @@ def start(job_id, dag_yaml):
         # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
         # But anyway, a clean solution is killing the controller process
        # directly, and then cleanup the cluster job_state.
-        _cleanup(job_id, dag_yaml=dag_yaml)
+        _cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
         logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
 
         if cancelling:
@@ -717,8 +763,13 @@ if __name__ == '__main__':
     parser.add_argument('dag_yaml',
                         type=str,
                         help='The path to the user job yaml file.')
+    parser.add_argument('--pool',
+                        required=False,
+                        default=None,
+                        type=str,
+                        help='The pool to use for the controller job.')
     args = parser.parse_args()
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    start(args.job_id, args.dag_yaml)
+    start(args.job_id, args.dag_yaml, args.pool)
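
The only wire-level change to how the controller process is started is the optional --pool flag parsed above; the jobs-controller template (sky/templates/jobs-controller.yaml.j2, also touched in this release) is what would pass it through. A hedged sketch of composing such an invocation, with the module path, --job-id flag, and file names assumed purely for illustration:

import sys
from typing import List, Optional


def controller_argv(job_id: int, dag_yaml: str,
                    pool: Optional[str] = None) -> List[str]:
    # Assumed module path and --job-id flag; only the trailing --pool flag
    # is new in this release.
    argv = [
        sys.executable, '-u', '-m', 'sky.jobs.controller',
        '--job-id', str(job_id), dag_yaml
    ]
    if pool is not None:
        # With a pool, the controller submits the job to an existing cluster
        # in the pool instead of provisioning a dedicated cluster.
        argv += ['--pool', pool]
    return argv


print(controller_argv(42, 'dag-42.yaml'))
print(controller_argv(42, 'dag-42.yaml', pool='my-pool'))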