skypilot-nightly 1.0.0.dev20250808__py3-none-any.whl → 1.0.0.dev20250814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (120)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/backends/backend_utils.py +102 -8
  4. sky/backends/cloud_vm_ray_backend.py +197 -31
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +60 -77
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +19 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +14 -0
  14. sky/core.py +5 -0
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_buildManifest.js +1 -1
  18. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-37611fe6b86d274d.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-c2ea34fda4f1f8c8.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-078751bad714c017.js +11 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-664c36eda967b1ba.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/{webpack-339efec49c0cc7d0.js → webpack-00c0a51d21157453.js} +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/config.html +1 -1
  29. sky/dashboard/out/index.html +1 -1
  30. sky/dashboard/out/infra/[context].html +1 -1
  31. sky/dashboard/out/infra.html +1 -1
  32. sky/dashboard/out/jobs/[job].html +1 -1
  33. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  34. sky/dashboard/out/jobs.html +1 -1
  35. sky/dashboard/out/users.html +1 -1
  36. sky/dashboard/out/volumes.html +1 -1
  37. sky/dashboard/out/workspace/new.html +1 -1
  38. sky/dashboard/out/workspaces/[name].html +1 -1
  39. sky/dashboard/out/workspaces.html +1 -1
  40. sky/data/storage.py +11 -1
  41. sky/exceptions.py +5 -0
  42. sky/execution.py +15 -0
  43. sky/global_user_state.py +160 -2
  44. sky/jobs/constants.py +1 -1
  45. sky/jobs/controller.py +0 -1
  46. sky/jobs/recovery_strategy.py +6 -3
  47. sky/jobs/scheduler.py +23 -68
  48. sky/jobs/server/core.py +22 -12
  49. sky/jobs/state.py +6 -2
  50. sky/jobs/utils.py +17 -2
  51. sky/provision/__init__.py +4 -2
  52. sky/provision/aws/config.py +9 -0
  53. sky/provision/aws/instance.py +41 -17
  54. sky/provision/azure/instance.py +7 -4
  55. sky/provision/cudo/cudo_wrapper.py +1 -1
  56. sky/provision/cudo/instance.py +7 -4
  57. sky/provision/do/instance.py +7 -4
  58. sky/provision/fluidstack/instance.py +7 -4
  59. sky/provision/gcp/instance.py +7 -4
  60. sky/provision/hyperbolic/instance.py +7 -5
  61. sky/provision/kubernetes/instance.py +169 -6
  62. sky/provision/lambda_cloud/instance.py +7 -4
  63. sky/provision/nebius/instance.py +7 -4
  64. sky/provision/oci/instance.py +7 -4
  65. sky/provision/paperspace/instance.py +7 -5
  66. sky/provision/paperspace/utils.py +1 -1
  67. sky/provision/provisioner.py +6 -0
  68. sky/provision/runpod/instance.py +7 -4
  69. sky/provision/runpod/utils.py +1 -1
  70. sky/provision/scp/instance.py +7 -5
  71. sky/provision/vast/instance.py +7 -5
  72. sky/provision/vsphere/instance.py +7 -4
  73. sky/resources.py +1 -2
  74. sky/schemas/__init__.py +0 -0
  75. sky/schemas/api/__init__.py +0 -0
  76. sky/schemas/api/responses.py +70 -0
  77. sky/schemas/db/global_user_state/001_initial_schema.py +1 -1
  78. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  79. sky/schemas/db/serve_state/001_initial_schema.py +1 -1
  80. sky/schemas/db/spot_jobs/001_initial_schema.py +1 -1
  81. sky/schemas/generated/__init__.py +0 -0
  82. sky/schemas/generated/autostopv1_pb2.py +36 -0
  83. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  84. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  85. sky/serve/constants.py +3 -7
  86. sky/serve/replica_managers.py +15 -16
  87. sky/serve/serve_state.py +10 -0
  88. sky/serve/serve_utils.py +58 -23
  89. sky/serve/server/impl.py +15 -19
  90. sky/serve/service.py +31 -16
  91. sky/server/server.py +20 -14
  92. sky/setup_files/dependencies.py +11 -10
  93. sky/skylet/autostop_lib.py +38 -5
  94. sky/skylet/constants.py +3 -1
  95. sky/skylet/services.py +44 -0
  96. sky/skylet/skylet.py +49 -4
  97. sky/skypilot_config.py +4 -4
  98. sky/task.py +19 -16
  99. sky/templates/aws-ray.yml.j2 +2 -2
  100. sky/templates/jobs-controller.yaml.j2 +6 -0
  101. sky/users/permission.py +1 -1
  102. sky/utils/cli_utils/status_utils.py +9 -0
  103. sky/utils/command_runner.py +1 -1
  104. sky/utils/config_utils.py +29 -5
  105. sky/utils/controller_utils.py +73 -0
  106. sky/utils/db/db_utils.py +39 -1
  107. sky/utils/db/migration_utils.py +1 -1
  108. sky/utils/schemas.py +3 -0
  109. sky/volumes/server/core.py +2 -2
  110. sky/volumes/server/server.py +2 -2
  111. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/METADATA +5 -7
  112. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/RECORD +117 -108
  113. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +0 -11
  115. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +0 -1
  116. /sky/dashboard/out/_next/static/{-DXZksWqf2waNHeU9YTQe → Y0eNlwi85qGRecLTin11y}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250808.dist-info → skypilot_nightly-1.0.0.dev20250814.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py CHANGED
@@ -15,13 +15,14 @@ following section for more details).
 
 The scheduling logic limits #running jobs according to three limits:
 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
-   once, based on the number of CPUs. (See _get_launch_parallelism.) This the
-   most compute-intensive part of the job lifecycle, which is why we have an
-   additional limit.
+   once, based on the number of CPUs. This the most compute-intensive part of
+   the job lifecycle, which is why we have an additional limit.
+   See sky/utils/controller_utils.py::_get_launch_parallelism.
 2. The number of jobs that can be running at any given time, based on the amount
-   of memory. (See _get_job_parallelism.) Since the job controller is doing very
-   little once a job starts (just checking its status periodically), the most
-   significant resource it consumes is memory.
+   of memory. Since the job controller is doing very little once a job starts
+   (just checking its status periodically), the most significant resource it
+   consumes is memory.
+   See sky/utils/controller_utils.py::_get_job_parallelism.
 3. The number of jobs that can be running in a pool at any given time, based on
    the number of ready workers in the pool. (See _can_start_new_job.)
 
@@ -42,55 +43,27 @@ Nomenclature:
 
 from argparse import ArgumentParser
 import contextlib
-from functools import lru_cache
 import os
 import sys
 import time
-import typing
 from typing import Optional
 
 import filelock
 
 from sky import exceptions
 from sky import sky_logging
-from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 
-if typing.TYPE_CHECKING:
-    import psutil
-else:
-    psutil = adaptors_common.LazyImport('psutil')
-
 logger = sky_logging.init_logger('sky.jobs.controller')
 
-# The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
-# parallelism control or updating the schedule_state of any job.
-# Any code that takes this lock must conclude by calling
-# maybe_schedule_next_jobs.
-_MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
 _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
 
-# Based on testing, assume a running job uses 350MB memory.
-JOB_MEMORY_MB = 350
-# Past 2000 simultaneous jobs, we become unstable.
-# See https://github.com/skypilot-org/skypilot/issues/4649.
-MAX_JOB_LIMIT = 2000
-# Number of ongoing launches launches allowed per CPU.
-LAUNCHES_PER_CPU = 4
-
-
-@lru_cache(maxsize=1)
-def _get_lock_path() -> str:
-    # TODO(tian): Per pool lock.
-    path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    return path
-
 
 def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
                       pool: Optional[str]) -> None:
@@ -163,7 +136,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
     # parallelism control. If we cannot obtain the lock, exit immediately.
     # The current lock holder is expected to launch any jobs it can before
     # releasing the lock.
-    with filelock.FileLock(_get_lock_path(), blocking=False):
+    with filelock.FileLock(controller_utils.get_resources_lock_path(),
+                           blocking=False):
         while True:
             maybe_next_job = state.get_waiting_job(pool)
             if maybe_next_job is None:
@@ -184,7 +158,8 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
             # an ALIVE_WAITING job, but we would be able to launch a WAITING
             # job.
             if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                if not _can_lauch_in_alive_job():
+                if not (controller_utils.can_provision() or
+                        actual_pool is not None):
                     # Can't schedule anything, break from scheduling loop.
                     break
             elif current_state == state.ManagedJobScheduleState.WAITING:
@@ -234,7 +209,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
 
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
         is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
                                                 original_user_yaml_path,
                                                 env_file_path,
@@ -286,11 +261,11 @@ def scheduled_launch(job_id: int):
     except exceptions.NoClusterLaunchedError:
         # NoClusterLaunchedError is indicates that the job is in retry backoff.
         # We should transition to ALIVE_BACKOFF instead of ALIVE.
-        with filelock.FileLock(_get_lock_path()):
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive_backoff(job_id)
        raise
    else:
-        with filelock.FileLock(_get_lock_path()):
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
            state.scheduler_set_alive(job_id)
    finally:
        maybe_schedule_next_jobs(pool)
@@ -310,56 +285,36 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
        return
    pool = state.get_pool_from_job_id(job_id)
 
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
        state.scheduler_set_done(job_id, idempotent)
        maybe_schedule_next_jobs(pool)
 
 
 def _set_alive_waiting(job_id: int) -> None:
    """Should use wait_until_launch_okay() to transition to this state."""
-    with filelock.FileLock(_get_lock_path()):
+    with filelock.FileLock(controller_utils.get_resources_lock_path()):
        state.scheduler_set_alive_waiting(job_id)
        pool = state.get_pool_from_job_id(job_id)
        maybe_schedule_next_jobs(pool)
 
 
-def _get_job_parallelism() -> int:
-    job_memory = JOB_MEMORY_MB * 1024 * 1024
-
-    job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
-
-    return max(job_limit, 1)
-
-
-def _get_launch_parallelism() -> int:
-    cpus = os.cpu_count()
-    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1
-
-
 def _can_start_new_job(pool: Optional[str]) -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    alive_jobs = state.get_num_alive_jobs()
-
    # Check basic resource limits
-    if not (launching_jobs < _get_launch_parallelism() and
-            alive_jobs < _get_job_parallelism()):
+    # Pool jobs don't need to provision resources, so we skip the check.
+    if not ((controller_utils.can_provision() or pool is not None) and
+            controller_utils.can_start_new_process()):
        return False
 
-    # Check if there are available replicas in the pool
+    # Check if there are available workers in the pool
    if pool is not None:
        alive_jobs_in_pool = state.get_num_alive_jobs(pool)
-        if alive_jobs_in_pool >= serve_utils.num_replicas(pool):
-            logger.debug(f'No replicas available in pool {pool}')
+        if alive_jobs_in_pool >= len(serve_utils.get_ready_replicas(pool)):
+            logger.debug(f'No READY workers available in pool {pool}')
            return False
 
    return True
 
 
-def _can_lauch_in_alive_job() -> bool:
-    launching_jobs = state.get_num_launching_jobs()
-    return launching_jobs < _get_launch_parallelism()
-
-
 if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('dag_yaml',
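Note on the refactor above: the CPU- and memory-based limits removed from scheduler.py now live, per the updated docstring, in sky/utils/controller_utils.py, which is not included in this excerpt. A minimal sketch of the equivalent computation, assuming the removed constants keep the values shown above; the function names below are illustrative and not the actual controller_utils API:

import os

import psutil  # SkyPilot loads this lazily via its adaptors; imported directly here.

# Values taken from the constants removed from sky/jobs/scheduler.py above.
JOB_MEMORY_MB = 350      # assumed memory footprint of one running job
MAX_JOB_LIMIT = 2000     # stability cap on simultaneous jobs
LAUNCHES_PER_CPU = 4     # concurrent launches allowed per CPU


def job_parallelism() -> int:
    """Max number of simultaneously running jobs, bounded by total memory."""
    job_memory = JOB_MEMORY_MB * 1024 * 1024
    return max(min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT), 1)


def launch_parallelism() -> int:
    """Max number of simultaneous launches, bounded by CPU count."""
    cpus = os.cpu_count()
    return cpus * LAUNCHES_PER_CPU if cpus is not None else 1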
sky/jobs/server/core.py CHANGED
@@ -93,8 +93,8 @@ def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     return local_to_controller_file_mounts
 
 
-def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
-                              num_jobs: Optional[int]) -> Optional[List[int]]:
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+                              num_jobs: int) -> Optional[List[int]]:
     """Submit the managed job locally if in consolidation mode.
 
     In normal mode the managed job submission is done in the ray job submission.
@@ -109,12 +109,13 @@ def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag', pool: Optional[str],
     # Create local directory for the managed job.
     pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
     job_ids = []
+    pool = dag.pool
     pool_hash = None
     if pool is not None:
         pool_hash = serve_state.get_service_hash(pool)
         # Already checked in the sdk.
         assert pool_hash is not None, f'Pool {pool} not found'
-    for _ in range(num_jobs if num_jobs is not None else 1):
+    for _ in range(num_jobs):
         # TODO(tian): We should have a separate name for each job when
         # submitting multiple jobs. Current blocker is that we are sharing
         # the same dag object for all jobs. Maybe we can do copy.copy() for
@@ -172,9 +173,6 @@ def launch(
         handle: Optional[backends.ResourceHandle]; handle to the controller VM.
             None if dryrun.
     """
-    if pool is not None and not managed_job_utils.is_consolidation_mode():
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError('pool is only supported in consolidation mode.')
     entrypoint = task
     # using hasattr instead of isinstance to avoid importing sky
     if hasattr(task, 'metadata'):
@@ -295,8 +293,13 @@ def launch(
         controller=controller,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+    num_jobs = num_jobs if num_jobs is not None else 1
+    # We do this assignment after applying the admin policy, so that we don't
+    # need to serialize the pool name in the dag. The dag object will be
+    # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+    dag.pool = pool
     consolidation_mode_job_ids = _maybe_submit_job_locally(
-        prefix, dag, pool, num_jobs)
+        prefix, dag, num_jobs)
 
     # This is only needed for non-consolidation mode. For consolidation
     # mode, the controller uses the same catalog as API server.
@@ -373,8 +376,8 @@ def launch(
     controller_task._metadata = metadata
 
     job_identity = ''
-    if consolidation_mode_job_id is not None:
-        job_identity = f' (Job ID: {consolidation_mode_job_id})'
+    if job_rank is not None:
+        job_identity = f' (rank: {job_rank})'
     logger.info(f'{colorama.Fore.YELLOW}'
                 f'Launching managed job {dag.name!r}{job_identity} '
                 f'from jobs controller...{colorama.Style.RESET_ALL}')
@@ -428,14 +431,17 @@ def launch(
         backend.run_on_head(local_handle, run_script)
         return consolidation_mode_job_id, local_handle
 
-    if consolidation_mode_job_ids is None:
-        return _submit_one()
     if pool is None:
+        if consolidation_mode_job_ids is None:
+            return _submit_one()
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
+
     ids = []
     all_handle = None
-    for job_rank, job_id in enumerate(consolidation_mode_job_ids):
+    for job_rank in range(num_jobs):
+        job_id = (consolidation_mode_job_ids[job_rank]
+                  if consolidation_mode_job_ids is not None else None)
         jid, handle = _submit_one(job_id, job_rank)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
@@ -547,6 +553,10 @@ def _maybe_restart_controller(
             'controller'))
     with skypilot_config.local_active_workspace_ctx(
             skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+        global_user_state.add_cluster_event(
+            jobs_controller_type.value.cluster_name,
+            status_lib.ClusterStatus.INIT, 'Jobs controller restarted.',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
         handle = core.start(
             cluster_name=jobs_controller_type.value.cluster_name)
 
sky/jobs/state.py CHANGED
@@ -441,7 +441,8 @@ class ManagedJobScheduleState(enum.Enum):
 
 # === Status transition functions ===
 @_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
+def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+                 pool: Optional[str], pool_hash: Optional[str]):
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -457,7 +458,10 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
                 name=name,
                 schedule_state=ManagedJobScheduleState.INACTIVE.value,
                 workspace=workspace,
-                entrypoint=entrypoint)
+                entrypoint=entrypoint,
+                pool=pool,
+                pool_hash=pool_hash,
+            )
             session.execute(insert_stmt)
             session.commit()
 
sky/jobs/utils.py CHANGED
@@ -141,7 +141,7 @@ def _validate_consolidation_mode_config(
     if global_user_state.get_cluster_from_name(controller_cn) is not None:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.InconsistentConsolidationModeError(
-                f'{colorama.Fore.RED}Consolidation mode is '
+                f'{colorama.Fore.RED}Consolidation mode for jobs is '
                 f'enabled, but the controller cluster '
                 f'{controller_cn} is still running. Please '
                 'terminate the controller cluster first.'
@@ -179,7 +179,11 @@ def _validate_consolidation_mode_config(
 def is_consolidation_mode() -> bool:
     consolidation_mode = skypilot_config.get_nested(
         ('jobs', 'controller', 'consolidation_mode'), default_value=False)
-    _validate_consolidation_mode_config(consolidation_mode)
+    # We should only do this check on API server, as the controller will not
+    # have related config and will always seemingly disabled for consolidation
+    # mode. Check #6611 for more details.
+    if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(consolidation_mode)
     return consolidation_mode
 
 
@@ -333,6 +337,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         if handle is not None:
             try:
                 if pool is None:
+                    global_user_state.add_cluster_event(
+                        cluster_name, None, 'Cluster was cleaned up.',
+                        global_user_state.ClusterEventType.STATUS_CHANGE)
                     terminate_cluster(cluster_name)
             except Exception as e:  # pylint: disable=broad-except
                 error_msg = (
@@ -1683,6 +1690,7 @@ class ManagedJobCodeGen:
     def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
                     workspace: str, entrypoint: str) -> str:
         dag_name = managed_job_dag.name
+        pool = managed_job_dag.pool
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
             set_job_info_kwargs = {{'workspace': {workspace!r}}}
@@ -1690,6 +1698,13 @@ class ManagedJobCodeGen:
             set_job_info_kwargs = {{}}
             if managed_job_version >= 5:
                 set_job_info_kwargs['entrypoint'] = {entrypoint!r}
+            if managed_job_version >= 8:
+                from sky.serve import serve_state
+                pool_hash = None
+                if {pool!r} != None:
+                    pool_hash = serve_state.get_service_hash({pool!r})
+                set_job_info_kwargs['pool'] = {pool!r}
+                set_job_info_kwargs['pool_hash'] = pool_hash
             managed_job_state.set_job_info(
                 {job_id}, {dag_name!r}, **set_job_info_kwargs)
             """)
sky/provision/__init__.py CHANGED
@@ -73,13 +73,15 @@ def _route_to_cloud_impl(func):
 @_route_to_cloud_impl
 def query_instances(
     provider_name: str,
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional['status_lib.ClusterStatus']]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """Query instances.
 
-    Returns a dictionary of instance IDs and status.
+    Returns a dictionary of instance IDs and a tuple of (status, reason for
+    being in status if any).
 
     A None status means the instance is marked as "terminated"
     or "terminating".
sky/provision/aws/config.py CHANGED
@@ -19,6 +19,7 @@ import colorama
 from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import aws
+from sky.clouds import aws as aws_cloud
 from sky.provision import common
 from sky.provision.aws import utils
 from sky.utils import annotations
@@ -103,6 +104,14 @@ def bootstrap_instances(
         security_group_ids = _configure_security_group(ec2, vpc_id,
                                                         expected_sg_name,
                                                         extended_ip_rules)
+        if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+            # Ensure the default security group is created. This is needed
+            # to enable us to use the default security group to quickly
+            # delete the cluster. If the default security group is not created,
+            # we will need to block on instance termination to delete the
+            # security group.
+            _configure_security_group(ec2, vpc_id,
+                                      aws_cloud.DEFAULT_SECURITY_GROUP_NAME, [])
     end_time = time.time()
     elapsed = end_time - start_time
     logger.info(
sky/provision/aws/instance.py CHANGED
@@ -10,7 +10,7 @@ from multiprocessing import pool
 import re
 import time
 import typing
-from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar
 
 from sky import sky_logging
 from sky.adaptors import aws
@@ -527,6 +527,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
             to_start_count,
             associate_public_ip_address=(
                 not config.provider_config['use_internal_ips']))
+
         created_instances.extend(created_remaining_instances)
     created_instances.sort(key=lambda x: x.id)
 
@@ -585,11 +586,13 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
 # stop() and terminate() for example already implicitly assume non-terminated.
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     region = provider_config['region']
     ec2 = _default_ec2_resource(region)
@@ -608,12 +611,13 @@ def query_instances(
         'shutting-down': None,
         'terminated': None,
     }
-    statuses = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst in instances:
         status = status_map[inst.state['Name']]
         if non_terminated_only and status is None:
             continue
-        statuses[inst.id] = status
+        statuses[inst.id] = (status, None)
     return statuses
 
 
@@ -681,19 +685,39 @@ def terminate_instances(
         filters,
         included_instances=None,
         excluded_instances=None)
-    instances_list = list(instances)
-    instances.terminate()
-    if (sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME or
-            not managed_by_skypilot):
-        # Using default AWS SG or user specified security group. We don't need
-        # to wait for the termination of the instances, as we do not need to
-        # delete the SG.
-        return
-    # If ports are specified, we need to delete the newly created Security
-    # Group. Here we wait for all instances to be terminated, since the
-    # Security Group dependent on them.
-    for instance in instances_list:
-        instance.wait_until_terminated()
+    default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
+    if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
+        # Case 1: The default SG is used, we don't need to ensure instance are
+        # terminated.
+        instances.terminate()
+    elif not managed_by_skypilot:
+        # Case 2: We are not managing the non-default sg. We don't need to
+        # ensure instances are terminated.
+        instances.terminate()
+    elif (managed_by_skypilot and default_sg is not None):
+        # Case 3: We are managing the non-default sg. The default SG exists
+        # so we can move the instances to the default SG and terminate them
+        # without blocking.
+
+        # Make this multithreaded: modify all instances' SGs in parallel.
+        def modify_instance_sg(instance):
+            instance.modify_attribute(Groups=[default_sg.id])
+            logger.debug(f'Instance {instance.id} modified to use default SG:'
+                         f'{default_sg.id} for quick deletion.')
+
+        with pool.ThreadPool() as thread_pool:
+            thread_pool.map(modify_instance_sg, instances)
+            thread_pool.close()
+            thread_pool.join()
+
+        instances.terminate()
+    else:
+        # Case 4: We are managing the non-default sg. The default SG does not
+        # exist. We must block on instance termination.
+        instances.terminate()
+        for instance in instances:
+            instance.wait_until_terminated()
+
     # TODO(suquark): Currently, the implementation of GCP and Azure will
     # wait util the cluster is fully terminated, while other clouds just
     # trigger the termination process (via http call) and then return.
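The terminate_instances change above decides, based on the security group configuration, whether termination can return immediately or must block so that the SkyPilot-managed security group can be deleted afterwards. A standalone restatement of that four-way decision, useful for reasoning about the cases; the function name, return strings, and the default-SG constant below are illustrative stand-ins, not SkyPilot APIs:

# Stand-in for aws_cloud.DEFAULT_SECURITY_GROUP_NAME (actual value not shown here).
DEFAULT_SG_NAME = 'default-sg'


def termination_plan(sg_name: str, managed_by_skypilot: bool,
                     default_sg_exists: bool) -> str:
    """Describe how terminate_instances should behave for a cluster's SG setup."""
    if sg_name == DEFAULT_SG_NAME:
        # Case 1: the default SG is in use; nothing to delete, terminate and return.
        return 'terminate-and-return'
    if not managed_by_skypilot:
        # Case 2: user-supplied SG; SkyPilot will not delete it, so no waiting.
        return 'terminate-and-return'
    if default_sg_exists:
        # Case 3: detach instances from the managed SG by moving them to the
        # default SG, then terminate without waiting.
        return 'reassign-to-default-sg-then-terminate'
    # Case 4: no default SG to fall back on; wait for termination so the
    # managed SG can be deleted afterwards.
    return 'terminate-and-wait'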
sky/provision/azure/instance.py CHANGED
@@ -952,11 +952,13 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
 
 @common_utils.retry
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, cluster_name_on_cloud
 
     subscription_id = provider_config['subscription_id']
@@ -964,7 +966,8 @@ def query_instances(
     filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
     compute_client = azure.get_client('compute', subscription_id)
     nodes = _filter_instances(compute_client, resource_group, filters)
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
 
     def _fetch_and_map_status(node, resource_group: str) -> None:
         compute_client = azure.get_client('compute', subscription_id)
@@ -972,8 +975,8 @@ def query_instances(
 
         if status is None and non_terminated_only:
             return
-        statuses[node.name] = (None if status is None else
-                               status.to_cluster_status())
+        statuses[node.name] = ((None if status is None else
+                                status.to_cluster_status()), None)
 
     with pool.ThreadPool() as p:
         p.starmap(_fetch_and_map_status,
sky/provision/cudo/cudo_wrapper.py CHANGED
@@ -4,7 +4,7 @@ from typing import Dict
 
 from sky import sky_logging
 from sky.adaptors import cudo
-import sky.provision.cudo.cudo_utils as utils
+from sky.provision.cudo import cudo_utils as utils
 
 logger = sky_logging.init_logger(__name__)
 
sky/provision/cudo/instance.py CHANGED
@@ -1,7 +1,7 @@
 """Cudo Compute instance provisioning."""
 
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import sky_logging
 from sky.provision import common
@@ -191,11 +191,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
 
@@ -210,12 +212,13 @@ def query_instances(
         'done': status_lib.ClusterStatus.STOPPED,
         'poff': status_lib.ClusterStatus.STOPPED,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, inst in instances.items():
         status = status_map[inst['status']]
         if non_terminated_only and status is None:
             continue
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
     return statuses
 
 
sky/provision/do/instance.py CHANGED
@@ -1,7 +1,7 @@
 """DigitalOcean instance provisioning."""
 
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 import uuid
 
 from sky import sky_logging
@@ -242,11 +242,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     # terminated instances are not retrieved by the
     # API making `non_terminated_only` argument moot.
     del non_terminated_only
@@ -260,10 +262,11 @@ def query_instances(
         'active': status_lib.ClusterStatus.UP,
         'off': status_lib.ClusterStatus.STOPPED,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for instance_meta in instances.values():
         status = status_map[instance_meta['status']]
-        statuses[instance_meta['name']] = status
+        statuses[instance_meta['name']] = (status, None)
     return statuses
 
 
sky/provision/fluidstack/instance.py CHANGED
@@ -1,7 +1,7 @@
 """FluidStack instance provisioning."""
 import os
 import time
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from sky import authentication as auth
 from sky import exceptions
@@ -287,11 +287,13 @@ def get_cluster_info(
 
 
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
-) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
+    del cluster_name  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     instances = _filter_instances(cluster_name_on_cloud, None)
@@ -302,7 +304,8 @@ def query_instances(
         'failed': status_lib.ClusterStatus.INIT,
         'terminated': None,
     }
-    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                              Optional[str]]] = {}
     for inst_id, inst in instances.items():
         if inst['status'] not in status_map:
             with ux_utils.print_exception_no_traceback():
@@ -311,7 +314,7 @@ def query_instances(
         status = status_map.get(inst['status'], None)
         if non_terminated_only and status is None:
             continue
-        statuses[inst_id] = status
+        statuses[inst_id] = (status, None)
     return statuses
 
 