skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (179)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend_utils.py +74 -7
  4. sky/backends/cloud_vm_ray_backend.py +169 -29
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +62 -85
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +69 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +15 -5
  14. sky/clouds/nebius.py +3 -1
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  23. sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  25. sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
  27. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  29. sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  34. sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
  36. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  37. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
  39. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
  42. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
  54. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  55. sky/dashboard/out/clusters/[cluster].html +1 -1
  56. sky/dashboard/out/clusters.html +1 -1
  57. sky/dashboard/out/config.html +1 -1
  58. sky/dashboard/out/index.html +1 -1
  59. sky/dashboard/out/infra/[context].html +1 -1
  60. sky/dashboard/out/infra.html +1 -1
  61. sky/dashboard/out/jobs/[job].html +1 -1
  62. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -1
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage.py +11 -1
  70. sky/exceptions.py +5 -0
  71. sky/execution.py +13 -10
  72. sky/global_user_state.py +191 -8
  73. sky/jobs/constants.py +1 -1
  74. sky/jobs/controller.py +0 -1
  75. sky/jobs/recovery_strategy.py +3 -3
  76. sky/jobs/scheduler.py +35 -87
  77. sky/jobs/server/core.py +82 -22
  78. sky/jobs/server/utils.py +1 -1
  79. sky/jobs/state.py +7 -5
  80. sky/jobs/utils.py +167 -8
  81. sky/provision/__init__.py +1 -0
  82. sky/provision/aws/config.py +25 -0
  83. sky/provision/aws/instance.py +37 -13
  84. sky/provision/azure/instance.py +2 -0
  85. sky/provision/cudo/cudo_wrapper.py +1 -1
  86. sky/provision/cudo/instance.py +2 -0
  87. sky/provision/do/instance.py +2 -0
  88. sky/provision/fluidstack/instance.py +2 -0
  89. sky/provision/gcp/instance.py +2 -0
  90. sky/provision/hyperbolic/instance.py +2 -1
  91. sky/provision/kubernetes/instance.py +133 -0
  92. sky/provision/lambda_cloud/instance.py +2 -0
  93. sky/provision/nebius/instance.py +2 -0
  94. sky/provision/nebius/utils.py +101 -86
  95. sky/provision/oci/instance.py +2 -0
  96. sky/provision/paperspace/instance.py +2 -1
  97. sky/provision/paperspace/utils.py +1 -1
  98. sky/provision/provisioner.py +13 -8
  99. sky/provision/runpod/instance.py +2 -0
  100. sky/provision/runpod/utils.py +1 -1
  101. sky/provision/scp/instance.py +2 -0
  102. sky/provision/vast/instance.py +2 -0
  103. sky/provision/vsphere/instance.py +2 -0
  104. sky/resources.py +6 -7
  105. sky/schemas/__init__.py +0 -0
  106. sky/schemas/api/__init__.py +0 -0
  107. sky/schemas/api/responses.py +70 -0
  108. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  109. sky/schemas/generated/__init__.py +0 -0
  110. sky/schemas/generated/autostopv1_pb2.py +36 -0
  111. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  112. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  113. sky/serve/constants.py +3 -7
  114. sky/serve/replica_managers.py +138 -117
  115. sky/serve/serve_state.py +42 -0
  116. sky/serve/serve_utils.py +58 -36
  117. sky/serve/server/impl.py +15 -19
  118. sky/serve/service.py +82 -33
  119. sky/server/constants.py +1 -1
  120. sky/server/requests/payloads.py +6 -0
  121. sky/server/requests/serializers/decoders.py +12 -2
  122. sky/server/requests/serializers/encoders.py +10 -2
  123. sky/server/server.py +64 -16
  124. sky/setup_files/dependencies.py +11 -10
  125. sky/skylet/autostop_lib.py +38 -5
  126. sky/skylet/constants.py +3 -1
  127. sky/skylet/services.py +44 -0
  128. sky/skylet/skylet.py +49 -4
  129. sky/task.py +19 -16
  130. sky/templates/aws-ray.yml.j2 +2 -2
  131. sky/templates/jobs-controller.yaml.j2 +6 -0
  132. sky/templates/kubernetes-ray.yml.j2 +1 -0
  133. sky/utils/command_runner.py +1 -1
  134. sky/utils/common_utils.py +20 -0
  135. sky/utils/config_utils.py +29 -5
  136. sky/utils/controller_utils.py +86 -0
  137. sky/utils/db/db_utils.py +17 -0
  138. sky/utils/db/migration_utils.py +1 -1
  139. sky/utils/log_utils.py +14 -5
  140. sky/utils/resources_utils.py +25 -1
  141. sky/utils/schemas.py +6 -0
  142. sky/utils/ux_utils.py +36 -5
  143. sky/volumes/server/core.py +2 -2
  144. sky/volumes/server/server.py +2 -2
  145. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
  146. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
  147. sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  149. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  150. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  151. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  155. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  156. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  158. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  160. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  161. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  163. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  164. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  166. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  169. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  170. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
  175. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
  176. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
  177. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
  178. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
  179. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/serve/replica_managers.py CHANGED
@@ -1,7 +1,5 @@
 """ReplicaManager: handles the creation and deletion of endpoint replicas."""
-import collections
 import dataclasses
-import enum
 import functools
 import multiprocessing
 from multiprocessing import pool as mp_pool
@@ -13,16 +11,16 @@ import typing
 from typing import Any, Dict, List, Optional, Tuple
 
 import colorama
-import psutil
+import filelock
 import requests
 
-import sky
 from sky import backends
 from sky import core
 from sky import exceptions
 from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants as serve_constants
@@ -41,7 +39,6 @@ from sky.utils import status_lib
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-    from sky import resources
     from sky.serve import service_spec
 
 logger = sky_logging.init_logger(__name__)
@@ -51,10 +48,6 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120
 
-# Since sky.launch is very resource demanding, we limit the number of
-# concurrent sky.launch process to avoid overloading the machine.
-_MAX_NUM_LAUNCH = psutil.cpu_count() * 2
-
 
 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
@@ -81,7 +74,7 @@ def launch_cluster(replica_id: int,
     try:
         config = common_utils.read_yaml(
             os.path.expanduser(service_task_yaml_path))
-        task = sky.Task.from_yaml_config(config)
+        task = task_lib.Task.from_yaml_config(config)
         if resources_override is not None:
             resources = task.resources
             overrided_resources = [
@@ -177,7 +170,7 @@ def terminate_cluster(cluster_name: str,
 
 def _get_resources_ports(service_task_yaml_path: str) -> str:
     """Get the resources ports used by the task."""
-    task = sky.Task.from_yaml(service_task_yaml_path)
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     # Already checked all ports are valid in sky.serve.core.up
     assert task.resources, task
     assert task.service is not None, task
@@ -195,7 +188,7 @@ def _should_use_spot(service_task_yaml_path: str,
     if use_spot_override is not None:
         assert isinstance(use_spot_override, bool)
         return use_spot_override
-    task = sky.Task.from_yaml(service_task_yaml_path)
+    task = task_lib.Task.from_yaml(service_task_yaml_path)
     spot_use_resources = [
         resources for resources in task.resources if resources.use_spot
     ]
@@ -204,6 +197,12 @@ def _should_use_spot(service_task_yaml_path: str,
     return len(spot_use_resources) == len(task.resources)
 
 
+# Every function that calls serve_state.add_or_update_replica should acquire
+# this lock. It is to prevent race condition when the replica status is updated
+# by multiple threads at the same time. The modification of replica info is
+# 2 database calls: read the whole replica info object, unpickle it, and modify
+# corresponding fields. Then it is write back to the database. We need to ensure
+# the read-modify-write operation is atomic.
 def with_lock(func):
 
     @functools.wraps(func)
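
The body of `with_lock` is elided by the diff context above. Given the new `filelock` import and the `filelock.FileLock(controller_utils.get_resources_lock_path())` call that appears later in this file, the guard described in the new comment amounts to a file-lock decorator; a minimal sketch (the decorator body, import path, and lock scope here are assumptions, not the shipped implementation):

    import functools

    import filelock

    from sky.utils import controller_utils


    def with_lock(func):
        """Serialize read-modify-write access to the replica table."""

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Hold a cross-process file lock so that reading, unpickling,
            # mutating, and re-writing a ReplicaInfo row cannot interleave.
            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                return func(*args, **kwargs)

        return wrapper
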
@@ -214,22 +213,6 @@ def with_lock(func):
         return wrapper
 
 
-class ProcessStatus(enum.Enum):
-    """Process status."""
-
-    # The process is running
-    RUNNING = 'RUNNING'
-
-    # The process is finished and succeeded
-    SUCCEEDED = 'SUCCEEDED'
-
-    # The process is interrupted
-    INTERRUPTED = 'INTERRUPTED'
-
-    # The process failed
-    FAILED = 'FAILED'
-
-
 @dataclasses.dataclass
 class ReplicaStatusProperty:
     """Some properties that determine replica status.
@@ -241,15 +224,16 @@ class ReplicaStatusProperty:
         first_ready_time: The first time the service is ready.
         sky_down_status: Process status of sky.down.
     """
-    # None means sky.launch is not called yet.
-    sky_launch_status: Optional[ProcessStatus] = None
+    # sky.launch will always be scheduled on creation of ReplicaStatusProperty.
+    sky_launch_status: common_utils.ProcessStatus = (
+        common_utils.ProcessStatus.SCHEDULED)
     user_app_failed: bool = False
     service_ready_now: bool = False
    # None means readiness probe is not succeeded yet;
    # -1 means the initial delay seconds is exceeded.
     first_ready_time: Optional[float] = None
     # None means sky.down is not called yet.
-    sky_down_status: Optional[ProcessStatus] = None
+    sky_down_status: Optional[common_utils.ProcessStatus] = None
     # Whether the termination is caused by autoscaler's decision
     is_scale_down: bool = False
     # The replica's spot instance was preempted.
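
For reference, the module-local `ProcessStatus` enum removed above now lives in `common_utils` (the file list shows `sky/utils/common_utils.py +20 -0`). Based on the removed members plus the new `SCHEDULED` state referenced throughout this diff, the shared enum presumably looks like this sketch (the docstring and exact member order are assumptions):

    import enum


    class ProcessStatus(enum.Enum):
        """Status of a sky.launch / sky.down subprocess."""
        # The process is created but not started yet (new in this release).
        SCHEDULED = 'SCHEDULED'
        # The process is running.
        RUNNING = 'RUNNING'
        # The process is finished and succeeded.
        SUCCEEDED = 'SUCCEEDED'
        # The process is interrupted.
        INTERRUPTED = 'INTERRUPTED'
        # The process failed.
        FAILED = 'FAILED'
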
@@ -304,7 +288,7 @@ class ReplicaStatusProperty:
         (1) Job status;
         (2) Readiness probe.
         """
-        if self.sky_launch_status != ProcessStatus.SUCCEEDED:
+        if self.sky_launch_status != common_utils.ProcessStatus.SUCCEEDED:
             return False
         if self.sky_down_status is not None:
             return False
@@ -318,37 +302,43 @@ class ReplicaStatusProperty:
 
     def to_replica_status(self) -> serve_state.ReplicaStatus:
         """Convert status property to human-readable replica status."""
-        if self.sky_launch_status is None:
+        # Backward compatibility. Before we introduce ProcessStatus.SCHEDULED,
+        # we use None to represent sky.launch is not called yet.
+        if (self.sky_launch_status is None or
+                self.sky_launch_status == common_utils.ProcessStatus.SCHEDULED):
             # Pending to launch
             return serve_state.ReplicaStatus.PENDING
-        if self.sky_launch_status == ProcessStatus.RUNNING:
-            if self.sky_down_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.RUNNING:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                 return serve_state.ReplicaStatus.FAILED_CLEANUP
-            if self.sky_down_status == ProcessStatus.SUCCEEDED:
+            if self.sky_down_status == common_utils.ProcessStatus.SUCCEEDED:
                 # This indicate it is a scale_down with correct teardown.
                 # Should have been cleaned from the replica table.
                 return serve_state.ReplicaStatus.UNKNOWN
             # Still launching
             return serve_state.ReplicaStatus.PROVISIONING
-        if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+        if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
             # sky.down is running and a scale down interrupted sky.launch
             return serve_state.ReplicaStatus.SHUTTING_DOWN
         if self.sky_down_status is not None:
             if self.preempted:
                 # Replica (spot) is preempted
                 return serve_state.ReplicaStatus.PREEMPTED
-            if self.sky_down_status == ProcessStatus.RUNNING:
+            if self.sky_down_status == common_utils.ProcessStatus.SCHEDULED:
+                # sky.down is scheduled to run, but not started yet.
+                return serve_state.ReplicaStatus.SHUTTING_DOWN
+            if self.sky_down_status == common_utils.ProcessStatus.RUNNING:
                 # sky.down is running
                 return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_launch_status == ProcessStatus.INTERRUPTED:
+            if self.sky_launch_status == common_utils.ProcessStatus.INTERRUPTED:
                 return serve_state.ReplicaStatus.SHUTTING_DOWN
-            if self.sky_down_status == ProcessStatus.FAILED:
+            if self.sky_down_status == common_utils.ProcessStatus.FAILED:
                 # sky.down failed
                 return serve_state.ReplicaStatus.FAILED_CLEANUP
             if self.user_app_failed:
                 # Failed on user setup/run
                 return serve_state.ReplicaStatus.FAILED
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
             # sky.launch failed
             return serve_state.ReplicaStatus.FAILED_PROVISION
         if self.first_ready_time is None:
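
To make the new mapping concrete, a small worked example (a sketch; it assumes the defaults shown above, the enum sketch earlier, and these import paths):

    from sky.serve import serve_state
    from sky.serve.replica_managers import ReplicaStatusProperty
    from sky.utils import common_utils

    # A freshly created replica: launch is SCHEDULED, down not called yet.
    prop = ReplicaStatusProperty()
    assert prop.to_replica_status() == serve_state.ReplicaStatus.PENDING

    # A replica whose down process is queued but has not started: the new
    # SCHEDULED down state reads as SHUTTING_DOWN, same as RUNNING.
    prop.sky_launch_status = common_utils.ProcessStatus.SUCCEEDED
    prop.sky_down_status = common_utils.ProcessStatus.SCHEDULED
    assert prop.to_replica_status() == serve_state.ReplicaStatus.SHUTTING_DOWN
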
@@ -364,7 +354,7 @@ class ReplicaStatusProperty:
             # This indicate it is a scale_down with correct teardown.
             # Should have been cleaned from the replica table.
             return serve_state.ReplicaStatus.UNKNOWN
-        if self.sky_launch_status == ProcessStatus.FAILED:
+        if self.sky_launch_status == common_utils.ProcessStatus.FAILED:
             # sky.launch failed
             # The down process has not been started if it reaches here,
             # due to the `if self.sky_down_status is not None`` check above.
@@ -688,7 +678,7 @@ class SkyPilotReplicaManager(ReplicaManager):
                  service_task_yaml_path: str) -> None:
         super().__init__(service_name, spec)
         self.service_task_yaml_path = service_task_yaml_path
-        task = sky.Task.from_yaml(service_task_yaml_path)
+        task = task_lib.Task.from_yaml(service_task_yaml_path)
         self._spot_placer: Optional[spot_placer.SpotPlacer] = (
             spot_placer.SpotPlacer.from_task(spec, task))
         # TODO(tian): Store launch/down pid in the replica table, to make the
@@ -708,6 +698,7 @@ class SkyPilotReplicaManager(ReplicaManager):
 
         self._recover_replica_operations()
 
+    @with_lock
     def _recover_replica_operations(self):
         """Let's see are there something to do for ReplicaManager in a
         recovery run"""
@@ -748,9 +739,8 @@ class SkyPilotReplicaManager(ReplicaManager):
     # Replica management functions #
     ################################
 
-    # Adding lock here to make sure spot placer's current locations are
-    # consistent with the replicas' status.
-    @with_lock
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
     def _launch_replica(
         self,
         replica_id: int,
@@ -806,11 +796,61 @@ class SkyPilotReplicaManager(ReplicaManager):
         # to avoid too many sky.launch running at the same time.
         self._launch_process_pool[replica_id] = p
 
+    @with_lock
     def scale_up(self,
                  resources_override: Optional[Dict[str, Any]] = None) -> None:
         self._launch_replica(self._next_replica_id, resources_override)
         self._next_replica_id += 1
 
+    def _handle_sky_down_finish(self, info: ReplicaInfo, exitcode: int) -> None:
+        if exitcode != 0:
+            logger.error(f'Down process for replica {info.replica_id} '
+                         f'exited abnormally with code {exitcode}.')
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.FAILED)
+        else:
+            info.status_property.sky_down_status = (
+                common_utils.ProcessStatus.SUCCEEDED)
+        # Failed replica still count as a replica. In our current design, we
+        # want to fail early if user code have any error. This will prevent
+        # infinite loop of teardown and re-provision. However, there is a
+        # special case that if the replica is UP for longer than
+        # initial_delay_seconds, we assume it is just some random failure and
+        # we should restart the replica. Please refer to the implementation of
+        # `is_scale_down_succeeded` for more details.
+        # TODO(tian): Currently, restart replicas that failed within
+        # initial_delay_seconds is not supported. We should add it
+        # later when we support `sky serve update`.
+        removal_reason = None
+        if info.status_property.is_scale_down:
+            # This means the cluster is deleted due to an autoscaler
+            # decision or the cluster is recovering from preemption.
+            # Delete the replica info so it won't count as a replica.
+            if info.status_property.preempted:
+                removal_reason = 'for preemption recovery'
+            else:
+                removal_reason = 'normally'
+        # Don't keep failed record for version mismatch replicas,
+        # since user should fixed the error before update.
+        elif info.version != self.latest_version:
+            removal_reason = 'for version outdated'
+        elif info.status_property.purged:
+            removal_reason = 'for purge'
+        elif info.status_property.failed_spot_availability:
+            removal_reason = 'for spot availability failure'
+        else:
+            logger.info(f'Termination of replica {info.replica_id} '
+                        'finished. Replica info is kept since some '
+                        'failure detected.')
+        serve_state.add_or_update_replica(self._service_name,
                                           info.replica_id, info)
+        if removal_reason is not None:
+            serve_state.remove_replica(self._service_name, info.replica_id)
+            logger.info(f'Replica {info.replica_id} removed from the '
+                        f'replica table {removal_reason}.')
+
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
     def _terminate_replica(self,
                            replica_id: int,
                            sync_down_logs: bool,
@@ -828,7 +868,8 @@ class SkyPilotReplicaManager(ReplicaManager):
             info = serve_state.get_replica_info_from_id(self._service_name,
                                                         replica_id)
             assert info is not None
-            info.status_property.sky_launch_status = ProcessStatus.INTERRUPTED
+            info.status_property.sky_launch_status = (
+                common_utils.ProcessStatus.INTERRUPTED)
             serve_state.add_or_update_replica(self._service_name, replica_id,
                                               info)
             launch_process = self._launch_process_pool[replica_id]
@@ -872,8 +913,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             assert isinstance(handle, backends.CloudVmRayResourceHandle)
             replica_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                                 'replica_jobs')
-            job_log_file_name = (controller_utils.download_and_stream_job_log(
-                backend, handle, replica_job_logs_dir))
+            job_ids = ['1'] if self._is_pool else None
+            job_log_file_name = controller_utils.download_and_stream_job_log(
+                backend, handle, replica_job_logs_dir, job_ids)
             if job_log_file_name is not None:
                 logger.info(f'\n== End of logs (Replica: {replica_id}) ==')
                 with open(log_file_name, 'a',
@@ -899,18 +941,30 @@ class SkyPilotReplicaManager(ReplicaManager):
 
         logger.info(f'preempted: {info.status_property.preempted}, '
                     f'replica_id: {replica_id}')
+        info.status_property.is_scale_down = is_scale_down
+        info.status_property.purged = purge
+
+        # If the cluster does not exist, it means either the cluster never
+        # exists (e.g., the cluster is scaled down before it gets a chance to
+        # provision) or the cluster is preempted and cleaned up by the status
+        # refresh. In this case, we skip spawning a new down process to save
+        # controller resources.
+        if global_user_state.get_cluster_from_name(info.cluster_name) is None:
+            self._handle_sky_down_finish(info, exitcode=0)
+            return
+
+        # Otherwise, start the process to terminate the cluster.
         p = multiprocessing.Process(
             target=ux_utils.RedirectOutputForProcess(terminate_cluster,
                                                      log_file_name, 'a').run,
            args=(info.cluster_name, replica_drain_delay_seconds),
         )
-        info.status_property.sky_down_status = ProcessStatus.RUNNING
-        info.status_property.is_scale_down = is_scale_down
-        info.status_property.purged = purge
+        info.status_property.sky_down_status = (
+            common_utils.ProcessStatus.SCHEDULED)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
-        p.start()
         self._down_process_pool[replica_id] = p
 
+    @with_lock
     def scale_down(self, replica_id: int, purge: bool = False) -> None:
         self._terminate_replica(
             replica_id,
@@ -919,6 +973,8 @@ class SkyPilotReplicaManager(ReplicaManager):
             is_scale_down=True,
             purge=purge)
 
+    # We don't need to add lock here since every caller of this function
+    # will acquire the lock.
     def _handle_preemption(self, info: ReplicaInfo) -> bool:
         """Handle preemption of the replica if any error happened.
 
@@ -981,7 +1037,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         # To avoid `dictionary changed size during iteration` error.
         launch_process_pool_snapshot = list(self._launch_process_pool.items())
         for replica_id, p in launch_process_pool_snapshot:
-            if not p.is_alive():
+            if p.is_alive():
+                continue
+            with filelock.FileLock(controller_utils.get_resources_lock_path()):
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
                 assert info is not None, replica_id
@@ -989,11 +1047,10 @@ class SkyPilotReplicaManager(ReplicaManager):
                 schedule_next_jobs = False
                 if info.status == serve_state.ReplicaStatus.PENDING:
                     # sky.launch not started yet
-                    if (serve_state.total_number_provisioning_replicas() <
-                            _MAX_NUM_LAUNCH):
+                    if controller_utils.can_provision():
                         p.start()
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.RUNNING)
+                            common_utils.ProcessStatus.RUNNING)
                 else:
                     # sky.launch finished
                     # TODO(tian): Try-catch in process, and have an enum return
@@ -1010,11 +1067,11 @@ class SkyPilotReplicaManager(ReplicaManager):
                             f'exited abnormally with code {p.exitcode}.'
                             ' Terminating...')
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.FAILED)
+                            common_utils.ProcessStatus.FAILED)
                         error_in_sky_launch = True
                     else:
                         info.status_property.sky_launch_status = (
-                            ProcessStatus.SUCCEEDED)
+                            common_utils.ProcessStatus.SUCCEEDED)
                         schedule_next_jobs = True
                     if self._spot_placer is not None and info.is_spot:
                         # TODO(tian): Currently, we set the location to
@@ -1036,69 +1093,36 @@ class SkyPilotReplicaManager(ReplicaManager):
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
                 if schedule_next_jobs and self._is_pool:
-                    jobs_scheduler.maybe_schedule_next_jobs(
-                        pool=self._service_name)
+                    jobs_scheduler.maybe_schedule_next_jobs()
                 if error_in_sky_launch:
                     # Teardown after update replica info since
                     # _terminate_replica will update the replica info too.
                     self._terminate_replica(replica_id,
                                             sync_down_logs=True,
                                             replica_drain_delay_seconds=0)
+            # Try schedule next job after acquiring the lock.
+            jobs_scheduler.maybe_schedule_next_jobs()
 
         down_process_pool_snapshot = list(self._down_process_pool.items())
         for replica_id, p in down_process_pool_snapshot:
-            if not p.is_alive():
-                logger.info(
-                    f'Terminate process for replica {replica_id} finished.')
-                del self._down_process_pool[replica_id]
-                info = serve_state.get_replica_info_from_id(
-                    self._service_name, replica_id)
-                assert info is not None, replica_id
-                if p.exitcode != 0:
-                    logger.error(f'Down process for replica {replica_id} '
-                                 f'exited abnormally with code {p.exitcode}.')
-                    info.status_property.sky_down_status = (
-                        ProcessStatus.FAILED)
-                else:
+            if p.is_alive():
+                continue
+            info = serve_state.get_replica_info_from_id(self._service_name,
                                                         replica_id)
+            assert info is not None, replica_id
+            if (info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                # sky.down not started yet
+                if controller_utils.can_terminate():
+                    p.start()
                     info.status_property.sky_down_status = (
-                        ProcessStatus.SUCCEEDED)
-                # Failed replica still count as a replica. In our current
-                # design, we want to fail early if user code have any error.
-                # This will prevent infinite loop of teardown and
-                # re-provision. However, there is a special case that if the
-                # replica is UP for longer than initial_delay_seconds, we
-                # assume it is just some random failure and we should restart
-                # the replica. Please refer to the implementation of
-                # `is_scale_down_succeeded` for more details.
-                # TODO(tian): Currently, restart replicas that failed within
-                # initial_delay_seconds is not supported. We should add it
-                # later when we support `sky serve update`.
-                removal_reason = None
-                if info.status_property.is_scale_down:
-                    # This means the cluster is deleted due to an autoscaler
-                    # decision or the cluster is recovering from preemption.
-                    # Delete the replica info so it won't count as a replica.
-                    if info.status_property.preempted:
-                        removal_reason = 'for preemption recovery'
-                    else:
-                        removal_reason = 'normally'
-                # Don't keep failed record for version mismatch replicas,
-                # since user should fixed the error before update.
-                elif info.version != self.latest_version:
-                    removal_reason = 'for version outdated'
-                elif info.status_property.purged:
-                    removal_reason = 'for purge'
-                elif info.status_property.failed_spot_availability:
-                    removal_reason = 'for spot availability failure'
-                else:
-                    logger.info(f'Termination of replica {replica_id} '
-                                'finished. Replica info is kept since some '
-                                'failure detected.')
+                        common_utils.ProcessStatus.RUNNING)
                 serve_state.add_or_update_replica(self._service_name,
                                                   replica_id, info)
-                if removal_reason is not None:
-                    serve_state.remove_replica(self._service_name, replica_id)
-                    logger.info(f'Replica {replica_id} removed from the '
-                                f'replica table {removal_reason}.')
+            else:
+                logger.info(
+                    f'Terminate process for replica {replica_id} finished.')
+                del self._down_process_pool[replica_id]
+                self._handle_sky_down_finish(info, exitcode=p.exitcode)
 
         # Clean old version
         replica_infos = serve_state.get_replica_infos(self._service_name)
@@ -1394,12 +1418,9 @@ class SkyPilotReplicaManager(ReplicaManager):
             old_config_any_of = old_config.get('resources',
                                                {}).pop('any_of', [])
 
-            def normalize_dict_list(lst):
-                return collections.Counter(
-                    frozenset(d.items()) for d in lst)
-
-            if (normalize_dict_list(old_config_any_of) !=
-                    normalize_dict_list(new_config_any_of)):
+            if (resources_utils.normalize_any_of_resources_config(
+                    old_config_any_of) != resources_utils.
+                    normalize_any_of_resources_config(new_config_any_of)):
                 logger.info('Replica config changed (any_of), skipping. '
                             f'old: {old_config_any_of}, '
                             f'new: {new_config_any_of}')
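
The inlined `normalize_dict_list` helper removed above moves to `sky/utils/resources_utils.py` (+25 -1 in the file list) as `normalize_any_of_resources_config`. Judging from the removed code, it performs an order-insensitive, multiplicity-aware comparison of `any_of` resource dicts, presumably along these lines (the signature and any handling of non-hashable values are assumptions):

    import collections
    from typing import Any, Dict, List


    def normalize_any_of_resources_config(
            configs: List[Dict[str, Any]]) -> 'collections.Counter':
        """Normalize an `any_of` resources list for comparison.

        Each dict becomes a frozenset of its items, and a Counter tracks
        duplicates, so two lists compare equal iff they contain the same
        dicts with the same multiplicities, regardless of order.
        """
        return collections.Counter(frozenset(d.items()) for d in configs)
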
sky/serve/serve_state.py CHANGED
@@ -502,6 +502,16 @@ def get_services() -> List[Dict[str, Any]]:
     return records
 
 
+@init_db
+def get_num_services() -> int:
+    """Get the number of services."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        return session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(services_table)).fetchone()[0]
+
+
 @init_db
 def get_service_from_name(service_name: str) -> Optional[Dict[str, Any]]:
     """Get all existing service records."""
@@ -660,6 +670,38 @@ def total_number_provisioning_replicas() -> int:
     return provisioning_count
 
 
+@init_db
+def total_number_terminating_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.RUNNING):
+                terminating_count += 1
+        return terminating_count
+
+
+@init_db
+def total_number_scheduled_to_terminate_replicas() -> int:
+    """Returns the total number of terminating replicas."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(sqlalchemy.select(
+            replicas_table.c.replica_info)).fetchall()
+        terminating_count = 0
+        for row in rows:
+            replica_info: 'replica_managers.ReplicaInfo' = pickle.loads(row[0])
+            if (replica_info.status_property.sky_down_status ==
+                    common_utils.ProcessStatus.SCHEDULED):
+                terminating_count += 1
+        return terminating_count
+
+
 def get_replicas_at_status(
     service_name: str,
     status: ReplicaStatus,
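
Elsewhere in this release, `sky/utils/controller_utils.py` gains 86 lines, including the `can_provision()` / `can_terminate()` gates called from replica_managers.py above. Their implementation is not shown in this diff; given that they replace the removed `_MAX_NUM_LAUNCH = psutil.cpu_count() * 2` cap and that serve_state now exposes the counting helpers above, a plausible sketch follows (the module layout, limits, and exact formulas are all assumptions):

    import psutil

    from sky.serve import serve_state

    # Assumed cap, mirroring the removed module-level constant in
    # replica_managers.py; the real limit may differ.
    _MAX_CONCURRENT_LAUNCHES = (psutil.cpu_count() or 1) * 2


    def can_provision() -> bool:
        """Whether another sky.launch process may be started now."""
        return (serve_state.total_number_provisioning_replicas() <
                _MAX_CONCURRENT_LAUNCHES)


    def can_terminate() -> bool:
        """Whether another sky.down process may be started now."""
        # Terminations are throttled analogously, using the new counter for
        # replicas whose down process is already running.
        return (serve_state.total_number_terminating_replicas() <
                _MAX_CONCURRENT_LAUNCHES)
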