skypilot-nightly 1.0.0.dev20250823__py3-none-any.whl → 1.0.0.dev20250825__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +4 -0
- sky/catalog/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +12 -0
- sky/provision/kubernetes/utils.py +176 -34
- sky/serve/autoscalers.py +347 -5
- sky/serve/controller.py +36 -6
- sky/serve/load_balancer.py +49 -16
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/service.py +2 -1
- sky/serve/service_spec.py +26 -4
- sky/setup_files/dependencies.py +4 -2
- sky/utils/schemas.py +18 -2
- {skypilot_nightly-1.0.0.dev20250823.dist-info → skypilot_nightly-1.0.0.dev20250825.dist-info}/METADATA +5 -1
- {skypilot_nightly-1.0.0.dev20250823.dist-info → skypilot_nightly-1.0.0.dev20250825.dist-info}/RECORD +37 -37
- /sky/dashboard/out/_next/static/{UQeqZCi6L_itwVPDcn3ba → n7XGGtvnHqbVUS8eayoGG}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{UQeqZCi6L_itwVPDcn3ba → n7XGGtvnHqbVUS8eayoGG}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250823.dist-info → skypilot_nightly-1.0.0.dev20250825.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250823.dist-info → skypilot_nightly-1.0.0.dev20250825.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250823.dist-info → skypilot_nightly-1.0.0.dev20250825.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250823.dist-info → skypilot_nightly-1.0.0.dev20250825.dist-info}/top_level.txt +0 -0
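The headline change in this release is that target_qps_per_replica can now be a mapping from accelerator type to per-replica QPS instead of a single float (see the changes to sky/serve/service_spec.py, sky/utils/schemas.py, and the new InstanceAwareRequestRateAutoscaler in sky/serve/autoscalers.py below). As a rough illustration of the capacity math the new autoscaler performs, here is a minimal sketch; the GPU names and QPS numbers are invented for the example and do not come from the package:

import math

# Hypothetical per-replica QPS targets keyed by GPU type (illustrative only).
target_qps_per_replica = {'A100': 10.0, 'L4': 2.5}
# GPU type of each currently live replica (illustrative only).
ready_replica_gpus = ['A100', 'L4', 'L4']
observed_qps = 18.0  # requests/sec seen over the autoscaler's QPS window

# Fleet capacity is the sum of each replica's own target QPS.
total_capacity = sum(target_qps_per_replica[g] for g in ready_replica_gpus)  # 15.0

# When demand exceeds capacity, the overflow is divided by the largest
# per-GPU target QPS to decide how many extra replicas to request.
if observed_qps >= total_capacity:
    extra = math.ceil((observed_qps - total_capacity) /
                      max(target_qps_per_replica.values()))
    print(f'Scale up by {extra} replica(s)')  # Scale up by 1 replica(s)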
sky/serve/autoscalers.py
CHANGED

@@ -6,7 +6,7 @@ import enum
 import math
 import time
 import typing
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 from sky import sky_logging
 from sky.serve import constants
@@ -213,6 +213,10 @@ class Autoscaler:
         # TODO(MaoZiming): use NAME to get the class.
         if spec.use_ondemand_fallback:
             return FallbackRequestRateAutoscaler(service_name, spec)
+        elif isinstance(spec.target_qps_per_replica, dict):
+            # Use instance-aware autoscaler
+            # when target_qps_per_replica is a dict
+            return InstanceAwareRequestRateAutoscaler(service_name, spec)
         else:
             return RequestRateAutoscaler(service_name, spec)
 
@@ -464,20 +468,28 @@ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
             request_timestamps: All request timestamps within the window.
         """
         super().__init__(service_name, spec)
-        self.target_qps_per_replica: Optional[
-            float] = spec.target_qps_per_replica
+        self.target_qps_per_replica: Optional[Union[float, Dict[
+            str, float]]] = spec.target_qps_per_replica
         self.qps_window_size: int = constants.AUTOSCALER_QPS_WINDOW_SIZE_SECONDS
         self.request_timestamps: List[float] = []
 
     def _calculate_target_num_replicas(self) -> int:
         if self.target_qps_per_replica is None:
            return self.min_replicas
+
+        # RequestRateAutoscaler should only handle float values
+        if isinstance(self.target_qps_per_replica, dict):
+            raise ValueError('RequestRateAutoscaler does not support dict '
+                             'target_qps_per_replica. Should use '
+                             'InstanceAwareRequestRateAutoscaler instead.')
+
         num_requests_per_second = len(
             self.request_timestamps) / self.qps_window_size
-        target_num_replicas = math.ceil(num_requests_per_second /
-                                        self.target_qps_per_replica)
+        target_num_replicas = \
+            math.ceil(num_requests_per_second / self.target_qps_per_replica)
         logger.info(f'Requests per second: {num_requests_per_second}. '
                     f'Target number of replicas: {target_num_replicas}.')
+
         return self._clip_target_num_replicas(target_num_replicas)
 
     def update_version(self, version: int, spec: 'service_spec.SkyServiceSpec',
@@ -510,6 +522,7 @@ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
     ) -> List[AutoscalerDecision]:
         """Generate Autoscaling decisions based on request rate."""
 
+        # Use standard hysteresis-based logic (non-instance-aware)
         self._set_target_num_replicas_with_hysteresis()
 
         latest_nonterminal_replicas: List['replica_managers.ReplicaInfo'] = []
@@ -538,6 +551,7 @@ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
         if len(latest_nonterminal_replicas) > target_num_replicas:
             num_replicas_to_scale_down = (len(latest_nonterminal_replicas) -
                                           target_num_replicas)
+            # Use standard downscaling logic
             replicas_to_scale_down = (
                 _select_nonterminal_replicas_to_scale_down(
                     num_replicas_to_scale_down, latest_nonterminal_replicas))
@@ -562,6 +576,334 @@ class RequestRateAutoscaler(_AutoscalerWithHysteresis):
         logger.info(f'Remaining dynamic states: {dynamic_states}')
 
 
+class InstanceAwareRequestRateAutoscaler(RequestRateAutoscaler):
+    """Instance-aware RequestRateAutoscaler:
+    Autoscale based on each replica's GPU-specific QPS.
+
+    This autoscaler considers different QPS targets for different GPU types
+    when target_qps_per_replica is provided as a dictionary mapping GPU types
+    to their respective QPS targets.
+    """
+
+    def __init__(self, service_name: str,
+                 spec: 'service_spec.SkyServiceSpec') -> None:
+        super().__init__(service_name, spec)
+        # Ensure target_qps_per_replica is a dict for instance-aware logic
+        assert isinstance(spec.target_qps_per_replica, dict), \
+            'InstanceAware Autoscaler requires dict type target_qps_per_replica'
+        # Re-assign with correct type using setattr to avoid typing issues
+        self.target_qps_per_replica = spec.target_qps_per_replica
+
+    def _generate_scaling_decisions(
+        self,
+        replica_infos: List['replica_managers.ReplicaInfo'],
+    ) -> List[AutoscalerDecision]:
+        """Generate autoscaling decisions with instance-aware logic."""
+        # Always use instance-aware logic
+        # since target_qps_per_replica is guaranteed to be dict
+        self._set_target_num_replicas_with_instance_aware_logic(replica_infos)
+
+        latest_nonterminal_replicas: List['replica_managers.ReplicaInfo'] = []
+
+        for info in replica_infos:
+            if not info.is_terminal and info.version == self.latest_version:
+                latest_nonterminal_replicas.append(info)
+
+        target_num_replicas = self.get_final_target_num_replicas()
+        current_num_replicas = len(latest_nonterminal_replicas)
+
+        scaling_decisions: List[AutoscalerDecision] = []
+
+        # Decide if to scale up or down.
+        if target_num_replicas > current_num_replicas:
+            for _ in range(target_num_replicas - current_num_replicas):
+                # No resources_override to use when scaling up
+                scaling_decisions.append(
+                    AutoscalerDecision(AutoscalerDecisionOperator.SCALE_UP,
+                                       target=None))
+        elif target_num_replicas < current_num_replicas:
+            num_replicas_to_scale_down = \
+                current_num_replicas - target_num_replicas
+
+            # Use instance-aware scale down logic
+            replicas_to_scale_down = self._select_replicas_to_scale_down_by_qps(
+                num_replicas_to_scale_down, latest_nonterminal_replicas)
+            for replica_id in replicas_to_scale_down:
+                scaling_decisions.append(
+                    AutoscalerDecision(AutoscalerDecisionOperator.SCALE_DOWN,
+                                       target=replica_id))
+
+        # Outdated replicas are handled by base class generate_scaling_decisions
+        # No need to handle them here
+
+        upscale_decisions = [
+            d for d in scaling_decisions
+            if d.operator == AutoscalerDecisionOperator.SCALE_UP
+        ]
+        downscale_decisions = [
+            d for d in scaling_decisions
+            if d.operator == AutoscalerDecisionOperator.SCALE_DOWN
+        ]
+        logger.info(f'Scaling decisions: '
+                    f'{len(upscale_decisions)} scale up, '
+                    f'{len(downscale_decisions)} scale down '
+                    f'(latest nonterminal: {current_num_replicas}, '
+                    f'target: {target_num_replicas})')
+
+        return scaling_decisions
+
+    def _set_target_num_replicas_with_instance_aware_logic(
+            self, replica_infos: List['replica_managers.ReplicaInfo']) -> None:
+        """Set target_num_replicas using instance-aware logic."""
+        assert isinstance(self.target_qps_per_replica,
+                          dict), 'Expected dict for instance-aware logic'
+        target_qps_dict = self.target_qps_per_replica
+
+        num_requests_per_second = len(
+            self.request_timestamps) / self.qps_window_size
+
+        total_qps = self._calculate_total_qps_from_replicas(replica_infos)
+        if total_qps > 0:
+            if num_requests_per_second >= total_qps:
+                # for upscaling, max_target_qps is the standard qps
+                max_target_qps = max(target_qps_dict.values())
+                over_request_num = num_requests_per_second - total_qps
+                current_num_replicas = len(replica_infos)
+                raw_target_num = current_num_replicas + math.ceil(
+                    over_request_num / max_target_qps)
+                target_num_replicas = self._clip_target_num_replicas(
+                    raw_target_num)
+                logger.info(
+                    f'Instance-aware autoscaling: total QPS {total_qps}, '
+                    f'num_requests_per_second: {num_requests_per_second}, '
+                    f'upscaling, using maximum QPS {max_target_qps} '
+                    f'from {target_qps_dict}, '
+                    f'target replicas: {target_num_replicas}')
+            else:
+                # for downscaling, use qps for every ready_target_qps_list
+                # to calculate target_num_replicas
+                ready_target_qps_list = \
+                    self._extract_target_qps_list_from_ready_replicas(
+                        replica_infos)
+                ready_target_qps_list = sorted(ready_target_qps_list,
+                                               reverse=True)
+                if not ready_target_qps_list:
+                    # Fallback to maximum QPS from config if no ready replicas
+                    ready_target_qps_list = [max(target_qps_dict.values())]
+
+                raw_target_num = 0
+                qps_sum = 0.0
+                for qps in ready_target_qps_list:
+                    raw_target_num += 1
+                    qps_sum += qps
+                    if qps_sum > num_requests_per_second:
+                        break
+
+                target_num_replicas = self._clip_target_num_replicas(
+                    raw_target_num)
+                logger.info(
+                    f'Instance-aware autoscaling: total QPS {total_qps}, '
+                    f'num_requests_per_second: {num_requests_per_second}, '
+                    f'downscaling, using ready QPS list '
+                    f'{ready_target_qps_list}, '
+                    f'target replicas: {target_num_replicas}')
+        else:
+            # no replica is ready; use the normal min_replicas
+            target_num_replicas = self._clip_target_num_replicas(
+                self.min_replicas)
+            logger.info(f'Instance-aware autoscaling: no replica QPS available,'
+                        f' target replicas: {target_num_replicas}')
+
+        # Apply hysteresis logic
+        old_target_num_replicas = self.target_num_replicas
+
+        # Faster scale up when there is no replica.
+        if self.target_num_replicas == 0:
+            self.target_num_replicas = target_num_replicas
+        elif target_num_replicas > self.target_num_replicas:
+            self.upscale_counter += 1
+            self.downscale_counter = 0
+            if self.upscale_counter >= self.scale_up_threshold:
+                self.upscale_counter = 0
+                self.target_num_replicas = target_num_replicas
+        elif target_num_replicas < self.target_num_replicas:
+            self.downscale_counter += 1
+            self.upscale_counter = 0
+            if self.downscale_counter >= self.scale_down_threshold:
+                self.downscale_counter = 0
+                self.target_num_replicas = target_num_replicas
+        else:
+            self.upscale_counter = self.downscale_counter = 0
+
+        logger.info(
+            f'Instance-aware: Old target number of replicas: '
+            f'{old_target_num_replicas}. '
+            f'Current target number of replicas: {target_num_replicas}. '
+            f'Final target number of replicas: {self.target_num_replicas}. '
+            f'Num overprovision: {self.num_overprovision}. '
+            f'Upscale counter: {self.upscale_counter}/'
+            f'{self.scale_up_threshold}. '
+            f'Downscale counter: {self.downscale_counter}/'
+            f'{self.scale_down_threshold}. ')
+
+    def _calculate_total_qps_from_replicas(
+            self, replica_infos: List['replica_managers.ReplicaInfo']) -> float:
+        """Calculate total QPS based on current replica GPU types."""
+        total_qps = 0.0
+        logger.info(f'Calculating total QPS from {len(replica_infos)} replicas')
+
+        for replica_info in replica_infos:
+            # Skip non-valid replicas
+            valid_statuses = [
+                serve_state.ReplicaStatus.READY,
+                serve_state.ReplicaStatus.STARTING,
+                serve_state.ReplicaStatus.PROVISIONING
+            ]
+            if replica_info.status not in valid_statuses:
+                logger.info(f'Skipping replica {replica_info.replica_id} '
+                            f'with status: {replica_info.status}')
+                continue
+
+            gpu_type = self._get_gpu_type_from_replica_info(replica_info)
+            logger.info(f'Processing replica {replica_info.replica_id} '
+                        f'with GPU type: {gpu_type}')
+
+            # Use flexible matching logic
+            qps_for_this_gpu = self._get_target_qps_for_gpu_type(gpu_type)
+            total_qps += qps_for_this_gpu
+            logger.info(f'GPU type {gpu_type} -> {qps_for_this_gpu} QPS')
+
+        logger.info(f'Calculated total QPS: {total_qps}')
+        return total_qps
+
+    def _get_target_qps_for_gpu_type(self, gpu_type: str) -> float:
+        """Get target QPS for a specific GPU type with flexible matching."""
+        assert isinstance(self.target_qps_per_replica,
+                          dict), 'Expected dict for instance-aware logic'
+        target_qps_dict = self.target_qps_per_replica
+
+        # Direct match first
+        if gpu_type in target_qps_dict:
+            return target_qps_dict[gpu_type]
+
+        # Try matching by base name (e.g., 'A100' matches 'A100:1')
+        for config_key in target_qps_dict.keys():
+            # Remove count suffix (e.g., 'A100:1' -> 'A100')
+            base_name = config_key.split(':')[0]
+            if gpu_type == base_name:
+                return target_qps_dict[config_key]
+
+        # Fallback to minimum QPS
+        logger.warning(f'No matching QPS found for GPU type: {gpu_type}. '
+                       f'Available types: {list(target_qps_dict.keys())}. '
+                       f'Using minimum QPS as fallback.')
+        return min(target_qps_dict.values())
+
+    def _get_gpu_type_from_replica_info(
+            self, replica_info: 'replica_managers.ReplicaInfo') -> str:
+        """Extract GPU type from ReplicaInfo object."""
+        gpu_type = 'unknown'
+        handle = replica_info.handle()
+        if handle is not None:
+            accelerators = handle.launched_resources.accelerators
+            if accelerators and len(accelerators) > 0:
+                # Get the first accelerator type
+                gpu_type = list(accelerators.keys())[0]
+        return gpu_type
+
+    def _extract_target_qps_list_from_ready_replicas(
+            self,
+            replica_infos: List['replica_managers.ReplicaInfo']) -> List[float]:
+        """Extract target QPS list from current READY replicas."""
+        ready_replica_qps = []
+
+        for replica_info in replica_infos:
+            # Check if replica is READY
+            if replica_info.status != serve_state.ReplicaStatus.READY:
+                logger.info(
+                    f'Replica {replica_info.replica_id} '
+                    f'not ready (status: {replica_info.status}), skipping')
+                continue
+
+            gpu_type = self._get_gpu_type_from_replica_info(replica_info)
+
+            # Use flexible matching logic
+            qps_for_this_gpu = self._get_target_qps_for_gpu_type(gpu_type)
+            ready_replica_qps.append(qps_for_this_gpu)
+            logger.info(f'Ready replica {replica_info.replica_id} '
+                        f'with GPU {gpu_type}: {qps_for_this_gpu} QPS')
+
+        if ready_replica_qps:
+            logger.info(
+                f'Target QPS list from ready replicas: {ready_replica_qps}')
+            return ready_replica_qps
+
+        return []
+
+    def _select_replicas_to_scale_down_by_qps(
+            self, num_replicas_to_scale_down: int,
+            replica_infos: List['replica_managers.ReplicaInfo']) -> List[int]:
+        """Select replicas to scale down (lowest QPS first)."""
+        # Create a list of (replica_info, target_qps) tuples
+        replica_qps_pairs: List[Tuple['replica_managers.ReplicaInfo',
+                                      float]] = []
+
+        for info in replica_infos:
+            # Include old-version replicas as well so they also get a target_qps
+            # assigned. Skip terminal replicas only.
+            if info.is_terminal:
+                continue
+
+            # Get GPU type directly from replica info
+            gpu_type = self._get_gpu_type_from_replica_info(info)
+
+            # Use flexible matching logic
+            target_qps = self._get_target_qps_for_gpu_type(gpu_type)
+
+            replica_qps_pairs.append((info, float(target_qps)))
+            logger.info(f'Replica {info.replica_id} '
+                        f'with GPU {gpu_type}: {target_qps} QPS')
+
+        # Create a mapping from replica_id to target_qps for sorting
+        replica_qps_map = {
+            info.replica_id: target_qps
+            for info, target_qps in replica_qps_pairs
+        }
+
+        # Sort replicas by: 1. status order, 2. target_qps (asc),
+        # 3. version (asc), 4. replica_id (desc)
+        sorted_replicas = sorted(
+            replica_infos,
+            key=lambda info: (
+                info.status.scale_down_decision_order(),
+                replica_qps_map.get(info.replica_id, float('inf')),
+                info.version,
+                -info.replica_id,
+            ))
+
+        selected_replica_ids = []
+        for info in sorted_replicas:
+            if info.is_terminal:
+                continue
+            selected_replica_ids.append(info.replica_id)
+            if len(selected_replica_ids) >= num_replicas_to_scale_down:
+                break
+
+        logger.info(
+            f'Selected {len(selected_replica_ids)} replicas to scale down: '
+            f'{selected_replica_ids}')
+        return selected_replica_ids
+
+    def update_version(self, version: int, spec: 'service_spec.SkyServiceSpec',
+                       update_mode: serve_utils.UpdateMode) -> None:
+        super(RequestRateAutoscaler,
+              self).update_version(version, spec, update_mode)
+        # Ensure it's a dict and re-assign using setattr to avoid typing
+        assert isinstance(spec.target_qps_per_replica, dict), \
+            'InstanceAware Autoscaler requires dict type target_qps_per_replica'
+        self.target_qps_per_replica = spec.target_qps_per_replica
+
+
 class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
     """FallbackRequestRateAutoscaler
 
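The downscaling branch of _set_target_num_replicas_with_instance_aware_logic counts how many of the currently READY replicas, taken from the highest per-replica QPS downward, are needed to cover the observed request rate. A standalone sketch of just that accumulation step, with invented numbers:

# Invented inputs: per-replica target QPS of the READY replicas (sorted
# descending) and the observed request rate over the QPS window.
ready_target_qps_list = sorted([10.0, 2.5, 2.5], reverse=True)
num_requests_per_second = 4.0

raw_target_num = 0
qps_sum = 0.0
for qps in ready_target_qps_list:
    raw_target_num += 1
    qps_sum += qps
    if qps_sum > num_requests_per_second:
        break

# One replica (the 10-QPS one) already covers 4 requests/sec, so the raw
# target is 1 before min/max clipping and hysteresis are applied.
print(raw_target_num)  # 1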
sky/serve/controller.py
CHANGED

@@ -78,7 +78,11 @@ class SkyServeController:
                 assert record is not None, ('No service record found for '
                                             f'{self._service_name}')
                 active_versions = record['active_versions']
-                logger.info(f'All replica info: {replica_infos}')
+                logger.info(f'All replica info for autoscaler: {replica_infos}')
+
+                # Autoscaler now extracts GPU type info directly from
+                # replica_infos in generate_scaling_decisions method
+                # for better decoupling.
                 scaling_options = self._autoscaler.generate_scaling_decisions(
                     replica_infos, active_versions)
                 for scaling_option in scaling_options:
@@ -118,11 +122,37 @@ class SkyServeController:
             timestamps: List[int] = request_aggregator.get('timestamps', [])
             logger.info(f'Received {len(timestamps)} inflight requests.')
             self._autoscaler.collect_request_information(request_aggregator)
-            return responses.JSONResponse(content={
-                'ready_replica_urls':
-                    self._replica_manager.get_active_replica_urls()
-            },
-                                          status_code=200)
+
+            # Get replica information for instance-aware load balancing
+            replica_infos = serve_state.get_replica_infos(self._service_name)
+            ready_replica_urls = self._replica_manager.get_active_replica_urls()
+
+            # Use URL-to-info mapping to avoid duplication
+            replica_info = {}
+            for info in replica_infos:
+                if info.url in ready_replica_urls:
+                    # Get GPU type from handle.launched_resources.accelerators
+                    gpu_type = 'unknown'
+                    handle = info.handle()
+                    if handle is not None:
+                        accelerators = handle.launched_resources.accelerators
+                        if accelerators and len(accelerators) > 0:
+                            # Get the first accelerator type
+                            gpu_type = list(accelerators.keys())[0]
+
+                    replica_info[info.url] = {'gpu_type': gpu_type}
+
+            # Check that all ready replica URLs are included in replica_info
+            missing_urls = set(ready_replica_urls) - set(replica_info.keys())
+            if missing_urls:
+                logger.warning(f'Ready replica URLs missing from replica_info: '
+                               f'{missing_urls}')
+                # fallback: add missing URLs with unknown GPU type
+                for url in missing_urls:
+                    replica_info[url] = {'gpu_type': 'unknown'}
+
+            return responses.JSONResponse(
+                content={'replica_info': replica_info}, status_code=200)
 
         @self._app.post('/controller/update_service')
         async def update_service(request: fastapi.Request) -> fastapi.Response:
sky/serve/load_balancer.py
CHANGED

@@ -30,11 +30,13 @@ class SkyServeLoadBalancer:
     """
 
     def __init__(
-            self,
-            controller_url: str,
-            load_balancer_port: int,
-            load_balancing_policy_name: Optional[str] = None,
-            tls_credential: Optional[serve_utils.TLSCredential] = None) -> None:
+        self,
+        controller_url: str,
+        load_balancer_port: int,
+        load_balancing_policy_name: Optional[str] = None,
+        tls_credential: Optional[serve_utils.TLSCredential] = None,
+        target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None
+    ) -> None:
         """Initialize the load balancer.
 
         Args:
@@ -44,6 +46,9 @@ class SkyServeLoadBalancer:
                 to use. Defaults to None.
             tls_credentials: The TLS credentials for HTTPS endpoint. Defaults
                 to None.
+            target_qps_per_replica: Target QPS per replica for instance-aware
+                load balancing. Can be a float or dict mapping GPU types to QPS.
+                Defaults to None.
         """
         self._app = fastapi.FastAPI()
         self._controller_url: str = controller_url
@@ -51,6 +56,15 @@ class SkyServeLoadBalancer:
         # Use the registry to create the load balancing policy
         self._load_balancing_policy = lb_policies.LoadBalancingPolicy.make(
             load_balancing_policy_name)
+
+        # Set accelerator QPS for instance-aware policies
+        if (target_qps_per_replica and
+                isinstance(target_qps_per_replica, dict) and
+                isinstance(self._load_balancing_policy,
+                           lb_policies.InstanceAwareLeastLoadPolicy)):
+            self._load_balancing_policy.set_target_qps_per_accelerator(
+                target_qps_per_replica)
+
         logger.info('Starting load balancer with policy '
                     f'{load_balancing_policy_name}.')
         self._request_aggregator: serve_utils.RequestsAggregator = (
@@ -73,6 +87,9 @@ class SkyServeLoadBalancer:
 
     async def _sync_with_controller_once(self) -> List[asyncio.Task]:
         close_client_tasks = []
+        ready_replica_urls = []
+        replica_info = {}
+
         async with aiohttp.ClientSession() as session:
             try:
                 # Send request information
@@ -88,8 +105,8 @@ class SkyServeLoadBalancer:
                     self._request_aggregator.clear()
                     response.raise_for_status()
                     response_json = await response.json()
-                    ready_replica_urls = response_json.get(
-                        'ready_replica_urls', [])
+                    replica_info = response_json.get('replica_info', {})
+                    ready_replica_urls = list(replica_info.keys())
             except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                 logger.error(f'An error occurred when syncing with '
                              f'the controller: {e}'
@@ -99,6 +116,11 @@ class SkyServeLoadBalancer:
         with self._client_pool_lock:
             self._load_balancing_policy.set_ready_replicas(
                 ready_replica_urls)
+            # Set replica info for instance-aware policies
+            if isinstance(self._load_balancing_policy,
+                          lb_policies.InstanceAwareLeastLoadPolicy):
+                self._load_balancing_policy.set_replica_info(
+                    replica_info)
         for replica_url in ready_replica_urls:
             if replica_url not in self._client_pool:
                 self._client_pool[replica_url] = httpx.AsyncClient(
@@ -265,23 +287,31 @@ class SkyServeLoadBalancer:
 
 
 def run_load_balancer(
-        controller_addr: str,
-        load_balancer_port: int,
-        load_balancing_policy_name: Optional[str] = None,
-        tls_credential: Optional[serve_utils.TLSCredential] = None) -> None:
+        controller_addr: str,
+        load_balancer_port: int,
+        load_balancing_policy_name: Optional[str] = None,
+        tls_credential: Optional[serve_utils.TLSCredential] = None,
+        target_qps_per_replica: Optional[Union[float, Dict[str, float]]] = None
+) -> None:
     """ Run the load balancer.
 
     Args:
         controller_addr: The address of the controller.
        load_balancer_port: The port where the load balancer listens to.
-        policy_name: The name of the load balancing policy to use.
-
+        policy_name: The name of the load balancing policy to use.
+            Defaults to None.
+        tls_credential:
+            The TLS credentials for HTTPS endpoint. Defaults to None.
+        target_qps_per_replica: Target QPS per replica for instance-aware
+            load balancing. Can be a float or dict mapping GPU types to QPS.
+            Defaults to None.
     """
     load_balancer = SkyServeLoadBalancer(
         controller_url=controller_addr,
         load_balancer_port=load_balancer_port,
         load_balancing_policy_name=load_balancing_policy_name,
-        tls_credential=tls_credential)
+        tls_credential=tls_credential,
+        target_qps_per_replica=target_qps_per_replica)
     load_balancer.run()
 
 
@@ -305,5 +335,8 @@ if __name__ == '__main__':
         help=f'The load balancing policy to use. Available policies: '
         f'{", ".join(available_policies)}.')
     args = parser.parse_args()
-    run_load_balancer(args.controller_addr, args.load_balancer_port,
-                      args.load_balancing_policy)
+    run_load_balancer(args.controller_addr,
+                      args.load_balancer_port,
+                      args.load_balancing_policy,
+                      tls_credential=None,
+                      target_qps_per_replica=None)