PyPI - modal - Versions diffs - 1.1.3.dev7__py3-none-any.whl → 1.1.4__py3-none-any.whl - Mend

modal 1.1.3.dev7__py3-none-any.whl → 1.1.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

modal/_clustered_functions.py +3 -0
modal/_clustered_functions.pyi +3 -2
modal/_functions.py +11 -0
modal/_runtime/asgi.py +1 -1
modal/_utils/grpc_utils.py +1 -0
modal/app.py +6 -2
modal/app.pyi +4 -0
modal/builder/2025.06.txt +1 -0
modal/builder/PREVIEW.txt +1 -0
modal/client.pyi +2 -10
modal/cls.py +6 -1
modal/cls.pyi +16 -0
modal/experimental/__init__.py +2 -1
modal/experimental/flash.py +183 -23
modal/experimental/flash.pyi +83 -9
modal/functions.pyi +18 -6
modal/image.py +8 -2
modal/image.pyi +16 -4
modal/mount.py +17 -11
modal/mount.pyi +4 -0
modal/parallel_map.py +26 -6
modal/parallel_map.pyi +1 -0
modal/sandbox.py +31 -4
modal/sandbox.pyi +12 -3
{modal-1.1.3.dev7.dist-info → modal-1.1.4.dist-info}/METADATA +1 -1
{modal-1.1.3.dev7.dist-info → modal-1.1.4.dist-info}/RECORD +38 -38
modal_proto/api.proto +30 -0
modal_proto/api_grpc.py +32 -0
modal_proto/api_pb2.py +893 -853
modal_proto/api_pb2.pyi +94 -5
modal_proto/api_pb2_grpc.py +68 -1
modal_proto/api_pb2_grpc.pyi +25 -3
modal_proto/modal_api_grpc.py +2 -0
modal_version/__init__.py +1 -1
{modal-1.1.3.dev7.dist-info → modal-1.1.4.dist-info}/WHEEL +0 -0
{modal-1.1.3.dev7.dist-info → modal-1.1.4.dist-info}/entry_points.txt +0 -0
{modal-1.1.3.dev7.dist-info → modal-1.1.4.dist-info}/licenses/LICENSE +0 -0
{modal-1.1.3.dev7.dist-info → modal-1.1.4.dist-info}/top_level.txt +0 -0

modal/_clustered_functions.py CHANGED Viewed

@@ -14,6 +14,7 @@ from modal_proto import api_pb2
 @dataclass
 class ClusterInfo:
     rank: int
+    cluster_id: str
     container_ips: list[str]
     container_ipv4_ips: list[str]
@@ -69,12 +70,14 @@ async def _initialize_clustered_function(client: _Client, task_id: str, world_si
         )
         cluster_info = ClusterInfo(
             rank=resp.cluster_rank,
+            cluster_id=resp.cluster_id,
             container_ips=resp.container_ips,
             container_ipv4_ips=resp.container_ipv4_ips,
         )
     else:
         cluster_info = ClusterInfo(
             rank=0,
+            cluster_id="",  # No cluster ID for single-node  # TODO(irfansharif): Is this right?
             container_ips=[container_ip],
             container_ipv4_ips=[],  # No IPv4 IPs for single-node
         )

modal/_clustered_functions.pyi CHANGED Viewed

@@ -3,13 +3,14 @@ import typing
 import typing_extensions
 class ClusterInfo:
-    """ClusterInfo(rank: int, container_ips: list[str], container_ipv4_ips: list[str])"""
+    """ClusterInfo(rank: int, cluster_id: str, container_ips: list[str], container_ipv4_ips: list[str])"""
     rank: int
+    cluster_id: str
     container_ips: list[str]
     container_ipv4_ips: list[str]
-    def __init__(self, rank: int, container_ips: list[str], container_ipv4_ips: list[str]) -> None:
+    def __init__(self, rank: int, cluster_id: str, container_ips: list[str], container_ipv4_ips: list[str]) -> None:
         """Initialize self.  See help(type(self)) for accurate signature."""
         ...

modal/_functions.py CHANGED Viewed

@@ -674,6 +674,7 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
         proxy: Optional[_Proxy] = None,
         retries: Optional[Union[int, Retries]] = None,
         timeout: int = 300,
+        startup_timeout: Optional[int] = None,
         min_containers: Optional[int] = None,
         max_containers: Optional[int] = None,
         buffer_containers: Optional[int] = None,
@@ -966,6 +967,7 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
                     proxy_id=(proxy.object_id if proxy else None),
                     retry_policy=retry_policy,
                     timeout_secs=timeout_secs or 0,
+                    startup_timeout_secs=startup_timeout or timeout_secs,
                     pty_info=pty_info,
                     cloud_provider_str=cloud if cloud else "",
                     runtime=config.get("function_runtime"),
@@ -1019,6 +1021,7 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
                         autoscaler_settings=function_definition.autoscaler_settings,
                         worker_id=function_definition.worker_id,
                         timeout_secs=function_definition.timeout_secs,
+                        startup_timeout_secs=function_definition.startup_timeout_secs,
                         web_url=function_definition.web_url,
                         web_url_info=function_definition.web_url_info,
                         webhook_config=function_definition.webhook_config,
@@ -1471,6 +1474,7 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
         self._info = None
         self._serve_mounts = frozenset()
         self._metadata = None
+        self._experimental_flash_urls = None
     def _hydrate_metadata(self, metadata: Optional[Message]):
         # Overridden concrete implementation of base class method
@@ -1498,6 +1502,7 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
         self._max_object_size_bytes = (
             metadata.max_object_size_bytes if metadata.HasField("max_object_size_bytes") else MAX_OBJECT_SIZE_BYTES
         )
+        self._experimental_flash_urls = metadata._experimental_flash_urls
     def _get_metadata(self):
         # Overridden concrete implementation of base class method
@@ -1515,6 +1520,7 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
             input_plane_url=self._input_plane_url,
             input_plane_region=self._input_plane_region,
             max_object_size_bytes=self._max_object_size_bytes,
+            _experimental_flash_urls=self._experimental_flash_urls,
         )
     def _check_no_web_url(self, fn_name: str):
@@ -1545,6 +1551,11 @@ Use the `Function.get_web_url()` method instead.
         """URL of a Function running as a web endpoint."""
         return self._web_url
+    @live_method
+    async def _experimental_get_flash_urls(self) -> Optional[list[str]]:
+        """URL of the flash service for the function."""
+        return list(self._experimental_flash_urls) if self._experimental_flash_urls else None
     @property
     async def is_generator(self) -> bool:
         """mdmd:hidden"""

modal/_runtime/asgi.py CHANGED Viewed

@@ -120,7 +120,7 @@ def asgi_app_wrapper(asgi_app, container_io_manager) -> tuple[Callable[..., Asyn
         async def handle_first_input_timeout():
             if scope["type"] == "http":
-                await messages_from_app.put({"type": "http.response.start", "status": 502})
+                await messages_from_app.put({"type": "http.response.start", "status": 408})
                 await messages_from_app.put(
                     {
                         "type": "http.response.body",

modal/_utils/grpc_utils.py CHANGED Viewed

@@ -204,6 +204,7 @@ async def retry_transient_errors(
     else:
         total_deadline = None
+    metadata = metadata + [("x-modal-timestamp", str(time.time()))]
     while True:
         attempt_metadata = [
             ("x-idempotency-key", idempotency_key),

modal/app.py CHANGED Viewed

@@ -641,7 +641,8 @@ class _App:
         scaledown_window: Optional[int] = None,  # Max time (in seconds) a container can remain idle while scaling down.
         proxy: Optional[_Proxy] = None,  # Reference to a Modal Proxy to use in front of this function.
         retries: Optional[Union[int, Retries]] = None,  # Number of times to retry each input in case of failure.
-        timeout: int = 300,  # Maximum execution time in seconds.
+        timeout: int = 300,  # Maximum execution time for inputs and startup time in seconds.
+        startup_timeout: Optional[int] = None,  # Maximum startup time in seconds with higher precedence than `timeout`.
         name: Optional[str] = None,  # Sets the Modal name of the function within the app
         is_generator: Optional[
             bool
@@ -816,6 +817,7 @@ class _App:
                 batch_max_size=batch_max_size,
                 batch_wait_ms=batch_wait_ms,
                 timeout=timeout,
+                startup_timeout=startup_timeout or timeout,
                 cloud=cloud,
                 webhook_config=webhook_config,
                 enable_memory_snapshot=enable_memory_snapshot,
@@ -869,7 +871,8 @@ class _App:
         scaledown_window: Optional[int] = None,  # Max time (in seconds) a container can remain idle while scaling down.
         proxy: Optional[_Proxy] = None,  # Reference to a Modal Proxy to use in front of this function.
         retries: Optional[Union[int, Retries]] = None,  # Number of times to retry each input in case of failure.
-        timeout: int = 300,  # Maximum execution time in seconds; applies independently to startup and each input.
+        timeout: int = 300,  # Maximum execution time for inputs and startup time in seconds.
+        startup_timeout: Optional[int] = None,  # Maximum startup time in seconds with higher precedence than `timeout`.
         cloud: Optional[str] = None,  # Cloud provider to run the function on. Possible values are aws, gcp, oci, auto.
         region: Optional[Union[str, Sequence[str]]] = None,  # Region or regions to run the function on.
         enable_memory_snapshot: bool = False,  # Enable memory checkpointing for faster cold starts.
@@ -1002,6 +1005,7 @@ class _App:
                 batch_max_size=batch_max_size,
                 batch_wait_ms=batch_wait_ms,
                 timeout=timeout,
+                startup_timeout=startup_timeout or timeout,
                 cloud=cloud,
                 enable_memory_snapshot=enable_memory_snapshot,
                 block_network=block_network,

modal/app.pyi CHANGED Viewed

@@ -411,6 +411,7 @@ class _App:
         proxy: typing.Optional[modal.proxy._Proxy] = None,
         retries: typing.Union[int, modal.retries.Retries, None] = None,
         timeout: int = 300,
+        startup_timeout: typing.Optional[int] = None,
         name: typing.Optional[str] = None,
         is_generator: typing.Optional[bool] = None,
         cloud: typing.Optional[str] = None,
@@ -464,6 +465,7 @@ class _App:
         proxy: typing.Optional[modal.proxy._Proxy] = None,
         retries: typing.Union[int, modal.retries.Retries, None] = None,
         timeout: int = 300,
+        startup_timeout: typing.Optional[int] = None,
         cloud: typing.Optional[str] = None,
         region: typing.Union[str, collections.abc.Sequence[str], None] = None,
         enable_memory_snapshot: bool = False,
@@ -1014,6 +1016,7 @@ class App:
         proxy: typing.Optional[modal.proxy.Proxy] = None,
         retries: typing.Union[int, modal.retries.Retries, None] = None,
         timeout: int = 300,
+        startup_timeout: typing.Optional[int] = None,
         name: typing.Optional[str] = None,
         is_generator: typing.Optional[bool] = None,
         cloud: typing.Optional[str] = None,
@@ -1067,6 +1070,7 @@ class App:
         proxy: typing.Optional[modal.proxy.Proxy] = None,
         retries: typing.Union[int, modal.retries.Retries, None] = None,
         timeout: int = 300,
+        startup_timeout: typing.Optional[int] = None,
         cloud: typing.Optional[str] = None,
         region: typing.Union[str, collections.abc.Sequence[str], None] = None,
         enable_memory_snapshot: bool = False,

modal/builder/2025.06.txt CHANGED Viewed

@@ -3,6 +3,7 @@ aiohttp==3.12.7
 aiosignal==1.3.2
 async-timeout==5.0.1 ; python_version < "3.11"
 attrs==25.3.0
+cbor2==5.7.0
 certifi==2025.4.26
 frozenlist==1.6.0
 grpclib==0.4.8

modal/builder/PREVIEW.txt CHANGED Viewed

@@ -3,6 +3,7 @@ aiohttp==3.12.7
 aiosignal==1.3.2
 async-timeout==5.0.1 ; python_version < "3.11"
 attrs==25.3.0
+cbor2==5.7.0
 certifi==2025.4.26
 frozenlist==1.6.0
 grpclib==0.4.8

modal/client.pyi CHANGED Viewed

@@ -29,11 +29,7 @@ class _Client:
     _snapshotted: bool
     def __init__(
-        self,
-        server_url: str,
-        client_type: int,
-        credentials: typing.Optional[tuple[str, str]],
-        version: str = "1.1.3.dev7",
+        self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "1.1.4"
     ):
         """mdmd:hidden
         The Modal client object is not intended to be instantiated directly by users.
@@ -160,11 +156,7 @@ class Client:
     _snapshotted: bool
     def __init__(
-        self,
-        server_url: str,
-        client_type: int,
-        credentials: typing.Optional[tuple[str, str]],
-        version: str = "1.1.3.dev7",
+        self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "1.1.4"
     ):
         """mdmd:hidden
         The Modal client object is not intended to be instantiated directly by users.

modal/cls.py CHANGED Viewed

@@ -12,7 +12,7 @@ from grpclib import GRPCError, Status
 from modal_proto import api_pb2
 from ._functions import _Function, _parse_retries
-from ._object import _Object
+from ._object import _Object, live_method
 from ._partial_function import (
     _find_callables_for_obj,
     _find_partial_methods_for_user_cls,
@@ -510,6 +510,11 @@ class _Cls(_Object, type_prefix="cs"):
         # returns method names for a *local* class only for now (used by cli)
         return self._method_partials.keys()
+    @live_method
+    async def _experimental_get_flash_urls(self) -> Optional[list[str]]:
+        """URL of the flash service for the class."""
+        return await self._get_class_service_function()._experimental_get_flash_urls()
     def _hydrate_metadata(self, metadata: Message):
         assert isinstance(metadata, api_pb2.ClassHandleMetadata)
         class_service_function = self._get_class_service_function()

modal/cls.pyi CHANGED Viewed

@@ -354,6 +354,10 @@ class _Cls(modal._object._Object):
     def _get_name(self) -> str: ...
     def _get_class_service_function(self) -> modal._functions._Function: ...
     def _get_method_names(self) -> collections.abc.Collection[str]: ...
+    async def _experimental_get_flash_urls(self) -> typing.Optional[list[str]]:
+        """URL of the flash service for the class."""
+        ...
     def _hydrate_metadata(self, metadata: google.protobuf.message.Message): ...
     @staticmethod
     def validate_construction_mechanism(user_cls):
@@ -520,6 +524,18 @@ class Cls(modal.object.Object):
     def _get_name(self) -> str: ...
     def _get_class_service_function(self) -> modal.functions.Function: ...
     def _get_method_names(self) -> collections.abc.Collection[str]: ...
+    class ___experimental_get_flash_urls_spec(typing_extensions.Protocol[SUPERSELF]):
+        def __call__(self, /) -> typing.Optional[list[str]]:
+            """URL of the flash service for the class."""
+            ...
+        async def aio(self, /) -> typing.Optional[list[str]]:
+            """URL of the flash service for the class."""
+            ...
+    _experimental_get_flash_urls: ___experimental_get_flash_urls_spec[typing_extensions.Self]
     def _hydrate_metadata(self, metadata: google.protobuf.message.Message): ...
     @staticmethod
     def validate_construction_mechanism(user_cls):

modal/experimental/__init__.py CHANGED Viewed

@@ -311,7 +311,8 @@ async def notebook_base_image(*, python_version: Optional[str] = None, force_bui
     commands: list[str] = [
         "apt-get update",
-        "apt-get install -y libpq-dev pkg-config cmake git curl wget unzip zip libsqlite3-dev openssh-server vim",
+        "apt-get install -y "
+        + "libpq-dev pkg-config cmake git curl wget unzip zip libsqlite3-dev openssh-server vim ffmpeg",
         _install_cuda_command(),
         # Install uv since it's faster than pip for installing packages.
         "pip install uv",

modal/experimental/flash.py CHANGED Viewed

@@ -1,6 +1,8 @@
 # Copyright Modal Labs 2025
 import asyncio
 import math
+import os
+import subprocess
 import sys
 import time
 import traceback
@@ -19,28 +21,87 @@ from ..client import _Client
 from ..config import logger
 from ..exception import InvalidError
+MAX_FAILURES = 3
 class _FlashManager:
-    def __init__(self, client: _Client, port: int, health_check_url: Optional[str] = None):
+    def __init__(
+        self,
+        client: _Client,
+        port: int,
+        process: Optional[subprocess.Popen] = None,
+        health_check_url: Optional[str] = None,
+    ):
         self.client = client
         self.port = port
+        # Health check is not currently being used
         self.health_check_url = health_check_url
+        self.process = process
         self.tunnel_manager = _forward_tunnel(port, client=client)
         self.stopped = False
+        self.num_failures = 0
+        self.task_id = os.environ["MODAL_TASK_ID"]
+    async def check_port_connection(self, process: Optional[subprocess.Popen], timeout: int = 10):
+        import socket
+        start_time = time.monotonic()
+        while time.monotonic() - start_time < timeout:
+            try:
+                if process is not None and process.poll() is not None:
+                    return Exception(f"Process {process.pid} exited with code {process.returncode}")
+                with socket.create_connection(("localhost", self.port), timeout=1):
+                    return
+            except (ConnectionRefusedError, OSError):
+                await asyncio.sleep(0.1)
+        return Exception(f"Waited too long for port {self.port} to start accepting connections")
     async def _start(self):
         self.tunnel = await self.tunnel_manager.__aenter__()
         parsed_url = urlparse(self.tunnel.url)
         host = parsed_url.hostname
         port = parsed_url.port or 443
         self.heartbeat_task = asyncio.create_task(self._run_heartbeat(host, port))
+        self.drain_task = asyncio.create_task(self._drain_container())
+    async def _drain_container(self):
+        """
+        Background task that checks if we've encountered too many failures and drains the container if so.
+        """
+        while True:
+            try:
+                # Check if the container should be drained (e.g., too many failures)
+                if self.num_failures > MAX_FAILURES:
+                    logger.warning(
+                        f"[Modal Flash] Draining task {self.task_id} on {self.tunnel.url} due to too many failures."
+                    )
+                    await self.stop()
+                    # handle close upon container exit
+                    if self.task_id:
+                        await self.client.stub.ContainerStop(api_pb2.ContainerStopRequest(task_id=self.task_id))
+                    return
+            except asyncio.CancelledError:
+                logger.warning("[Modal Flash] Shutting down...")
+                return
+            except Exception as e:
+                logger.error(f"[Modal Flash] Error draining container: {e}")
+                await asyncio.sleep(1)
+            try:
+                await asyncio.sleep(1)
+            except asyncio.CancelledError:
+                logger.warning("[Modal Flash] Shutting down...")
+                return
     async def _run_heartbeat(self, host: str, port: int):
         first_registration = True
         while True:
             try:
+                await self.check_port_connection(process=self.process)
                 resp = await self.client.stub.FlashContainerRegister(
                     api_pb2.FlashContainerRegisterRequest(
                         priority=10,
@@ -50,14 +111,25 @@ class _FlashManager:
                     ),
                     timeout=10,
                 )
+                self.num_failures = 0
                 if first_registration:
-                    logger.warning(f"[Modal Flash] Listening at {resp.url} over {self.tunnel.url}")
+                    logger.warning(
+                        f"[Modal Flash] Listening at {resp.url} over {self.tunnel.url} for task_id {self.task_id}"
+                    )
                     first_registration = False
             except asyncio.CancelledError:
                 logger.warning("[Modal Flash] Shutting down...")
                 break
             except Exception as e:
                 logger.error(f"[Modal Flash] Heartbeat failed: {e}")
+                self.num_failures += 1
+                logger.error(
+                    f"[Modal Flash] Deregistering container {self.tunnel.url}, num_failures: {self.num_failures}"
+                )
+                await retry_transient_errors(
+                    self.client.stub.FlashContainerDeregister,
+                    api_pb2.FlashContainerDeregisterRequest(),
+                )
             try:
                 await asyncio.sleep(1)
@@ -94,16 +166,17 @@ FlashManager = synchronize_api(_FlashManager)
 @synchronizer.create_blocking
-async def flash_forward(port: int, health_check_url: Optional[str] = None) -> _FlashManager:
+async def flash_forward(
+    port: int, process: Optional[subprocess.Popen] = None, health_check_url: Optional[str] = None
+) -> _FlashManager:
     """
     Forward a port to the Modal Flash service, exposing that port as a stable web endpoint.
     This is a highly experimental method that can break or be removed at any time without warning.
     Do not use this method unless explicitly instructed to do so by Modal support.
     """
     client = await _Client.from_env()
-    manager = _FlashManager(client, port, health_check_url)
+    manager = _FlashManager(client, port, process=process, health_check_url=health_check_url)
     await manager._start()
     return manager
@@ -127,6 +200,8 @@ class _FlashPrometheusAutoscaler:
         scale_down_stabilization_window_seconds: int,
         autoscaling_interval_seconds: int,
     ):
+        import aiohttp
         if scale_up_stabilization_window_seconds > self._max_window_seconds:
             raise InvalidError(
                 f"scale_up_stabilization_window_seconds must be less than or equal to {self._max_window_seconds}"
@@ -138,8 +213,6 @@ class _FlashPrometheusAutoscaler:
         if target_metric_value <= 0:
             raise InvalidError("target_metric_value must be greater than 0")
-        import aiohttp
         self.client = client
         self.app_name = app_name
         self.cls_name = cls_name
@@ -200,7 +273,10 @@ class _FlashPrometheusAutoscaler:
                     if timestamp >= autoscaling_time - self._max_window_seconds
                 ]
-                current_target_containers = await self._compute_target_containers(current_replicas)
+                if self.metrics_endpoint == "internal":
+                    current_target_containers = await self._compute_target_containers_internal(current_replicas)
+                else:
+                    current_target_containers = await self._compute_target_containers_prometheus(current_replicas)
                 autoscaling_decisions.append((autoscaling_time, current_target_containers))
                 actual_target_containers = self._make_scaling_decision(
@@ -213,8 +289,8 @@ class _FlashPrometheusAutoscaler:
                 )
                 logger.warning(
-                    f"[Modal Flash] Scaling to {actual_target_containers} containers. Autoscaling decision "
-                    f"made in {time.time() - autoscaling_time} seconds."
+                    f"[Modal Flash] Scaling to {actual_target_containers=} containers. "
+                    f" Autoscaling decision made in {time.time() - autoscaling_time} seconds."
                 )
                 await self.autoscaling_decisions_dict.put(
@@ -223,9 +299,7 @@ class _FlashPrometheusAutoscaler:
                 )
                 await self.autoscaling_decisions_dict.put("current_replicas", actual_target_containers)
-                await self.cls.update_autoscaler(
-                    min_containers=actual_target_containers,
-                )
+                await self.cls.update_autoscaler(min_containers=actual_target_containers)
                 if time.time() - autoscaling_time < self.autoscaling_interval_seconds:
                     await asyncio.sleep(self.autoscaling_interval_seconds - (time.time() - autoscaling_time))
@@ -238,7 +312,55 @@ class _FlashPrometheusAutoscaler:
                 logger.error(traceback.format_exc())
                 await asyncio.sleep(self.autoscaling_interval_seconds)
-    async def _compute_target_containers(self, current_replicas: int) -> int:
+    async def _compute_target_containers_internal(self, current_replicas: int) -> int:
+        """
+        Gets internal metrics from container to autoscale up or down.
+        """
+        containers = await self._get_all_containers()
+        if len(containers) > current_replicas:
+            logger.info(
+                f"[Modal Flash] Current replicas {current_replicas} is less than the number of containers "
+                f"{len(containers)}. Setting current_replicas = num_containers."
+            )
+            current_replicas = len(containers)
+        if current_replicas == 0:
+            return 1
+        internal_metrics_list = []
+        for container in containers:
+            internal_metric = await self._get_container_metrics(container.task_id)
+            if internal_metric is None:
+                continue
+            internal_metrics_list.append(getattr(internal_metric.metrics, self.target_metric))
+        if not internal_metrics_list:
+            return current_replicas
+        avg_internal_metric = sum(internal_metrics_list) / len(internal_metrics_list)
+        scale_factor = avg_internal_metric / self.target_metric_value
+        desired_replicas = current_replicas
+        if scale_factor > 1 + self.scale_up_tolerance:
+            desired_replicas = math.ceil(current_replicas * scale_factor)
+        elif scale_factor < 1 - self.scale_down_tolerance:
+            desired_replicas = math.ceil(current_replicas * scale_factor)
+        logger.warning(
+            f"[Modal Flash] Current replicas: {current_replicas}, "
+            f"avg internal metric `{self.target_metric}`: {avg_internal_metric}, "
+            f"target internal metric value: {self.target_metric_value}, "
+            f"scale factor: {scale_factor}, "
+            f"desired replicas: {desired_replicas}"
+        )
+        desired_replicas = max(1, min(desired_replicas, self.max_containers or 1000))
+        return desired_replicas
+    async def _compute_target_containers_prometheus(self, current_replicas: int) -> int:
+        # current_replicas is the number of live containers + cold starting containers (not yet live)
+        # containers is the number of live containers that are registered in flash dns
         containers = await self._get_all_containers()
         if len(containers) > current_replicas:
             logger.info(
@@ -253,6 +375,7 @@ class _FlashPrometheusAutoscaler:
         target_metric = self.target_metric
         target_metric_value = float(self.target_metric_value)
+        # Gets metrics from prometheus
         sum_metric = 0
         containers_with_metrics = 0
         container_metrics_list = await asyncio.gather(
@@ -271,11 +394,17 @@ class _FlashPrometheusAutoscaler:
             sum_metric += container_metrics[target_metric][0].value
             containers_with_metrics += 1
+        # n_containers_missing_metric is the number of unhealthy containers + number of cold starting containers
         n_containers_missing_metric = current_replicas - containers_with_metrics
+        # n_containers_unhealthy is the number of live containers that are not emitting metrics i.e. unhealthy
+        n_containers_unhealthy = len(containers) - containers_with_metrics
+        # Scale up assuming that every unhealthy container is at 2x the target metric value.
+        scale_up_target_metric_value = (sum_metric + n_containers_unhealthy * target_metric_value) / (
+            (containers_with_metrics + n_containers_unhealthy) or 1
+        )
-        # Scale up / down conservatively: Any container that is missing the metric is assumed to be at the minimum
-        # value of the metric when scaling up and the maximum value of the metric when scaling down.
-        scale_up_target_metric_value = sum_metric / current_replicas
+        # Scale down assuming that every container (including cold starting containers) are at the target metric value.
         scale_down_target_metric_value = (
             sum_metric + n_containers_missing_metric * target_metric_value
         ) / current_replicas
@@ -290,9 +419,14 @@ class _FlashPrometheusAutoscaler:
             desired_replicas = math.ceil(current_replicas * scale_down_ratio)
         logger.warning(
-            f"[Modal Flash] Current replicas: {current_replicas}, target metric value: {target_metric_value}, "
-            f"current sum of metric values: {sum_metric}, number of containers missing metric: "
-            f"{n_containers_missing_metric}, scale up ratio: {scale_up_ratio}, scale down ratio: {scale_down_ratio}, "
+            f"[Modal Flash] Current replicas: {current_replicas}, "
+            f"target metric value: {target_metric_value}, "
+            f"current sum of metric values: {sum_metric}, "
+            f"number of containers with metrics: {containers_with_metrics}, "
+            f"number of containers unhealthy: {n_containers_unhealthy}, "
+            f"number of containers missing metric (includes unhealthy): {n_containers_missing_metric}, "
+            f"scale up ratio: {scale_up_ratio}, "
+            f"scale down ratio: {scale_down_ratio}, "
             f"desired replicas: {desired_replicas}"
         )
@@ -303,20 +437,42 @@ class _FlashPrometheusAutoscaler:
         # Fetch the metrics from the endpoint
         try:
-            response = await self.http_client.get(url)
+            response = await self.http_client.get(url, timeout=3)
             response.raise_for_status()
+        except asyncio.TimeoutError:
+            logger.warning(f"[Modal Flash] Timeout getting metrics from {url}")
+            return None
         except Exception as e:
             logger.warning(f"[Modal Flash] Error getting metrics from {url}: {e}")
             return None
+        # Read body with timeout/error handling and parse Prometheus metrics
+        try:
+            text_body = await response.text()
+        except asyncio.TimeoutError:
+            logger.warning(f"[Modal Flash] Timeout reading metrics body from {url}")
+            return None
+        except Exception as e:
+            logger.warning(f"[Modal Flash] Error reading metrics body from {url}: {e}")
+            return None
         # Parse the text-based Prometheus metrics format
         metrics: dict[str, list[Sample]] = defaultdict(list)
-        for family in text_string_to_metric_families(await response.text()):
+        for family in text_string_to_metric_families(text_body):
             for sample in family.samples:
                 metrics[sample.name] += [sample]
         return metrics
+    async def _get_container_metrics(self, container_id: str) -> Optional[api_pb2.TaskGetAutoscalingMetricsResponse]:
+        req = api_pb2.TaskGetAutoscalingMetricsRequest(task_id=container_id)
+        try:
+            resp = await retry_transient_errors(self.client.stub.TaskGetAutoscalingMetrics, req)
+            return resp
+        except Exception as e:
+            logger.warning(f"[Modal Flash] Error getting metrics for container {container_id}: {e}")
+            return None
     async def _get_all_containers(self):
         req = api_pb2.FlashContainerListRequest(function_id=self.fn.object_id)
         resp = await retry_transient_errors(self.client.stub.FlashContainerList, req)
@@ -395,10 +551,14 @@ async def flash_prometheus_autoscaler(
     app_name: str,
     cls_name: str,
     # Endpoint to fetch metrics from. Must be in Prometheus format. Example: "/metrics"
+    # If metrics_endpoint is "internal", we will use containers' internal metrics to autoscale instead.
     metrics_endpoint: str,
     # Target metric to autoscale on. Example: "vllm:num_requests_running"
+    # If metrics_endpoint is "internal", target_metrics options are: [cpu_usage_percent, memory_usage_percent]
     target_metric: str,
     # Target metric value. Example: 25
+    # If metrics_endpoint is "internal", target_metric_value is a percentage value between 0.1 and 1.0 (inclusive),
+    # indicating container's usage of that metric.
     target_metric_value: float,
     min_containers: Optional[int] = None,
     max_containers: Optional[int] = None,

modal 1.1.3.dev7__py3-none-any.whl → 1.1.4__py3-none-any.whl

modal 1.1.3.dev7__py3-none-any.whl → 1.1.4__py3-none-any.whl