modal 1.1.0.tar.gz → 1.1.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {modal-1.1.0 → modal-1.1.1}/PKG-INFO +2 -2
- {modal-1.1.0 → modal-1.1.1}/modal/__main__.py +2 -2
- {modal-1.1.0 → modal-1.1.1}/modal/_clustered_functions.py +3 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_clustered_functions.pyi +3 -2
- {modal-1.1.0 → modal-1.1.1}/modal/_functions.py +78 -26
- {modal-1.1.0 → modal-1.1.1}/modal/_object.py +9 -1
- {modal-1.1.0 → modal-1.1.1}/modal/_output.py +14 -25
- modal-1.1.1/modal/_runtime/gpu_memory_snapshot.py +303 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/async_utils.py +6 -4
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/auth_token_manager.py +1 -1
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/blob_utils.py +16 -21
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/function_utils.py +16 -4
- modal-1.1.1/modal/_utils/time_utils.py +19 -0
- {modal-1.1.0 → modal-1.1.1}/modal/app.py +0 -4
- {modal-1.1.0 → modal-1.1.1}/modal/app.pyi +0 -4
- {modal-1.1.0 → modal-1.1.1}/modal/cli/_traceback.py +3 -2
- {modal-1.1.0 → modal-1.1.1}/modal/cli/app.py +4 -4
- {modal-1.1.0 → modal-1.1.1}/modal/cli/cluster.py +4 -4
- {modal-1.1.0 → modal-1.1.1}/modal/cli/config.py +2 -2
- {modal-1.1.0 → modal-1.1.1}/modal/cli/container.py +2 -2
- {modal-1.1.0 → modal-1.1.1}/modal/cli/dict.py +4 -4
- {modal-1.1.0 → modal-1.1.1}/modal/cli/entry_point.py +2 -2
- {modal-1.1.0 → modal-1.1.1}/modal/cli/import_refs.py +3 -3
- {modal-1.1.0 → modal-1.1.1}/modal/cli/network_file_system.py +8 -9
- {modal-1.1.0 → modal-1.1.1}/modal/cli/profile.py +2 -2
- {modal-1.1.0 → modal-1.1.1}/modal/cli/queues.py +5 -5
- {modal-1.1.0 → modal-1.1.1}/modal/cli/secret.py +5 -5
- {modal-1.1.0 → modal-1.1.1}/modal/cli/utils.py +3 -4
- {modal-1.1.0 → modal-1.1.1}/modal/cli/volume.py +8 -9
- {modal-1.1.0 → modal-1.1.1}/modal/client.py +8 -1
- {modal-1.1.0 → modal-1.1.1}/modal/client.pyi +9 -2
- {modal-1.1.0 → modal-1.1.1}/modal/container_process.py +2 -2
- {modal-1.1.0 → modal-1.1.1}/modal/dict.py +47 -3
- {modal-1.1.0 → modal-1.1.1}/modal/dict.pyi +55 -0
- {modal-1.1.0 → modal-1.1.1}/modal/exception.py +4 -0
- {modal-1.1.0 → modal-1.1.1}/modal/experimental/__init__.py +1 -1
- {modal-1.1.0 → modal-1.1.1}/modal/experimental/flash.py +18 -2
- {modal-1.1.0 → modal-1.1.1}/modal/experimental/flash.pyi +19 -0
- {modal-1.1.0 → modal-1.1.1}/modal/functions.pyi +0 -1
- {modal-1.1.0 → modal-1.1.1}/modal/image.py +26 -10
- {modal-1.1.0 → modal-1.1.1}/modal/image.pyi +12 -4
- {modal-1.1.0 → modal-1.1.1}/modal/mount.py +1 -1
- {modal-1.1.0 → modal-1.1.1}/modal/object.pyi +4 -0
- {modal-1.1.0 → modal-1.1.1}/modal/parallel_map.py +432 -4
- {modal-1.1.0 → modal-1.1.1}/modal/parallel_map.pyi +28 -0
- {modal-1.1.0 → modal-1.1.1}/modal/queue.py +46 -3
- {modal-1.1.0 → modal-1.1.1}/modal/queue.pyi +53 -0
- {modal-1.1.0 → modal-1.1.1}/modal/sandbox.py +105 -25
- {modal-1.1.0 → modal-1.1.1}/modal/sandbox.pyi +108 -18
- {modal-1.1.0 → modal-1.1.1}/modal/secret.py +48 -5
- {modal-1.1.0 → modal-1.1.1}/modal/secret.pyi +55 -0
- {modal-1.1.0 → modal-1.1.1}/modal/token_flow.py +3 -3
- {modal-1.1.0 → modal-1.1.1}/modal/volume.py +49 -18
- {modal-1.1.0 → modal-1.1.1}/modal/volume.pyi +50 -8
- {modal-1.1.0 → modal-1.1.1}/modal.egg-info/PKG-INFO +2 -2
- {modal-1.1.0 → modal-1.1.1}/modal.egg-info/SOURCES.txt +8 -8
- {modal-1.1.0 → modal-1.1.1}/modal.egg-info/requires.txt +1 -1
- {modal-1.1.0 → modal-1.1.1}/modal_proto/api.proto +140 -14
- {modal-1.1.0 → modal-1.1.1}/modal_proto/api_grpc.py +80 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/api_pb2.py +927 -756
- {modal-1.1.0 → modal-1.1.1}/modal_proto/api_pb2.pyi +488 -34
- {modal-1.1.0 → modal-1.1.1}/modal_proto/api_pb2_grpc.py +166 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/api_pb2_grpc.pyi +52 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/modal_api_grpc.py +5 -0
- {modal-1.1.0 → modal-1.1.1}/modal_version/__init__.py +1 -1
- {modal-1.1.0 → modal-1.1.1}/pyproject.toml +2 -2
- modal-1.1.0/modal/_runtime/gpu_memory_snapshot.py +0 -199
- modal-1.1.0/modal/_utils/time_utils.py +0 -15
- {modal-1.1.0 → modal-1.1.1}/LICENSE +0 -0
- {modal-1.1.0 → modal-1.1.1}/README.md +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_container_entrypoint.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_ipython.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_location.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_partial_function.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_pty.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_resolver.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_resources.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_runtime/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_runtime/asgi.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_runtime/container_io_manager.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_runtime/container_io_manager.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_runtime/execution_context.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_runtime/execution_context.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_runtime/telemetry.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_runtime/user_code_imports.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_serialization.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_traceback.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_tunnel.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_tunnel.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_type_manager.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/app_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/bytes_io_segment_payload.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/deprecation.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/docker_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/git_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/grpc_testing.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/grpc_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/hash_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/http_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/jwt_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/logger.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/mount_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/name_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/package_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/pattern_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/rand_pb_testing.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_utils/shell_utils.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_vendor/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_vendor/a2wsgi_wsgi.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_vendor/cloudpickle.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_vendor/tblib.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/_watcher.py +0 -0
- {modal-1.1.0/modal/requirements → modal-1.1.1/modal/builder}/2023.12.312.txt +0 -0
- {modal-1.1.0/modal/requirements → modal-1.1.1/modal/builder}/2023.12.txt +0 -0
- {modal-1.1.0/modal/requirements → modal-1.1.1/modal/builder}/2024.04.txt +0 -0
- {modal-1.1.0/modal/requirements → modal-1.1.1/modal/builder}/2024.10.txt +0 -0
- {modal-1.1.0/modal/requirements → modal-1.1.1/modal/builder}/2025.06.txt +0 -0
- {modal-1.1.0/modal/requirements → modal-1.1.1/modal/builder}/PREVIEW.txt +0 -0
- {modal-1.1.0/modal/requirements → modal-1.1.1/modal/builder}/README.md +0 -0
- {modal-1.1.0/modal/requirements → modal-1.1.1/modal/builder}/base-images.json +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/call_graph.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/_download.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/environment.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/launch.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/programs/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/programs/run_jupyter.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/programs/vscode.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/run.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cli/token.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cloud_bucket_mount.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cloud_bucket_mount.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cls.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/cls.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/config.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/container_process.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/environments.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/environments.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/experimental/ipython.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/file_io.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/file_io.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/file_pattern_matcher.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/functions.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/gpu.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/io_streams.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/io_streams.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/mount.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/network_file_system.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/network_file_system.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/object.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/output.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/partial_function.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/partial_function.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/proxy.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/proxy.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/py.typed +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/retries.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/runner.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/runner.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/running_app.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/schedule.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/scheduler_placement.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/serving.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/serving.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/snapshot.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/snapshot.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/stream_type.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal/token_flow.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal.egg-info/dependency_links.txt +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal.egg-info/entry_points.txt +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal.egg-info/top_level.txt +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_docs/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_docs/gen_cli_docs.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_docs/gen_reference_docs.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_docs/mdmd/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_docs/mdmd/mdmd.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_docs/mdmd/signatures.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/__init__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/modal_options_grpc.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/options.proto +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/options_grpc.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/options_pb2.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/options_pb2.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/options_pb2_grpc.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/options_pb2_grpc.pyi +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_proto/py.typed +0 -0
- {modal-1.1.0 → modal-1.1.1}/modal_version/__main__.py +0 -0
- {modal-1.1.0 → modal-1.1.1}/setup.cfg +0 -0
--- modal-1.1.0/PKG-INFO
+++ modal-1.1.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: modal
-Version: 1.1.0
+Version: 1.1.1
 Summary: Python client library for Modal
 Author-email: Modal Labs <support@modal.com>
 License: Apache-2.0
@@ -18,7 +18,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiohttp
 Requires-Dist: certifi
-Requires-Dist: click~=8.1
+Requires-Dist: click~=8.1
 Requires-Dist: grpclib<0.4.9,>=0.4.7
 Requires-Dist: protobuf!=4.24.0,<7.0,>=3.19
 Requires-Dist: rich>=12.0.0
--- modal-1.1.0/modal/__main__.py
+++ modal-1.1.1/modal/__main__.py
@@ -1,6 +1,7 @@
 # Copyright Modal Labs 2022
 import sys
 
+from ._output import make_console
 from ._traceback import reduce_traceback_to_user_code
 from .cli._traceback import highlight_modal_warnings, setup_rich_traceback
 from .cli.entry_point import entrypoint_cli
@@ -35,7 +36,6 @@ def main():
         raise
 
     from grpclib import GRPCError, Status
-    from rich.console import Console
     from rich.panel import Panel
     from rich.text import Text
 
@@ -68,7 +68,7 @@ def main():
     if notes := getattr(exc, "__notes__", []):
         content = f"{content}\n\nNote: {' '.join(notes)}"
 
-    console = Console(stderr=True)
+    console = make_console(stderr=True)
     panel = Panel(Text(content), title=title, title_align="left", border_style="red")
     console.print(panel, highlight=False)
     sys.exit(1)
--- modal-1.1.0/modal/_clustered_functions.py
+++ modal-1.1.1/modal/_clustered_functions.py
@@ -15,6 +15,7 @@ from modal_proto import api_pb2
 class ClusterInfo:
     rank: int
     container_ips: list[str]
+    container_ipv4_ips: list[str]
 
 
 cluster_info: Optional[ClusterInfo] = None
@@ -69,11 +70,13 @@ async def _initialize_clustered_function(client: _Client, task_id: str, world_si
         cluster_info = ClusterInfo(
             rank=resp.cluster_rank,
             container_ips=resp.container_ips,
+            container_ipv4_ips=resp.container_ipv4_ips,
         )
     else:
         cluster_info = ClusterInfo(
             rank=0,
             container_ips=[container_ip],
+            container_ipv4_ips=[],  # No IPv4 IPs for single-node
         )
 
 
--- modal-1.1.0/modal/_clustered_functions.pyi
+++ modal-1.1.1/modal/_clustered_functions.pyi
@@ -3,12 +3,13 @@ import typing
 import typing_extensions
 
 class ClusterInfo:
-    """ClusterInfo(rank: int, container_ips: list[str])"""
+    """ClusterInfo(rank: int, container_ips: list[str], container_ipv4_ips: list[str])"""
 
     rank: int
     container_ips: list[str]
+    container_ipv4_ips: list[str]
 
-    def __init__(self, rank: int, container_ips: list[str]) -> None:
+    def __init__(self, rank: int, container_ips: list[str], container_ipv4_ips: list[str]) -> None:
         """Initialize self. See help(type(self)) for accurate signature."""
         ...
 
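For context, the new `container_ipv4_ips` field simply rides along on the existing `ClusterInfo` dataclass, and the single-node path leaves it empty. Below is a minimal, self-contained sketch of how consuming code might use it; the local `ClusterInfo` definition and the `peer_addresses` helper are illustrative stand-ins, not Modal's API (the real class lives in `modal/_clustered_functions.py`).

# Illustrative sketch only: mirrors the ClusterInfo shape from the .pyi above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ClusterInfo:
    rank: int
    container_ips: list[str]
    container_ipv4_ips: list[str]  # new in 1.1.1


def peer_addresses(info: Optional[ClusterInfo]) -> list[str]:
    # Prefer the new IPv4 list when the server populated it; otherwise fall back
    # to container_ips (single-node containers get an empty IPv4 list per the diff).
    if info is None:
        return []
    return info.container_ipv4_ips or info.container_ips


single_node = ClusterInfo(rank=0, container_ips=["10.0.0.7"], container_ipv4_ips=[])
assert peer_addresses(single_node) == ["10.0.0.7"]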
--- modal-1.1.0/modal/_functions.py
+++ modal-1.1.1/modal/_functions.py
@@ -75,6 +75,7 @@ from .parallel_map import (
     _for_each_sync,
     _map_async,
    _map_invocation,
+    _map_invocation_inputplane,
     _map_sync,
     _spawn_map_async,
     _spawn_map_sync,
@@ -399,7 +400,8 @@ class _InputPlaneInvocation:
                 parent_input_id=current_input_id() or "",
                 input=input_item,
             )
-
+
+            metadata = await client.get_input_plane_metadata(input_plane_region)
             response = await retry_transient_errors(stub.AttemptStart, request, metadata=metadata)
             attempt_token = response.attempt_token
 
@@ -415,7 +417,7 @@ class _InputPlaneInvocation:
             timeout_secs=OUTPUTS_TIMEOUT,
             requested_at=time.time(),
         )
-        metadata = await self.
+        metadata = await self.client.get_input_plane_metadata(self.input_plane_region)
         await_response: api_pb2.AttemptAwaitResponse = await retry_transient_errors(
             self.stub.AttemptAwait,
             await_request,
@@ -451,6 +453,33 @@ class _InputPlaneInvocation:
             await_response.output.result, await_response.output.data_format, control_plane_stub, self.client
         )
 
+    async def run_generator(self):
+        items_received = 0
+        # populated when self.run_function() completes
+        items_total: Union[int, None] = None
+        async with aclosing(
+            async_merge(
+                _stream_function_call_data(
+                    self.client,
+                    self.stub,
+                    "",
+                    variant="data_out",
+                    attempt_token=self.attempt_token,
+                ),
+                callable_to_agen(self.run_function),
+            )
+        ) as streamer:
+            async for item in streamer:
+                if isinstance(item, api_pb2.GeneratorDone):
+                    items_total = item.items_total
+                else:
+                    yield item
+                    items_received += 1
+                    # The comparison avoids infinite loops if a non-deterministic generator is retried
+                    # and produces less data in the second run than what was already sent.
+                    if items_total is not None and items_received >= items_total:
+                        break
+
     @staticmethod
     async def _get_metadata(input_plane_region: str, client: _Client) -> list[tuple[str, str]]:
         if not input_plane_region:
@@ -600,7 +629,6 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
         experimental_options: Optional[dict[str, str]] = None,
         _experimental_proxy_ip: Optional[str] = None,
         _experimental_custom_scaling_factor: Optional[float] = None,
-        _experimental_enable_gpu_snapshot: bool = False,
     ) -> "_Function":
         """mdmd:hidden"""
         # Needed to avoid circular imports
@@ -901,7 +929,6 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
             _experimental_concurrent_cancellations=True,
             _experimental_proxy_ip=_experimental_proxy_ip,
             _experimental_custom_scaling=_experimental_custom_scaling_factor is not None,
-            _experimental_enable_gpu_snapshot=_experimental_enable_gpu_snapshot,
             # --- These are deprecated in favor of autoscaler_settings
             warm_pool_size=min_containers or 0,
             concurrency_limit=max_containers or 0,
@@ -938,7 +965,6 @@ class _Function(typing.Generic[P, ReturnType, OriginalReturnType], _Object, type
             _experimental_group_size=function_definition._experimental_group_size,
             _experimental_buffer_containers=function_definition._experimental_buffer_containers,
             _experimental_custom_scaling=function_definition._experimental_custom_scaling,
-            _experimental_enable_gpu_snapshot=_experimental_enable_gpu_snapshot,
             _experimental_proxy_ip=function_definition._experimental_proxy_ip,
             snapshot_debug=function_definition.snapshot_debug,
             runtime_perf_record=function_definition.runtime_perf_record,
@@ -1487,20 +1513,35 @@ Use the `Function.get_web_url()` method instead.
         else:
             count_update_callback = None
 
-        async with aclosing(
-            _map_invocation(
-                self,
-                input_queue,
-                self.client,
-                order_outputs,
-                return_exceptions,
-                wrap_returned_exceptions,
-                count_update_callback,
-                api_pb2.FUNCTION_CALL_INVOCATION_TYPE_SYNC,
-            )
-        ) as stream:
-            async for item in stream:
-                yield item
+        if self._input_plane_url:
+            async with aclosing(
+                _map_invocation_inputplane(
+                    self,
+                    input_queue,
+                    self.client,
+                    order_outputs,
+                    return_exceptions,
+                    wrap_returned_exceptions,
+                    count_update_callback,
+                )
+            ) as stream:
+                async for item in stream:
+                    yield item
+        else:
+            async with aclosing(
+                _map_invocation(
+                    self,
+                    input_queue,
+                    self.client,
+                    order_outputs,
+                    return_exceptions,
+                    wrap_returned_exceptions,
+                    count_update_callback,
+                    api_pb2.FUNCTION_CALL_INVOCATION_TYPE_SYNC,
+                )
+            ) as stream:
+                async for item in stream:
+                    yield item
 
     async def _call_function(self, args, kwargs) -> ReturnType:
         invocation: Union[_Invocation, _InputPlaneInvocation]
@@ -1544,13 +1585,24 @@ Use the `Function.get_web_url()` method instead.
     @live_method_gen
     @synchronizer.no_input_translation
     async def _call_generator(self, args, kwargs):
-        invocation = await _Invocation.create(
-            self,
-            args,
-            kwargs,
-            client=self.client,
-            function_call_invocation_type=api_pb2.FUNCTION_CALL_INVOCATION_TYPE_SYNC_LEGACY,
-        )
+        invocation: Union[_Invocation, _InputPlaneInvocation]
+        if self._input_plane_url:
+            invocation = await _InputPlaneInvocation.create(
+                self,
+                args,
+                kwargs,
+                client=self.client,
+                input_plane_url=self._input_plane_url,
+                input_plane_region=self._input_plane_region,
+            )
+        else:
+            invocation = await _Invocation.create(
+                self,
+                args,
+                kwargs,
+                client=self.client,
+                function_call_invocation_type=api_pb2.FUNCTION_CALL_INVOCATION_TYPE_SYNC_LEGACY,
+            )
         async for res in invocation.run_generator():
             yield res
 
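The new `_InputPlaneInvocation.run_generator` merges the function's data-out stream with the coroutine that drives the call (`async_merge`, `_stream_function_call_data`, `callable_to_agen` in the diff) and stops once it has yielded the number of items reported by the `GeneratorDone` sentinel. The following is a simplified, self-contained sketch of just that termination logic; the stream and sentinel here are faked with plain asyncio stand-ins and are not Modal's internals.

# Simplified sketch of the run_generator termination pattern shown above.
import asyncio
from dataclasses import dataclass
from typing import AsyncIterator, Union


@dataclass
class GeneratorDone:  # stand-in for api_pb2.GeneratorDone
    items_total: int


async def drain(stream: AsyncIterator[Union[bytes, GeneratorDone]]) -> list[bytes]:
    items_received = 0
    items_total: Union[int, None] = None  # populated when the sentinel arrives
    out: list[bytes] = []
    async for item in stream:
        if isinstance(item, GeneratorDone):
            items_total = item.items_total
        else:
            out.append(item)
            items_received += 1
            # Stop once everything the sentinel promised has been yielded; this
            # guards against retried, non-deterministic generators looping forever.
            if items_total is not None and items_received >= items_total:
                break
    return out


async def fake_stream() -> AsyncIterator[Union[bytes, GeneratorDone]]:
    yield b"a"
    yield GeneratorDone(items_total=2)  # sentinel can arrive before the last item
    yield b"b"
    yield b"c"  # never consumed: drain() breaks after two items


print(asyncio.run(drain(fake_stream())))  # [b'a', b'b']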
--- modal-1.1.0/modal/_object.py
+++ modal-1.1.1/modal/_object.py
@@ -48,6 +48,10 @@ class _Object:
     _is_hydrated: bool
     _is_rehydrated: bool
 
+    # Not all object subclasses have a meaningful "name" concept
+    # So whether they expose this is a matter of having a name property
+    _name: Optional[str]
+
     @classmethod
     def __init_subclass__(cls, type_prefix: Optional[str] = None):
         super().__init_subclass__()
@@ -68,6 +72,7 @@ class _Object:
         hydrate_lazily: bool = False,
         deps: Optional[Callable[..., Sequence["_Object"]]] = None,
         deduplication_key: Optional[Callable[[], Awaitable[Hashable]]] = None,
+        name: Optional[str] = None,
     ):
         self._local_uuid = str(uuid.uuid4())
         self._load = load
@@ -83,6 +88,8 @@ class _Object:
         self._is_hydrated = False
         self._is_rehydrated = False
 
+        self._name = name
+
         self._initialize_from_empty()
 
     def _unhydrate(self):
@@ -163,10 +170,11 @@ class _Object:
         hydrate_lazily: bool = False,
         deps: Optional[Callable[..., Sequence["_Object"]]] = None,
         deduplication_key: Optional[Callable[[], Awaitable[Hashable]]] = None,
+        name: Optional[str] = None,
     ):
         # TODO(erikbern): flip the order of the two first arguments
         obj = _Object.__new__(cls)
-        obj._init(rep, load, is_another_app, preload, hydrate_lazily, deps, deduplication_key)
+        obj._init(rep, load, is_another_app, preload, hydrate_lazily, deps, deduplication_key, name)
         return obj
 
     @staticmethod
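Per the comment added to `_Object`, the base class only stores `_name`; subclasses for which a name is meaningful expose it through a `name` property. A tiny hypothetical sketch of that pattern follows; the class names and the pared-down `_init` signature are illustrative only, not Modal's actual classes.

# Hypothetical sketch of the "_name stored on the base, name property on subclasses" pattern.
from typing import Optional


class _ObjectSketch:
    _name: Optional[str]

    def _init(self, name: Optional[str] = None) -> None:
        self._name = name  # always stored, even if the subclass never exposes it


class _NamedDictSketch(_ObjectSketch):
    @property
    def name(self) -> Optional[str]:
        return self._name


d = _NamedDictSketch()
d._init(name="my-dict")
print(d.name)  # my-dict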
--- modal-1.1.0/modal/_output.py
+++ modal-1.1.1/modal/_output.py
@@ -4,7 +4,6 @@ from __future__ import annotations
 import asyncio
 import contextlib
 import functools
-import io
 import platform
 import re
 import socket
@@ -32,7 +31,7 @@ from rich.progress import (
 from rich.spinner import Spinner
 from rich.text import Text
 
-from modal._utils.time_utils import
+from modal._utils.time_utils import timestamp_to_localized_str
 from modal_proto import api_pb2
 
 from ._utils.grpc_utils import RETRYABLE_GRPC_STATUS_CODES, retry_transient_errors
@@ -46,6 +45,16 @@ else:
     default_spinner = "dots"
 
 
+def make_console(*, stderr: bool = False, highlight: bool = True) -> Console:
+    """Create a rich Console tuned for Modal CLI output."""
+    return Console(
+        stderr=stderr,
+        highlight=highlight,
+        # CLI does not work with auto-detected Jupyter HTML display_data.
+        force_jupyter=False,
+    )
+
+
 class FunctionQueuingColumn(ProgressColumn):
     """Renders time elapsed, including task.completed as additional elapsed time."""
 
@@ -63,25 +72,6 @@ class FunctionQueuingColumn(ProgressColumn):
         return Text(str(delta), style="progress.elapsed")
 
 
-def download_progress_bar() -> Progress:
-    """
-    Returns a progress bar suitable for showing file download progress.
-    Requires passing a `path: str` data field for rendering.
-    """
-    return Progress(
-        TextColumn("[bold white]{task.fields[path]}", justify="right"),
-        BarColumn(bar_width=None),
-        "[progress.percentage]{task.percentage:>3.1f}%",
-        "•",
-        DownloadColumn(),
-        "•",
-        TransferSpeedColumn(),
-        "•",
-        TimeRemainingColumn(),
-        transient=True,
-    )
-
-
 class LineBufferedOutput:
     """Output stream that buffers lines and passes them to a callback."""
 
@@ -101,7 +91,7 @@ class LineBufferedOutput:
 
         if self._show_timestamps:
             for i in range(0, len(chunks) - 1, 2):
-                chunks[i] = f"{
+                chunks[i] = f"{timestamp_to_localized_str(log.timestamp)} {chunks[i]}"
 
         completed_lines = "".join(chunks[:-1])
         remainder = chunks[-1]
@@ -147,12 +137,11 @@ class OutputManager:
     def __init__(
         self,
         *,
-        stdout: io.TextIOWrapper | None = None,
         status_spinner_text: str = "Running app...",
         show_timestamps: bool = False,
     ):
-        self._stdout =
-        self._console =
+        self._stdout = sys.stdout
+        self._console = make_console(highlight=False)
         self._task_states = {}
         self._task_progress_items = {}
         self._current_render_group = None
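The common thread in the console changes is the new `make_console()` helper in `modal/_output.py`, which replaces ad hoc `rich.console.Console(...)` construction (see the `modal/__main__.py` hunks above and `OutputManager.__init__`) and pins `force_jupyter=False` so CLI output never falls back to Jupyter HTML rendering. The sketch below reproduces the helper from the diff as standalone code together with a call-site in the style of `__main__.py`; the `report_error` wrapper is only for illustration.

# Standalone sketch of make_console() from modal/_output.py and its call-site pattern.
from rich.console import Console
from rich.panel import Panel
from rich.text import Text


def make_console(*, stderr: bool = False, highlight: bool = True) -> Console:
    """Create a rich Console tuned for Modal CLI output."""
    return Console(
        stderr=stderr,
        highlight=highlight,
        # CLI does not work with auto-detected Jupyter HTML display_data.
        force_jupyter=False,
    )


def report_error(title: str, content: str) -> None:
    # Mirrors modal/__main__.py: errors are rendered to stderr via the shared helper.
    console = make_console(stderr=True)
    panel = Panel(Text(content), title=title, title_align="left", border_style="red")
    console.print(panel, highlight=False)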
--- /dev/null
+++ modal-1.1.1/modal/_runtime/gpu_memory_snapshot.py
@@ -0,0 +1,303 @@
+# Copyright Modal Labs 2022
+#
+# This module provides a simple interface for creating GPU memory snapshots,
+# providing a convenient interface to `cuda-checkpoint` [1]. This is intended
+# to be used in conjunction with memory snapshots.
+#
+# [1] https://github.com/NVIDIA/cuda-checkpoint
+
+import subprocess
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import List, Optional
+
+from modal.config import config, logger
+
+CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
+
+
+class CudaCheckpointState(Enum):
+    """State representation from the CUDA API [1].
+
+    [1] https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html"""
+
+    RUNNING = "running"
+    LOCKED = "locked"
+    CHECKPOINTED = "checkpointed"
+    FAILED = "failed"
+
+
+class CudaCheckpointException(Exception):
+    """Exception raised for CUDA checkpoint operations."""
+
+    pass
+
+
+@dataclass
+class CudaCheckpointProcess:
+    """Contains a reference to a PID with active CUDA session. This also provides
+    methods for checkpointing and restoring GPU memory."""
+
+    pid: int
+    state: CudaCheckpointState
+
+    def toggle(self, target_state: CudaCheckpointState, timeout_secs: float = 5 * 60.0) -> None:
+        """Toggle CUDA checkpoint state for current process, moving GPU memory to the
+        CPU and back depending on the current process state when called.
+        """
+        logger.debug(f"PID: {self.pid} Toggling CUDA checkpoint state to {target_state.value}")
+
+        start_time = time.monotonic()
+        retry_count = 0
+        max_retries = 3
+
+        while self._should_continue_toggle(target_state, start_time, timeout_secs):
+            try:
+                self._execute_toggle_command()
+                # Use exponential backoff for retries
+                sleep_time = min(0.1 * (2**retry_count), 1.0)
+                time.sleep(sleep_time)
+                retry_count = 0
+            except CudaCheckpointException as e:
+                retry_count += 1
+                if retry_count >= max_retries:
+                    raise CudaCheckpointException(
+                        f"PID: {self.pid} Failed to toggle state after {max_retries} retries: {e}"
+                    )
+                logger.debug(f"PID: {self.pid} Retry {retry_count}/{max_retries} after error: {e}")
+                time.sleep(0.5 * retry_count)
+
+        logger.debug(f"PID: {self.pid} Target state {target_state.value} reached")
+
+    def _should_continue_toggle(
+        self, target_state: CudaCheckpointState, start_time: float, timeout_secs: float
+    ) -> bool:
+        """Check if toggle operation should continue based on current state and timeout."""
+        self.refresh_state()
+
+        if self.state == target_state:
+            return False
+
+        if self.state == CudaCheckpointState.FAILED:
+            raise CudaCheckpointException(f"PID: {self.pid} CUDA process state is {self.state}")
+
+        elapsed = time.monotonic() - start_time
+        if elapsed >= timeout_secs:
+            raise CudaCheckpointException(
+                f"PID: {self.pid} Timeout after {elapsed:.2f}s waiting for state {target_state.value}. "
+                f"Current state: {self.state}"
+            )
+
+        return True
+
+    def _execute_toggle_command(self) -> None:
+        """Execute the cuda-checkpoint toggle command."""
+        try:
+            _ = subprocess.run(
+                [CUDA_CHECKPOINT_PATH, "--toggle", "--pid", str(self.pid)],
+                check=True,
+                capture_output=True,
+                text=True,
+                timeout=30,
+            )
+            logger.debug(f"PID: {self.pid} Successfully toggled CUDA checkpoint state")
+        except subprocess.CalledProcessError as e:
+            error_msg = f"PID: {self.pid} Failed to toggle CUDA checkpoint state: {e.stderr}"
+            logger.debug(error_msg)
+            raise CudaCheckpointException(error_msg)
+        except subprocess.TimeoutExpired:
+            error_msg = f"PID: {self.pid} Toggle command timed out"
+            logger.debug(error_msg)
+            raise CudaCheckpointException(error_msg)
+
+    def refresh_state(self) -> None:
+        """Refreshes the current CUDA checkpoint state for this process."""
+        try:
+            result = subprocess.run(
+                [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(self.pid)],
+                check=True,
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+
+            state_str = result.stdout.strip().lower()
+            self.state = CudaCheckpointState(state_str)
+
+        except subprocess.CalledProcessError as e:
+            error_msg = f"PID: {self.pid} Failed to get CUDA checkpoint state: {e.stderr}"
+            logger.debug(error_msg)
+            raise CudaCheckpointException(error_msg)
+        except subprocess.TimeoutExpired:
+            error_msg = f"PID: {self.pid} Get state command timed out"
+            logger.debug(error_msg)
+            raise CudaCheckpointException(error_msg)
+
+
+class CudaCheckpointSession:
+    """Manages the checkpointing state of processes with active CUDA sessions."""
+
+    def __init__(self):
+        self.cuda_processes = self._get_cuda_pids()
+        if self.cuda_processes:
+            logger.debug(
+                f"Found {len(self.cuda_processes)} PID(s) with CUDA sessions: {[c.pid for c in self.cuda_processes]}"
+            )
+        else:
+            logger.debug("No CUDA sessions found.")
+
+    def _get_cuda_pids(self) -> List[CudaCheckpointProcess]:
+        """Iterates over all PIDs and identifies the ones that have running
+        CUDA sessions."""
+        cuda_pids: List[CudaCheckpointProcess] = []
+
+        # Get all active process IDs from /proc directory
+        proc_dir = Path("/proc")
+        if not proc_dir.exists():
+            raise CudaCheckpointException(
+                "OS does not have /proc path rendering it incompatible with GPU memory snapshots."
+            )
+
+        # Get all numeric directories (PIDs) from /proc
+        pid_dirs = [entry for entry in proc_dir.iterdir() if entry.name.isdigit()]
+
+        # Use ThreadPoolExecutor to check PIDs in parallel for better performance
+        with ThreadPoolExecutor(max_workers=min(50, len(pid_dirs))) as executor:
+            future_to_pid = {
+                executor.submit(self._check_cuda_session, int(entry.name)): int(entry.name) for entry in pid_dirs
+            }
+
+            for future in as_completed(future_to_pid):
+                pid = future_to_pid[future]
+                try:
+                    cuda_process = future.result()
+                    if cuda_process:
+                        cuda_pids.append(cuda_process)
+                except Exception as e:
+                    logger.debug(f"Error checking PID {pid}: {e}")
+
+        # Sort PIDs for ordered checkpointing
+        cuda_pids.sort(key=lambda x: x.pid)
+        return cuda_pids
+
+    def _check_cuda_session(self, pid: int) -> Optional[CudaCheckpointProcess]:
+        """Check if a specific PID has a CUDA session."""
+        try:
+            result = subprocess.run(
+                [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+
+            # If the command succeeds (return code 0), this PID has a CUDA session
+            if result.returncode == 0:
+                state_str = result.stdout.strip().lower()
+                state = CudaCheckpointState(state_str)
+                return CudaCheckpointProcess(pid=pid, state=state)
+
+        except subprocess.CalledProcessError:
+            # Command failed, which is expected for PIDs without CUDA sessions
+            pass
+        except subprocess.TimeoutExpired:
+            logger.debug(f"Timeout checking CUDA state for PID {pid}")
+        except Exception as e:
+            logger.debug(f"Error checking PID {pid}: {e}")
+
+        return None
+
+    def checkpoint(self) -> None:
+        """Checkpoint all CUDA processes, moving GPU memory to CPU."""
+        if not self.cuda_processes:
+            logger.debug("No CUDA processes to checkpoint.")
+            return
+
+        # Validate all states first
+        for proc in self.cuda_processes:
+            proc.refresh_state()  # Refresh state before validation
+            if proc.state != CudaCheckpointState.RUNNING:
+                raise CudaCheckpointException(
+                    f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.RUNNING.value} state. "
+                    f"Current state: {proc.state.value}"
+                )
+
+        # Moving state from GPU to CPU can take several seconds per CUDA session.
+        # Make a parallel call per CUDA session.
+        start = time.perf_counter()
+
+        def checkpoint_impl(proc: CudaCheckpointProcess) -> None:
+            proc.toggle(CudaCheckpointState.CHECKPOINTED)
+
+        with ThreadPoolExecutor() as executor:
+            futures = [executor.submit(checkpoint_impl, proc) for proc in self.cuda_processes]
+
+            # Wait for all futures and collect any exceptions
+            exceptions = []
+            for future in as_completed(futures):
+                try:
+                    future.result()
+                except Exception as e:
+                    exceptions.append(e)
+
+            if exceptions:
+                raise CudaCheckpointException(
+                    f"Failed to checkpoint {len(exceptions)} processes: {'; '.join(str(e) for e in exceptions)}"
+                )
+
+        elapsed = time.perf_counter() - start
+        logger.debug(f"Checkpointing {len(self.cuda_processes)} CUDA sessions took => {elapsed:.3f}s")
+
+    def restore(self) -> None:
+        """Restore all CUDA processes, moving memory back from CPU to GPU."""
+        if not self.cuda_processes:
+            logger.debug("No CUDA sessions to restore.")
+            return
+
+        # Validate all states first
+        for proc in self.cuda_processes:
+            proc.refresh_state()  # Refresh state before validation
+            if proc.state != CudaCheckpointState.CHECKPOINTED:
+                raise CudaCheckpointException(
+                    f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.CHECKPOINTED.value} state. "
+                    f"Current state: {proc.state.value}"
+                )
+
+        # See checkpoint() for rationale about parallelism.
+        start = time.perf_counter()
+
+        def restore_process(proc: CudaCheckpointProcess) -> None:
+            proc.toggle(CudaCheckpointState.RUNNING)
+
+        with ThreadPoolExecutor() as executor:
+            futures = [executor.submit(restore_process, proc) for proc in self.cuda_processes]
+
+            # Wait for all futures and collect any exceptions
+            exceptions = []
+            for future in as_completed(futures):
+                try:
+                    future.result()
+                except Exception as e:
+                    exceptions.append(e)
+
+            if exceptions:
+                raise CudaCheckpointException(
+                    f"Failed to restore {len(exceptions)} processes: {'; '.join(str(e) for e in exceptions)}"
+                )
+
+        elapsed = time.perf_counter() - start
+        logger.debug(f"Restoring {len(self.cuda_processes)} CUDA session(s) took => {elapsed:.3f}s")
+
+    def get_process_count(self) -> int:
+        """Get the number of CUDA processes managed by this session."""
+        return len(self.cuda_processes)
+
+    def get_process_states(self) -> List[tuple[int, CudaCheckpointState]]:
+        """Get current states of all managed processes."""
+        states = []
+        for proc in self.cuda_processes:
+            proc.refresh_state()
+            states.append((proc.pid, proc.state))
+        return states