modal 1.1.1.dev41__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (68)
  1. modal/__main__.py +1 -2
  2. modal/_container_entrypoint.py +18 -7
  3. modal/_functions.py +135 -13
  4. modal/_object.py +13 -2
  5. modal/_partial_function.py +8 -8
  6. modal/_runtime/asgi.py +3 -2
  7. modal/_runtime/container_io_manager.py +20 -14
  8. modal/_runtime/container_io_manager.pyi +38 -13
  9. modal/_runtime/execution_context.py +18 -2
  10. modal/_runtime/execution_context.pyi +4 -1
  11. modal/_runtime/gpu_memory_snapshot.py +158 -54
  12. modal/_utils/blob_utils.py +83 -24
  13. modal/_utils/function_utils.py +4 -3
  14. modal/_utils/time_utils.py +28 -4
  15. modal/app.py +8 -4
  16. modal/app.pyi +8 -8
  17. modal/cli/dict.py +14 -11
  18. modal/cli/entry_point.py +9 -3
  19. modal/cli/launch.py +102 -4
  20. modal/cli/profile.py +1 -0
  21. modal/cli/programs/launch_instance_ssh.py +94 -0
  22. modal/cli/programs/run_marimo.py +95 -0
  23. modal/cli/queues.py +49 -19
  24. modal/cli/secret.py +45 -18
  25. modal/cli/volume.py +14 -16
  26. modal/client.pyi +2 -10
  27. modal/cls.py +12 -2
  28. modal/cls.pyi +9 -1
  29. modal/config.py +7 -7
  30. modal/dict.py +206 -12
  31. modal/dict.pyi +358 -4
  32. modal/experimental/__init__.py +130 -0
  33. modal/file_io.py +1 -1
  34. modal/file_io.pyi +2 -2
  35. modal/file_pattern_matcher.py +25 -16
  36. modal/functions.pyi +111 -11
  37. modal/image.py +9 -3
  38. modal/image.pyi +7 -7
  39. modal/mount.py +20 -13
  40. modal/mount.pyi +16 -3
  41. modal/network_file_system.py +8 -2
  42. modal/object.pyi +3 -0
  43. modal/parallel_map.py +346 -101
  44. modal/parallel_map.pyi +108 -0
  45. modal/proxy.py +2 -1
  46. modal/queue.py +199 -9
  47. modal/queue.pyi +357 -3
  48. modal/sandbox.py +6 -5
  49. modal/sandbox.pyi +17 -14
  50. modal/secret.py +196 -3
  51. modal/secret.pyi +372 -0
  52. modal/volume.py +239 -23
  53. modal/volume.pyi +405 -10
  54. {modal-1.1.1.dev41.dist-info → modal-1.1.2.dist-info}/METADATA +2 -2
  55. {modal-1.1.1.dev41.dist-info → modal-1.1.2.dist-info}/RECORD +68 -66
  56. modal_docs/mdmd/mdmd.py +11 -1
  57. modal_proto/api.proto +37 -10
  58. modal_proto/api_grpc.py +32 -0
  59. modal_proto/api_pb2.py +627 -597
  60. modal_proto/api_pb2.pyi +107 -19
  61. modal_proto/api_pb2_grpc.py +67 -2
  62. modal_proto/api_pb2_grpc.pyi +24 -8
  63. modal_proto/modal_api_grpc.py +2 -0
  64. modal_version/__init__.py +1 -1
  65. {modal-1.1.1.dev41.dist-info → modal-1.1.2.dist-info}/WHEEL +0 -0
  66. {modal-1.1.1.dev41.dist-info → modal-1.1.2.dist-info}/entry_points.txt +0 -0
  67. {modal-1.1.1.dev41.dist-info → modal-1.1.2.dist-info}/licenses/LICENSE +0 -0
  68. {modal-1.1.1.dev41.dist-info → modal-1.1.2.dist-info}/top_level.txt +0 -0
modal/_runtime/container_io_manager.pyi

@@ -27,6 +27,7 @@ class IOContext:
     input_ids: list[str]
     retry_counts: list[int]
     function_call_ids: list[str]
+    attempt_tokens: list[str]
     function_inputs: list[modal_proto.api_pb2.FunctionInput]
     finalized_function: modal._runtime.user_code_imports.FinalizedFunction
     _cancel_issued: bool
@@ -37,6 +38,7 @@ class IOContext:
         input_ids: list[str],
         retry_counts: list[int],
         function_call_ids: list[str],
+        attempt_tokens: list[str],
         finalized_function: modal._runtime.user_code_imports.FinalizedFunction,
         function_inputs: list[modal_proto.api_pb2.FunctionInput],
         is_batched: bool,
@@ -50,7 +52,7 @@ class IOContext:
         cls,
         client: modal.client._Client,
         finalized_functions: dict[str, modal._runtime.user_code_imports.FinalizedFunction],
-        inputs: list[tuple[str, int, str, modal_proto.api_pb2.FunctionInput]],
+        inputs: list[tuple[str, int, str, str, modal_proto.api_pb2.FunctionInput]],
         is_batched: bool,
     ) -> IOContext: ...
     def set_cancel_callback(self, cb: collections.abc.Callable[[], None]): ...
@@ -133,12 +135,19 @@ class _ContainerIOManager:
     async def _dynamic_concurrency_loop(self): ...
     def serialize_data_format(self, obj: typing.Any, data_format: int) -> bytes: ...
     async def format_blob_data(self, data: bytes) -> dict[str, typing.Any]: ...
-    def get_data_in(self, function_call_id: str) -> collections.abc.AsyncIterator[typing.Any]:
+    def get_data_in(
+        self, function_call_id: str, attempt_token: typing.Optional[str]
+    ) -> collections.abc.AsyncIterator[typing.Any]:
         """Read from the `data_in` stream of a function call."""
         ...

     async def put_data_out(
-        self, function_call_id: str, start_index: int, data_format: int, serialized_messages: list[typing.Any]
+        self,
+        function_call_id: str,
+        attempt_token: str,
+        start_index: int,
+        data_format: int,
+        serialized_messages: list[typing.Any],
     ) -> None:
         """Put data onto the `data_out` stream of a function call.

@@ -149,7 +158,7 @@ class _ContainerIOManager:
         ...

     def generator_output_sender(
-        self, function_call_id: str, data_format: int, message_rx: asyncio.queues.Queue
+        self, function_call_id: str, attempt_token: str, data_format: int, message_rx: asyncio.queues.Queue
     ) -> typing.AsyncContextManager[None]:
         """Runs background task that feeds generator outputs into a function call's `data_out` stream."""
         ...
@@ -166,7 +175,7 @@ class _ContainerIOManager:
     def get_max_inputs_to_fetch(self): ...
     def _generate_inputs(
         self, batch_max_size: int, batch_wait_ms: int
-    ) -> collections.abc.AsyncIterator[list[tuple[str, int, str, modal_proto.api_pb2.FunctionInput]]]: ...
+    ) -> collections.abc.AsyncIterator[list[tuple[str, int, str, str, modal_proto.api_pb2.FunctionInput]]]: ...
     def run_inputs_outputs(
         self,
         finalized_functions: dict[str, modal._runtime.user_code_imports.FinalizedFunction],
@@ -332,11 +341,15 @@ class ContainerIOManager:
     format_blob_data: __format_blob_data_spec[typing_extensions.Self]

     class __get_data_in_spec(typing_extensions.Protocol[SUPERSELF]):
-        def __call__(self, /, function_call_id: str) -> typing.Iterator[typing.Any]:
+        def __call__(
+            self, /, function_call_id: str, attempt_token: typing.Optional[str]
+        ) -> typing.Iterator[typing.Any]:
             """Read from the `data_in` stream of a function call."""
             ...

-        def aio(self, /, function_call_id: str) -> collections.abc.AsyncIterator[typing.Any]:
+        def aio(
+            self, /, function_call_id: str, attempt_token: typing.Optional[str]
+        ) -> collections.abc.AsyncIterator[typing.Any]:
             """Read from the `data_in` stream of a function call."""
             ...

@@ -344,7 +357,13 @@ class ContainerIOManager:

     class __put_data_out_spec(typing_extensions.Protocol[SUPERSELF]):
         def __call__(
-            self, /, function_call_id: str, start_index: int, data_format: int, serialized_messages: list[typing.Any]
+            self,
+            /,
+            function_call_id: str,
+            attempt_token: str,
+            start_index: int,
+            data_format: int,
+            serialized_messages: list[typing.Any],
         ) -> None:
             """Put data onto the `data_out` stream of a function call.

@@ -355,7 +374,13 @@ class ContainerIOManager:
             ...

         async def aio(
-            self, /, function_call_id: str, start_index: int, data_format: int, serialized_messages: list[typing.Any]
+            self,
+            /,
+            function_call_id: str,
+            attempt_token: str,
+            start_index: int,
+            data_format: int,
+            serialized_messages: list[typing.Any],
         ) -> None:
             """Put data onto the `data_out` stream of a function call.

@@ -369,13 +394,13 @@ class ContainerIOManager:

     class __generator_output_sender_spec(typing_extensions.Protocol[SUPERSELF]):
         def __call__(
-            self, /, function_call_id: str, data_format: int, message_rx: asyncio.queues.Queue
+            self, /, function_call_id: str, attempt_token: str, data_format: int, message_rx: asyncio.queues.Queue
         ) -> synchronicity.combined_types.AsyncAndBlockingContextManager[None]:
             """Runs background task that feeds generator outputs into a function call's `data_out` stream."""
             ...

         def aio(
-            self, /, function_call_id: str, data_format: int, message_rx: asyncio.queues.Queue
+            self, /, function_call_id: str, attempt_token: str, data_format: int, message_rx: asyncio.queues.Queue
         ) -> typing.AsyncContextManager[None]:
             """Runs background task that feeds generator outputs into a function call's `data_out` stream."""
             ...
@@ -410,10 +435,10 @@ class ContainerIOManager:
     class ___generate_inputs_spec(typing_extensions.Protocol[SUPERSELF]):
         def __call__(
             self, /, batch_max_size: int, batch_wait_ms: int
-        ) -> typing.Iterator[list[tuple[str, int, str, modal_proto.api_pb2.FunctionInput]]]: ...
+        ) -> typing.Iterator[list[tuple[str, int, str, str, modal_proto.api_pb2.FunctionInput]]]: ...
         def aio(
             self, /, batch_max_size: int, batch_wait_ms: int
-        ) -> collections.abc.AsyncIterator[list[tuple[str, int, str, modal_proto.api_pb2.FunctionInput]]]: ...
+        ) -> collections.abc.AsyncIterator[list[tuple[str, int, str, str, modal_proto.api_pb2.FunctionInput]]]: ...

     _generate_inputs: ___generate_inputs_spec[typing_extensions.Self]

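Taken together, these stub changes widen each queued input from a 4-tuple to a 5-tuple and thread an attempt token through the data-plane helpers (`get_data_in`, `put_data_out`, `generator_output_sender`). A minimal sketch of the new shapes, using placeholder names rather than Modal's actual implementation:

from typing import Any, Optional

# Hypothetical alias: each input now carries an attempt token alongside its IDs.
InputItem = tuple[str, int, str, str, Any]  # (input_id, retry_count, function_call_id, attempt_token, function_input)

def split_item(item: InputItem) -> tuple[str, Optional[str]]:
    """Pull out the identifiers a data-plane call would need (illustration only)."""
    input_id, retry_count, function_call_id, attempt_token, function_input = item
    # Per the updated stubs, attempt_token is passed next to function_call_id
    # when reading data_in or writing data_out for that function call.
    return function_call_id, attempt_token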
modal/_runtime/execution_context.py

@@ -72,22 +72,38 @@ def current_function_call_id() -> Optional[str]:
     return None


-def _set_current_context_ids(input_ids: list[str], function_call_ids: list[str]) -> Callable[[], None]:
-    assert len(input_ids) == len(function_call_ids) and len(input_ids) > 0
+def current_attempt_token() -> Optional[str]:
+    # This ContextVar isn't useful to expose to users.
+    try:
+        return _current_attempt_token.get()
+    except LookupError:
+        return None
+
+
+def _set_current_context_ids(
+    input_ids: list[str], function_call_ids: list[str], attempt_tokens: list[str]
+) -> Callable[[], None]:
+    assert len(input_ids) == len(function_call_ids) == len(attempt_tokens) and input_ids
+
     input_id = input_ids[0]
     function_call_id = function_call_ids[0]
+    attempt_token = attempt_tokens[0]
+
     input_token = _current_input_id.set(input_id)
     function_call_token = _current_function_call_id.set(function_call_id)
+    attempt_token_token = _current_attempt_token.set(attempt_token)

     def _reset_current_context_ids():
         _current_input_id.reset(input_token)
         _current_function_call_id.reset(function_call_token)
+        _current_attempt_token.reset(attempt_token_token)

     return _reset_current_context_ids


 _current_input_id: ContextVar = ContextVar("_current_input_id")
 _current_function_call_id: ContextVar = ContextVar("_current_function_call_id")
+_current_attempt_token: ContextVar = ContextVar("_current_attempt_token")

 _is_currently_importing = False  # we set this to True while a container is importing user code

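The new `_current_attempt_token` follows the same ContextVar discipline as the existing input and function-call IDs: `set()` returns a token, and the returned callback `reset()`s exactly that assignment. A standalone sketch of the pattern, with illustrative names rather than Modal's module:

from contextvars import ContextVar
from typing import Callable, Optional

_attempt_token: ContextVar[str] = ContextVar("_attempt_token")

def set_attempt_token(value: str) -> Callable[[], None]:
    token = _attempt_token.set(value)           # remember the reset token
    return lambda: _attempt_token.reset(token)  # undoes exactly this set()

def current_attempt_token() -> Optional[str]:
    try:
        return _attempt_token.get()
    except LookupError:  # nothing set in this context yet
        return None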
modal/_runtime/execution_context.pyi

@@ -68,11 +68,14 @@ def current_function_call_id() -> typing.Optional[str]:
     """
     ...

+def current_attempt_token() -> typing.Optional[str]: ...
 def _set_current_context_ids(
-    input_ids: list[str], function_call_ids: list[str]
+    input_ids: list[str], function_call_ids: list[str], attempt_tokens: list[str]
 ) -> collections.abc.Callable[[], None]: ...
 def _import_context(): ...

 _current_input_id: contextvars.ContextVar

 _current_function_call_id: contextvars.ContextVar
+
+_current_attempt_token: contextvars.ContextVar
modal/_runtime/gpu_memory_snapshot.py

@@ -1,17 +1,18 @@
 # Copyright Modal Labs 2022
 #
 # This module provides a simple interface for creating GPU memory snapshots,
-# provising a convenient interface to `cuda-checkpoint` [1]. This is intended
+# providing a convenient interface to `cuda-checkpoint` [1]. This is intended
 # to be used in conjunction with memory snapshots.
 #
 # [1] https://github.com/NVIDIA/cuda-checkpoint

 import subprocess
 import time
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
+from typing import List, Optional

 from modal.config import config, logger

@@ -19,7 +20,9 @@ CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")


 class CudaCheckpointState(Enum):
-    """State representation from the CUDA API: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc96cdda177a2b8c296144567cbea4f23"""
+    """State representation from the CUDA API [1].
+
+    [1] https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html"""

     RUNNING = "running"
     LOCKED = "locked"
@@ -28,6 +31,8 @@ class CudaCheckpointState(Enum):


 class CudaCheckpointException(Exception):
+    """Exception raised for CUDA checkpoint operations."""
+
     pass


@@ -39,16 +44,31 @@ class CudaCheckpointProcess:
     pid: int
     state: CudaCheckpointState

-    def toggle(self, target_state: CudaCheckpointState, timeout_secs: float = 5 * 60.0):
+    def toggle(self, target_state: CudaCheckpointState, timeout_secs: float = 5 * 60.0) -> None:
         """Toggle CUDA checkpoint state for current process, moving GPU memory to the
-        CPU and back depending on the current process state when called."""
+        CPU and back depending on the current process state when called.
+        """
         logger.debug(f"PID: {self.pid} Toggling CUDA checkpoint state to {target_state.value}")

         start_time = time.monotonic()
+        retry_count = 0
+        max_retries = 3

         while self._should_continue_toggle(target_state, start_time, timeout_secs):
-            self._execute_toggle_command()
-            time.sleep(0.1)
+            try:
+                self._execute_toggle_command()
+                # Use exponential backoff for retries
+                sleep_time = min(0.1 * (2**retry_count), 1.0)
+                time.sleep(sleep_time)
+                retry_count = 0
+            except CudaCheckpointException as e:
+                retry_count += 1
+                if retry_count >= max_retries:
+                    raise CudaCheckpointException(
+                        f"PID: {self.pid} Failed to toggle state after {max_retries} retries: {e}"
+                    )
+                logger.debug(f"PID: {self.pid} Retry {retry_count}/{max_retries} after error: {e}")
+                time.sleep(0.5 * retry_count)

         logger.debug(f"PID: {self.pid} Target state {target_state.value} reached")

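Instead of a flat 0.1 s sleep between cuda-checkpoint invocations, `toggle()` now sleeps with capped exponential backoff and raises after three consecutive failures. A simplified standalone sketch of that retry shape (the operation is a placeholder and the bookkeeping is condensed relative to the loop above):

import time
from typing import Callable

def call_with_backoff(operation: Callable[[], None], max_retries: int = 3) -> None:
    """Retry operation, sleeping min(0.1 * 2**n, 1.0) seconds between attempts."""
    failures = 0
    while True:
        try:
            operation()
            return
        except Exception:
            failures += 1
            if failures >= max_retries:
                raise  # give up after max_retries consecutive failures
            time.sleep(min(0.1 * (2 ** failures), 1.0))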
@@ -73,19 +93,25 @@ class CudaCheckpointProcess:

         return True

-    def _execute_toggle_command(self):
+    def _execute_toggle_command(self) -> None:
         """Execute the cuda-checkpoint toggle command."""
         try:
-            subprocess.run(
+            _ = subprocess.run(
                 [CUDA_CHECKPOINT_PATH, "--toggle", "--pid", str(self.pid)],
                 check=True,
                 capture_output=True,
                 text=True,
+                timeout=30,
             )
             logger.debug(f"PID: {self.pid} Successfully toggled CUDA checkpoint state")
         except subprocess.CalledProcessError as e:
-            logger.debug(f"PID: {self.pid} Failed to toggle CUDA checkpoint state: {e.stderr}")
-            raise CudaCheckpointException(e.stderr)
+            error_msg = f"PID: {self.pid} Failed to toggle CUDA checkpoint state: {e.stderr}"
+            logger.debug(error_msg)
+            raise CudaCheckpointException(error_msg)
+        except subprocess.TimeoutExpired:
+            error_msg = f"PID: {self.pid} Toggle command timed out"
+            logger.debug(error_msg)
+            raise CudaCheckpointException(error_msg)

     def refresh_state(self) -> None:
         """Refreshes the current CUDA checkpoint state for this process."""
@@ -95,15 +121,20 @@ class CudaCheckpointProcess:
                 check=True,
                 capture_output=True,
                 text=True,
-                timeout=5,
+                timeout=10,
             )

             state_str = result.stdout.strip().lower()
             self.state = CudaCheckpointState(state_str)

         except subprocess.CalledProcessError as e:
-            logger.debug(f"PID: {self.pid} Failed to get CUDA checkpoint state: {e.stderr}")
-            raise CudaCheckpointException(e.stderr)
+            error_msg = f"PID: {self.pid} Failed to get CUDA checkpoint state: {e.stderr}"
+            logger.debug(error_msg)
+            raise CudaCheckpointException(error_msg)
+        except subprocess.TimeoutExpired:
+            error_msg = f"PID: {self.pid} Get state command timed out"
+            logger.debug(error_msg)
+            raise CudaCheckpointException(error_msg)


 class CudaCheckpointSession:
@@ -111,12 +142,17 @@ class CudaCheckpointSession:

     def __init__(self):
         self.cuda_processes = self._get_cuda_pids()
-        logger.debug(f"PIDs with CUDA sessions: {[c.pid for c in self.cuda_processes]}")
+        if self.cuda_processes:
+            logger.debug(
+                f"Found {len(self.cuda_processes)} PID(s) with CUDA sessions: {[c.pid for c in self.cuda_processes]}"
+            )
+        else:
+            logger.debug("No CUDA sessions found.")

-    def _get_cuda_pids(self) -> list[CudaCheckpointProcess]:
+    def _get_cuda_pids(self) -> List[CudaCheckpointProcess]:
         """Iterates over all PIDs and identifies the ones that have running
         CUDA sessions."""
-        cuda_pids: list[CudaCheckpointProcess] = []
+        cuda_pids: List[CudaCheckpointProcess] = []

         # Get all active process IDs from /proc directory
         proc_dir = Path("/proc")
@@ -125,75 +161,143 @@ class CudaCheckpointSession:
                 "OS does not have /proc path rendering it incompatible with GPU memory snapshots."
             )

-        for entry in proc_dir.iterdir():
-            if not entry.name.isdigit():
-                continue
-
-            pid = int(entry.name)
-            try:
-                # Call cuda-checkpoint to check if this PID has a CUDA session
-                result = subprocess.run(
-                    [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
-                    capture_output=True,
-                    text=True,
-                    timeout=10,
-                )
-
-                # If the command succeeds (return code 0), this PID has a CUDA session
-                if result.returncode == 0:
-                    state_str = result.stdout.strip().lower()
-                    state = CudaCheckpointState(state_str)
-
-                    cuda_checkpoint_process = CudaCheckpointProcess(pid=pid, state=state)
-                    cuda_pids.append(cuda_checkpoint_process)
+        # Get all numeric directories (PIDs) from /proc
+        pid_dirs = [entry for entry in proc_dir.iterdir() if entry.name.isdigit()]

-            # Command failed, which is expected for PIDs without CUDA sessions
-            except subprocess.CalledProcessError:
-                continue
+        # Use ThreadPoolExecutor to check PIDs in parallel for better performance
+        with ThreadPoolExecutor(max_workers=min(50, len(pid_dirs))) as executor:
+            future_to_pid = {
+                executor.submit(self._check_cuda_session, int(entry.name)): int(entry.name) for entry in pid_dirs
+            }

-            # Raise other exceptions
-            except subprocess.TimeoutExpired:
-                raise CudaCheckpointException(f"Failed to get CUDA state for PID {pid}")
-            except Exception as e:
-                raise CudaCheckpointException(e)
+            for future in as_completed(future_to_pid):
+                pid = future_to_pid[future]
+                try:
+                    cuda_process = future.result()
+                    if cuda_process:
+                        cuda_pids.append(cuda_process)
+                except Exception as e:
+                    logger.debug(f"Error checking PID {pid}: {e}")

         # Sort PIDs for ordered checkpointing
         cuda_pids.sort(key=lambda x: x.pid)
         return cuda_pids

+    def _check_cuda_session(self, pid: int) -> Optional[CudaCheckpointProcess]:
+        """Check if a specific PID has a CUDA session."""
+        try:
+            result = subprocess.run(
+                [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
+                capture_output=True,
+                text=True,
+                timeout=5,
+            )
+
+            # If the command succeeds (return code 0), this PID has a CUDA session
+            if result.returncode == 0:
+                state_str = result.stdout.strip().lower()
+                state = CudaCheckpointState(state_str)
+                return CudaCheckpointProcess(pid=pid, state=state)
+
+        except subprocess.CalledProcessError:
+            # Command failed, which is expected for PIDs without CUDA sessions
+            pass
+        except subprocess.TimeoutExpired:
+            logger.debug(f"Timeout checking CUDA state for PID {pid}")
+        except Exception as e:
+            logger.debug(f"Error checking PID {pid}: {e}")
+
+        return None
+
     def checkpoint(self) -> None:
+        """Checkpoint all CUDA processes, moving GPU memory to CPU."""
+        if not self.cuda_processes:
+            logger.debug("No CUDA processes to checkpoint.")
+            return
+
         # Validate all states first
         for proc in self.cuda_processes:
+            proc.refresh_state()  # Refresh state before validation
             if proc.state != CudaCheckpointState.RUNNING:
-                raise CudaCheckpointException(f"CUDA session not in {CudaCheckpointState.RUNNING} state.")
+                raise CudaCheckpointException(
+                    f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.RUNNING.value} state. "
+                    f"Current state: {proc.state.value}"
+                )

         # Moving state from GPU to CPU can take several seconds per CUDA session.
         # Make a parallel call per CUDA session.
         start = time.perf_counter()

-        def checkpoint_impl(proc: CudaCheckpointProcess):
+        def checkpoint_impl(proc: CudaCheckpointProcess) -> None:
             proc.toggle(CudaCheckpointState.CHECKPOINTED)

         with ThreadPoolExecutor() as executor:
-            list(executor.map(checkpoint_impl, self.cuda_processes))
+            futures = [executor.submit(checkpoint_impl, proc) for proc in self.cuda_processes]
+
+            # Wait for all futures and collect any exceptions
+            exceptions = []
+            for future in as_completed(futures):
+                try:
+                    future.result()
+                except Exception as e:
+                    exceptions.append(e)
+
+            if exceptions:
+                raise CudaCheckpointException(
+                    f"Failed to checkpoint {len(exceptions)} processes: {'; '.join(str(e) for e in exceptions)}"
+                )

         elapsed = time.perf_counter() - start
-        logger.debug(f"Checkpointing CUDA sessions took => {elapsed:.3f}s")
+        logger.debug(f"Checkpointing {len(self.cuda_processes)} CUDA sessions took => {elapsed:.3f}s")

     def restore(self) -> None:
+        """Restore all CUDA processes, moving memory back from CPU to GPU."""
+        if not self.cuda_processes:
+            logger.debug("No CUDA sessions to restore.")
+            return
+
         # Validate all states first
         for proc in self.cuda_processes:
+            proc.refresh_state()  # Refresh state before validation
             if proc.state != CudaCheckpointState.CHECKPOINTED:
-                raise CudaCheckpointException(f"CUDA session not in {CudaCheckpointState.CHECKPOINTED} state.")
+                raise CudaCheckpointException(
+                    f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.CHECKPOINTED.value} state. "
+                    f"Current state: {proc.state.value}"
+                )

         # See checkpoint() for rationale about parallelism.
         start = time.perf_counter()

-        def restore_process(proc: CudaCheckpointProcess):
+        def restore_process(proc: CudaCheckpointProcess) -> None:
             proc.toggle(CudaCheckpointState.RUNNING)

         with ThreadPoolExecutor() as executor:
-            list(executor.map(restore_process, self.cuda_processes))
+            futures = [executor.submit(restore_process, proc) for proc in self.cuda_processes]
+
+            # Wait for all futures and collect any exceptions
+            exceptions = []
+            for future in as_completed(futures):
+                try:
+                    future.result()
+                except Exception as e:
+                    exceptions.append(e)
+
+            if exceptions:
+                raise CudaCheckpointException(
+                    f"Failed to restore {len(exceptions)} processes: {'; '.join(str(e) for e in exceptions)}"
+                )

         elapsed = time.perf_counter() - start
-        logger.debug(f"Restoring CUDA sessions took => {elapsed:.3f}s")
+        logger.debug(f"Restoring {len(self.cuda_processes)} CUDA session(s) took => {elapsed:.3f}s")
+
+    def get_process_count(self) -> int:
+        """Get the number of CUDA processes managed by this session."""
+        return len(self.cuda_processes)
+
+    def get_process_states(self) -> List[tuple[int, CudaCheckpointState]]:
+        """Get current states of all managed processes."""
+        states = []
+        for proc in self.cuda_processes:
+            proc.refresh_state()
+            states.append((proc.pid, proc.state))
+        return states
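Both `checkpoint()` and `restore()` now submit one task per CUDA process and gather failures via `as_completed()` instead of letting `executor.map()` stop at the first exception, so every process is attempted and all errors are reported together. A standalone sketch of that collect-then-raise pattern (the worker and items are placeholders, not Modal code):

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Iterable, TypeVar

T = TypeVar("T")

def run_all_or_report(worker: Callable[[T], None], items: Iterable[T]) -> None:
    """Run worker over every item in parallel; raise one summary error at the end."""
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(worker, item) for item in items]
        errors = []
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                errors.append(exc)
    if errors:
        raise RuntimeError(f"{len(errors)} task(s) failed: " + "; ".join(str(e) for e in errors))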