modal 1.0.6.dev58__py3-none-any.whl → 1.2.3.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of modal might be problematic.

Files changed (147)
  1. modal/__main__.py +3 -4
  2. modal/_billing.py +80 -0
  3. modal/_clustered_functions.py +7 -3
  4. modal/_clustered_functions.pyi +4 -2
  5. modal/_container_entrypoint.py +41 -49
  6. modal/_functions.py +424 -195
  7. modal/_grpc_client.py +171 -0
  8. modal/_load_context.py +105 -0
  9. modal/_object.py +68 -20
  10. modal/_output.py +58 -45
  11. modal/_partial_function.py +36 -11
  12. modal/_pty.py +7 -3
  13. modal/_resolver.py +21 -35
  14. modal/_runtime/asgi.py +4 -3
  15. modal/_runtime/container_io_manager.py +301 -186
  16. modal/_runtime/container_io_manager.pyi +70 -61
  17. modal/_runtime/execution_context.py +18 -2
  18. modal/_runtime/execution_context.pyi +4 -1
  19. modal/_runtime/gpu_memory_snapshot.py +170 -63
  20. modal/_runtime/user_code_imports.py +28 -58
  21. modal/_serialization.py +57 -1
  22. modal/_utils/async_utils.py +33 -12
  23. modal/_utils/auth_token_manager.py +2 -5
  24. modal/_utils/blob_utils.py +110 -53
  25. modal/_utils/function_utils.py +49 -42
  26. modal/_utils/grpc_utils.py +80 -50
  27. modal/_utils/mount_utils.py +26 -1
  28. modal/_utils/name_utils.py +17 -3
  29. modal/_utils/task_command_router_client.py +536 -0
  30. modal/_utils/time_utils.py +34 -6
  31. modal/app.py +219 -83
  32. modal/app.pyi +229 -56
  33. modal/billing.py +5 -0
  34. modal/{requirements → builder}/2025.06.txt +1 -0
  35. modal/{requirements → builder}/PREVIEW.txt +1 -0
  36. modal/cli/_download.py +19 -3
  37. modal/cli/_traceback.py +3 -2
  38. modal/cli/app.py +4 -4
  39. modal/cli/cluster.py +15 -7
  40. modal/cli/config.py +5 -3
  41. modal/cli/container.py +7 -6
  42. modal/cli/dict.py +22 -16
  43. modal/cli/entry_point.py +12 -5
  44. modal/cli/environment.py +5 -4
  45. modal/cli/import_refs.py +3 -3
  46. modal/cli/launch.py +102 -5
  47. modal/cli/network_file_system.py +9 -13
  48. modal/cli/profile.py +3 -2
  49. modal/cli/programs/launch_instance_ssh.py +94 -0
  50. modal/cli/programs/run_jupyter.py +1 -1
  51. modal/cli/programs/run_marimo.py +95 -0
  52. modal/cli/programs/vscode.py +1 -1
  53. modal/cli/queues.py +57 -26
  54. modal/cli/run.py +58 -16
  55. modal/cli/secret.py +48 -22
  56. modal/cli/utils.py +3 -4
  57. modal/cli/volume.py +28 -25
  58. modal/client.py +13 -116
  59. modal/client.pyi +9 -91
  60. modal/cloud_bucket_mount.py +5 -3
  61. modal/cloud_bucket_mount.pyi +5 -1
  62. modal/cls.py +130 -102
  63. modal/cls.pyi +45 -85
  64. modal/config.py +29 -10
  65. modal/container_process.py +291 -13
  66. modal/container_process.pyi +95 -32
  67. modal/dict.py +282 -63
  68. modal/dict.pyi +423 -73
  69. modal/environments.py +15 -27
  70. modal/environments.pyi +5 -15
  71. modal/exception.py +8 -0
  72. modal/experimental/__init__.py +143 -38
  73. modal/experimental/flash.py +247 -78
  74. modal/experimental/flash.pyi +137 -9
  75. modal/file_io.py +14 -28
  76. modal/file_io.pyi +2 -2
  77. modal/file_pattern_matcher.py +25 -16
  78. modal/functions.pyi +134 -61
  79. modal/image.py +255 -86
  80. modal/image.pyi +300 -62
  81. modal/io_streams.py +436 -126
  82. modal/io_streams.pyi +236 -171
  83. modal/mount.py +62 -157
  84. modal/mount.pyi +45 -172
  85. modal/network_file_system.py +30 -53
  86. modal/network_file_system.pyi +16 -76
  87. modal/object.pyi +42 -8
  88. modal/parallel_map.py +821 -113
  89. modal/parallel_map.pyi +134 -0
  90. modal/partial_function.pyi +4 -1
  91. modal/proxy.py +16 -7
  92. modal/proxy.pyi +10 -2
  93. modal/queue.py +263 -61
  94. modal/queue.pyi +409 -66
  95. modal/runner.py +112 -92
  96. modal/runner.pyi +45 -27
  97. modal/sandbox.py +451 -124
  98. modal/sandbox.pyi +513 -67
  99. modal/secret.py +291 -67
  100. modal/secret.pyi +425 -19
  101. modal/serving.py +7 -11
  102. modal/serving.pyi +7 -8
  103. modal/snapshot.py +11 -8
  104. modal/token_flow.py +4 -4
  105. modal/volume.py +344 -98
  106. modal/volume.pyi +464 -68
  107. {modal-1.0.6.dev58.dist-info → modal-1.2.3.dev7.dist-info}/METADATA +9 -8
  108. modal-1.2.3.dev7.dist-info/RECORD +195 -0
  109. modal_docs/mdmd/mdmd.py +11 -1
  110. modal_proto/api.proto +399 -67
  111. modal_proto/api_grpc.py +241 -1
  112. modal_proto/api_pb2.py +1395 -1000
  113. modal_proto/api_pb2.pyi +1239 -79
  114. modal_proto/api_pb2_grpc.py +499 -4
  115. modal_proto/api_pb2_grpc.pyi +162 -14
  116. modal_proto/modal_api_grpc.py +175 -160
  117. modal_proto/sandbox_router.proto +145 -0
  118. modal_proto/sandbox_router_grpc.py +105 -0
  119. modal_proto/sandbox_router_pb2.py +149 -0
  120. modal_proto/sandbox_router_pb2.pyi +333 -0
  121. modal_proto/sandbox_router_pb2_grpc.py +203 -0
  122. modal_proto/sandbox_router_pb2_grpc.pyi +75 -0
  123. modal_proto/task_command_router.proto +144 -0
  124. modal_proto/task_command_router_grpc.py +105 -0
  125. modal_proto/task_command_router_pb2.py +149 -0
  126. modal_proto/task_command_router_pb2.pyi +333 -0
  127. modal_proto/task_command_router_pb2_grpc.py +203 -0
  128. modal_proto/task_command_router_pb2_grpc.pyi +75 -0
  129. modal_version/__init__.py +1 -1
  130. modal-1.0.6.dev58.dist-info/RECORD +0 -183
  131. modal_proto/modal_options_grpc.py +0 -3
  132. modal_proto/options.proto +0 -19
  133. modal_proto/options_grpc.py +0 -3
  134. modal_proto/options_pb2.py +0 -35
  135. modal_proto/options_pb2.pyi +0 -20
  136. modal_proto/options_pb2_grpc.py +0 -4
  137. modal_proto/options_pb2_grpc.pyi +0 -7
  138. /modal/{requirements → builder}/2023.12.312.txt +0 -0
  139. /modal/{requirements → builder}/2023.12.txt +0 -0
  140. /modal/{requirements → builder}/2024.04.txt +0 -0
  141. /modal/{requirements → builder}/2024.10.txt +0 -0
  142. /modal/{requirements → builder}/README.md +0 -0
  143. /modal/{requirements → builder}/base-images.json +0 -0
  144. {modal-1.0.6.dev58.dist-info → modal-1.2.3.dev7.dist-info}/WHEEL +0 -0
  145. {modal-1.0.6.dev58.dist-info → modal-1.2.3.dev7.dist-info}/entry_points.txt +0 -0
  146. {modal-1.0.6.dev58.dist-info → modal-1.2.3.dev7.dist-info}/licenses/LICENSE +0 -0
  147. {modal-1.0.6.dev58.dist-info → modal-1.2.3.dev7.dist-info}/top_level.txt +0 -0
modal/_runtime/container_io_manager.pyi

@@ -27,6 +27,7 @@ class IOContext:
  input_ids: list[str]
  retry_counts: list[int]
  function_call_ids: list[str]
+ attempt_tokens: list[str]
  function_inputs: list[modal_proto.api_pb2.FunctionInput]
  finalized_function: modal._runtime.user_code_imports.FinalizedFunction
  _cancel_issued: bool
@@ -37,6 +38,7 @@ class IOContext:
  input_ids: list[str],
  retry_counts: list[int],
  function_call_ids: list[str],
+ attempt_tokens: list[str],
  finalized_function: modal._runtime.user_code_imports.FinalizedFunction,
  function_inputs: list[modal_proto.api_pb2.FunctionInput],
  is_batched: bool,
@@ -50,14 +52,29 @@ class IOContext:
  cls,
  client: modal.client._Client,
  finalized_functions: dict[str, modal._runtime.user_code_imports.FinalizedFunction],
- inputs: list[tuple[str, int, str, modal_proto.api_pb2.FunctionInput]],
+ inputs: list[tuple[str, int, str, str, modal_proto.api_pb2.FunctionInput]],
  is_batched: bool,
  ) -> IOContext: ...
  def set_cancel_callback(self, cb: collections.abc.Callable[[], None]): ...
  def cancel(self): ...
  def _args_and_kwargs(self) -> tuple[tuple[typing.Any, ...], dict[str, list[typing.Any]]]: ...
- def call_finalized_function(self) -> typing.Any: ...
- def validate_output_data(self, data: typing.Any) -> list[typing.Any]: ...
+ def _generator_output_format(self) -> int: ...
+ def _prepare_batch_output(self, data: typing.Any) -> list[typing.Any]: ...
+ def call_function_sync(self) -> list[typing.Any]: ...
+ async def call_function_async(self) -> list[typing.Any]: ...
+ def call_generator_sync(self) -> typing.Generator[typing.Any, None, None]: ...
+ def call_generator_async(self) -> collections.abc.AsyncGenerator[typing.Any, None]: ...
+ async def output_items_cancellation(self, started_at: float): ...
+ def _determine_output_format(self, input_format: int) -> int: ...
+ async def output_items_exception(
+ self, started_at: float, task_id: str, exc: BaseException
+ ) -> list[modal_proto.api_pb2.FunctionPutOutputsItem]: ...
+ def output_items_generator_done(
+ self, started_at: float, items_total: int
+ ) -> list[modal_proto.api_pb2.FunctionPutOutputsItem]: ...
+ async def output_items(
+ self, started_at: float, data: list[typing.Any]
+ ) -> list[modal_proto.api_pb2.FunctionPutOutputsItem]: ...

  class InputSlots:
  """A semaphore that allows dynamically adjusting the concurrency."""
@@ -131,14 +148,19 @@ class _ContainerIOManager:
  def stop_heartbeat(self): ...
  def dynamic_concurrency_manager(self) -> typing.AsyncContextManager[None]: ...
  async def _dynamic_concurrency_loop(self): ...
- def serialize_data_format(self, obj: typing.Any, data_format: int) -> bytes: ...
- async def format_blob_data(self, data: bytes) -> dict[str, typing.Any]: ...
- def get_data_in(self, function_call_id: str) -> collections.abc.AsyncIterator[typing.Any]:
+ def get_data_in(
+ self, function_call_id: str, attempt_token: typing.Optional[str]
+ ) -> collections.abc.AsyncIterator[typing.Any]:
  """Read from the `data_in` stream of a function call."""
  ...

  async def put_data_out(
- self, function_call_id: str, start_index: int, data_format: int, serialized_messages: list[typing.Any]
+ self,
+ function_call_id: str,
+ attempt_token: str,
+ start_index: int,
+ data_format: int,
+ serialized_messages: list[typing.Any],
  ) -> None:
  """Put data onto the `data_out` stream of a function call.

@@ -149,7 +171,7 @@ class _ContainerIOManager:
  ...

  def generator_output_sender(
- self, function_call_id: str, data_format: int, message_rx: asyncio.queues.Queue
+ self, function_call_id: str, attempt_token: str, data_format: int, message_rx: asyncio.queues.Queue
  ) -> typing.AsyncContextManager[None]:
  """Runs background task that feeds generator outputs into a function call's `data_out` stream."""
  ...
@@ -166,22 +188,17 @@ class _ContainerIOManager:
  def get_max_inputs_to_fetch(self): ...
  def _generate_inputs(
  self, batch_max_size: int, batch_wait_ms: int
- ) -> collections.abc.AsyncIterator[list[tuple[str, int, str, modal_proto.api_pb2.FunctionInput]]]: ...
+ ) -> collections.abc.AsyncIterator[list[tuple[str, int, str, str, modal_proto.api_pb2.FunctionInput]]]: ...
  def run_inputs_outputs(
  self,
  finalized_functions: dict[str, modal._runtime.user_code_imports.FinalizedFunction],
  batch_max_size: int = 0,
  batch_wait_ms: int = 0,
  ) -> collections.abc.AsyncIterator[IOContext]: ...
- async def _push_outputs(
- self,
- io_context: IOContext,
- started_at: float,
- data_format: int,
- results: list[modal_proto.api_pb2.GenericResult],
- ) -> None: ...
- def serialize_exception(self, exc: BaseException) -> bytes: ...
- def serialize_traceback(self, exc: BaseException) -> tuple[typing.Optional[bytes], typing.Optional[bytes]]: ...
+ async def _send_outputs(self, started_at: float, outputs: list[modal_proto.api_pb2.FunctionPutOutputsItem]) -> None:
+ """Send pre-built output items with retry and chunking."""
+ ...
+
  def handle_user_exception(self) -> typing.AsyncContextManager[None]:
  """Sets the task as failed in a way where it's not retried.

@@ -195,9 +212,7 @@ class _ContainerIOManager:
  ...

  def exit_context(self, started_at, input_ids: list[str]): ...
- async def push_outputs(
- self, io_context: IOContext, started_at: float, data: typing.Any, data_format: int
- ) -> None: ...
+ async def push_outputs(self, io_context: IOContext, started_at: float, output_data: list[typing.Any]) -> None: ...
  async def memory_restore(self) -> None: ...
  async def memory_snapshot(self) -> None:
  """Message server indicating that function is ready to be checkpointed."""
@@ -323,20 +338,16 @@ class ContainerIOManager:

  _dynamic_concurrency_loop: ___dynamic_concurrency_loop_spec[typing_extensions.Self]

- def serialize_data_format(self, obj: typing.Any, data_format: int) -> bytes: ...
-
- class __format_blob_data_spec(typing_extensions.Protocol[SUPERSELF]):
- def __call__(self, /, data: bytes) -> dict[str, typing.Any]: ...
- async def aio(self, /, data: bytes) -> dict[str, typing.Any]: ...
-
- format_blob_data: __format_blob_data_spec[typing_extensions.Self]
-
  class __get_data_in_spec(typing_extensions.Protocol[SUPERSELF]):
- def __call__(self, /, function_call_id: str) -> typing.Iterator[typing.Any]:
+ def __call__(
+ self, /, function_call_id: str, attempt_token: typing.Optional[str]
+ ) -> typing.Iterator[typing.Any]:
  """Read from the `data_in` stream of a function call."""
  ...

- def aio(self, /, function_call_id: str) -> collections.abc.AsyncIterator[typing.Any]:
+ def aio(
+ self, /, function_call_id: str, attempt_token: typing.Optional[str]
+ ) -> collections.abc.AsyncIterator[typing.Any]:
  """Read from the `data_in` stream of a function call."""
  ...

@@ -344,7 +355,13 @@ class ContainerIOManager:

  class __put_data_out_spec(typing_extensions.Protocol[SUPERSELF]):
  def __call__(
- self, /, function_call_id: str, start_index: int, data_format: int, serialized_messages: list[typing.Any]
+ self,
+ /,
+ function_call_id: str,
+ attempt_token: str,
+ start_index: int,
+ data_format: int,
+ serialized_messages: list[typing.Any],
  ) -> None:
  """Put data onto the `data_out` stream of a function call.

@@ -355,7 +372,13 @@ class ContainerIOManager:
  ...

  async def aio(
- self, /, function_call_id: str, start_index: int, data_format: int, serialized_messages: list[typing.Any]
+ self,
+ /,
+ function_call_id: str,
+ attempt_token: str,
+ start_index: int,
+ data_format: int,
+ serialized_messages: list[typing.Any],
  ) -> None:
  """Put data onto the `data_out` stream of a function call.

@@ -369,13 +392,13 @@ class ContainerIOManager:

  class __generator_output_sender_spec(typing_extensions.Protocol[SUPERSELF]):
  def __call__(
- self, /, function_call_id: str, data_format: int, message_rx: asyncio.queues.Queue
+ self, /, function_call_id: str, attempt_token: str, data_format: int, message_rx: asyncio.queues.Queue
  ) -> synchronicity.combined_types.AsyncAndBlockingContextManager[None]:
  """Runs background task that feeds generator outputs into a function call's `data_out` stream."""
  ...

  def aio(
- self, /, function_call_id: str, data_format: int, message_rx: asyncio.queues.Queue
+ self, /, function_call_id: str, attempt_token: str, data_format: int, message_rx: asyncio.queues.Queue
  ) -> typing.AsyncContextManager[None]:
  """Runs background task that feeds generator outputs into a function call's `data_out` stream."""
  ...
@@ -410,10 +433,10 @@ class ContainerIOManager:
  class ___generate_inputs_spec(typing_extensions.Protocol[SUPERSELF]):
  def __call__(
  self, /, batch_max_size: int, batch_wait_ms: int
- ) -> typing.Iterator[list[tuple[str, int, str, modal_proto.api_pb2.FunctionInput]]]: ...
+ ) -> typing.Iterator[list[tuple[str, int, str, str, modal_proto.api_pb2.FunctionInput]]]: ...
  def aio(
  self, /, batch_max_size: int, batch_wait_ms: int
- ) -> collections.abc.AsyncIterator[list[tuple[str, int, str, modal_proto.api_pb2.FunctionInput]]]: ...
+ ) -> collections.abc.AsyncIterator[list[tuple[str, int, str, str, modal_proto.api_pb2.FunctionInput]]]: ...

  _generate_inputs: ___generate_inputs_spec[typing_extensions.Self]

@@ -435,28 +458,16 @@ class ContainerIOManager:

  run_inputs_outputs: __run_inputs_outputs_spec[typing_extensions.Self]

- class ___push_outputs_spec(typing_extensions.Protocol[SUPERSELF]):
- def __call__(
- self,
- /,
- io_context: IOContext,
- started_at: float,
- data_format: int,
- results: list[modal_proto.api_pb2.GenericResult],
- ) -> None: ...
- async def aio(
- self,
- /,
- io_context: IOContext,
- started_at: float,
- data_format: int,
- results: list[modal_proto.api_pb2.GenericResult],
- ) -> None: ...
+ class ___send_outputs_spec(typing_extensions.Protocol[SUPERSELF]):
+ def __call__(self, /, started_at: float, outputs: list[modal_proto.api_pb2.FunctionPutOutputsItem]) -> None:
+ """Send pre-built output items with retry and chunking."""
+ ...

- _push_outputs: ___push_outputs_spec[typing_extensions.Self]
+ async def aio(self, /, started_at: float, outputs: list[modal_proto.api_pb2.FunctionPutOutputsItem]) -> None:
+ """Send pre-built output items with retry and chunking."""
+ ...

- def serialize_exception(self, exc: BaseException) -> bytes: ...
- def serialize_traceback(self, exc: BaseException) -> tuple[typing.Optional[bytes], typing.Optional[bytes]]: ...
+ _send_outputs: ___send_outputs_spec[typing_extensions.Self]

  class __handle_user_exception_spec(typing_extensions.Protocol[SUPERSELF]):
  def __call__(self, /) -> synchronicity.combined_types.AsyncAndBlockingContextManager[None]:
@@ -493,10 +504,8 @@ class ContainerIOManager:
  def exit_context(self, started_at, input_ids: list[str]): ...

  class __push_outputs_spec(typing_extensions.Protocol[SUPERSELF]):
- def __call__(self, /, io_context: IOContext, started_at: float, data: typing.Any, data_format: int) -> None: ...
- async def aio(
- self, /, io_context: IOContext, started_at: float, data: typing.Any, data_format: int
- ) -> None: ...
+ def __call__(self, /, io_context: IOContext, started_at: float, output_data: list[typing.Any]) -> None: ...
+ async def aio(self, /, io_context: IOContext, started_at: float, output_data: list[typing.Any]) -> None: ...

  push_outputs: __push_outputs_spec[typing_extensions.Self]
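Note on the change above: the per-input tuples handled by `_generate_inputs` and `IOContext.create` gain a fourth element, an `attempt_token` string, and the data-plane methods (`get_data_in`, `put_data_out`, `generator_output_sender`) now take the token explicitly. A minimal sketch of the new tuple shape, using made-up placeholder values rather than real protobuf messages:

```python
# Illustrative only: the tuples yielded by _generate_inputs and consumed by
# IOContext.create gain a fourth element (attempt_token) in 1.2.x.
# fn_input stands in for a modal_proto.api_pb2.FunctionInput message.
fn_input = object()

old_shape = [("in-abc", 0, "fc-def", fn_input)]            # 1.0.x: (input_id, retry_count, function_call_id, input)
new_shape = [("in-abc", 0, "fc-def", "at-123", fn_input)]  # 1.2.x: attempt_token inserted before the input

for input_id, retry_count, function_call_id, attempt_token, proto_input in new_shape:
    assert isinstance(attempt_token, str)
```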
modal/_runtime/execution_context.py

@@ -72,22 +72,38 @@ def current_function_call_id() -> Optional[str]:
  return None


- def _set_current_context_ids(input_ids: list[str], function_call_ids: list[str]) -> Callable[[], None]:
- assert len(input_ids) == len(function_call_ids) and len(input_ids) > 0
+ def current_attempt_token() -> Optional[str]:
+ # This ContextVar isn't useful to expose to users.
+ try:
+ return _current_attempt_token.get()
+ except LookupError:
+ return None
+
+
+ def _set_current_context_ids(
+ input_ids: list[str], function_call_ids: list[str], attempt_tokens: list[str]
+ ) -> Callable[[], None]:
+ assert len(input_ids) == len(function_call_ids) == len(attempt_tokens) and input_ids
+
  input_id = input_ids[0]
  function_call_id = function_call_ids[0]
+ attempt_token = attempt_tokens[0]
+
  input_token = _current_input_id.set(input_id)
  function_call_token = _current_function_call_id.set(function_call_id)
+ attempt_token_token = _current_attempt_token.set(attempt_token)

  def _reset_current_context_ids():
  _current_input_id.reset(input_token)
  _current_function_call_id.reset(function_call_token)
+ _current_attempt_token.reset(attempt_token_token)

  return _reset_current_context_ids


  _current_input_id: ContextVar = ContextVar("_current_input_id")
  _current_function_call_id: ContextVar = ContextVar("_current_function_call_id")
+ _current_attempt_token: ContextVar = ContextVar("_current_attempt_token")

  _is_currently_importing = False # we set this to True while a container is importing user code
modal/_runtime/execution_context.pyi

@@ -68,11 +68,14 @@ def current_function_call_id() -> typing.Optional[str]:
  """
  ...

+ def current_attempt_token() -> typing.Optional[str]: ...
  def _set_current_context_ids(
- input_ids: list[str], function_call_ids: list[str]
+ input_ids: list[str], function_call_ids: list[str], attempt_tokens: list[str]
  ) -> collections.abc.Callable[[], None]: ...
  def _import_context(): ...

  _current_input_id: contextvars.ContextVar

  _current_function_call_id: contextvars.ContextVar
+
+ _current_attempt_token: contextvars.ContextVar
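The `attempt_token` plumbing above uses the standard `ContextVar` set/reset pattern. A self-contained sketch of the same bookkeeping, assuming nothing beyond the standard library (names mirror the diff, but this snippet is illustrative, not modal code):

```python
from contextvars import ContextVar
from typing import Callable, Optional

_current_attempt_token: ContextVar = ContextVar("_current_attempt_token")

def current_attempt_token() -> Optional[str]:
    # As in the diff: an unset ContextVar raises LookupError, mapped to None.
    try:
        return _current_attempt_token.get()
    except LookupError:
        return None

def set_attempt_token(token: str) -> Callable[[], None]:
    # .set() returns a Token that restores the previous value on .reset().
    reset_token = _current_attempt_token.set(token)
    return lambda: _current_attempt_token.reset(reset_token)

reset = set_attempt_token("at-123")  # placeholder value
assert current_attempt_token() == "at-123"
reset()
assert current_attempt_token() is None
```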
modal/_runtime/gpu_memory_snapshot.py

@@ -1,25 +1,34 @@
  # Copyright Modal Labs 2022
  #
  # This module provides a simple interface for creating GPU memory snapshots,
- # provising a convenient interface to `cuda-checkpoint` [1]. This is intended
+ # providing a convenient interface to `cuda-checkpoint` [1]. This is intended
  # to be used in conjunction with memory snapshots.
  #
  # [1] https://github.com/NVIDIA/cuda-checkpoint

  import subprocess
  import time
- from concurrent.futures import ThreadPoolExecutor
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from dataclasses import dataclass
  from enum import Enum
  from pathlib import Path
+ from typing import List, Optional

  from modal.config import config, logger

  CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")

+ # Maximum total duration for an entire toggle operation.
+ CUDA_CHECKPOINT_TOGGLE_TIMEOUT: float = 5 * 60.0
+
+ # Maximum total duration for each individual `cuda-checkpoint` invocation.
+ CUDA_CHECKPOINT_TIMEOUT: float = 90
+

  class CudaCheckpointState(Enum):
- """State representation from the CUDA API: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc96cdda177a2b8c296144567cbea4f23"""
+ """State representation from the CUDA API [1].
+
+ [1] https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html"""

  RUNNING = "running"
  LOCKED = "locked"
@@ -28,6 +37,8 @@ class CudaCheckpointState(Enum):


  class CudaCheckpointException(Exception):
+ """Exception raised for CUDA checkpoint operations."""
+
  pass


@@ -39,24 +50,44 @@ class CudaCheckpointProcess:
  pid: int
  state: CudaCheckpointState

- def toggle(self, target_state: CudaCheckpointState, timeout_secs: float = 5 * 60.0):
+ def toggle(self, target_state: CudaCheckpointState, skip_first_refresh: bool = False) -> None:
  """Toggle CUDA checkpoint state for current process, moving GPU memory to the
- CPU and back depending on the current process state when called."""
+ CPU and back depending on the current process state when called.
+ """
  logger.debug(f"PID: {self.pid} Toggling CUDA checkpoint state to {target_state.value}")

  start_time = time.monotonic()
-
- while self._should_continue_toggle(target_state, start_time, timeout_secs):
- self._execute_toggle_command()
- time.sleep(0.1)
+ retry_count = 0
+ max_retries = 3
+
+ attempts = 0
+ while self._should_continue_toggle(
+ target_state, start_time, refresh=not (skip_first_refresh and attempts == 0)
+ ):
+ attempts += 1
+ try:
+ self._execute_toggle_command()
+ # Use exponential backoff for retries
+ sleep_time = min(0.1 * (2**retry_count), 1.0)
+ time.sleep(sleep_time)
+ retry_count = 0
+ except CudaCheckpointException as e:
+ retry_count += 1
+ if retry_count >= max_retries:
+ raise CudaCheckpointException(
+ f"PID: {self.pid} Failed to toggle state after {max_retries} retries: {e}"
+ )
+ logger.debug(f"PID: {self.pid} Retry {retry_count}/{max_retries} after error: {e}")
+ time.sleep(0.5 * retry_count)

  logger.debug(f"PID: {self.pid} Target state {target_state.value} reached")

  def _should_continue_toggle(
- self, target_state: CudaCheckpointState, start_time: float, timeout_secs: float
+ self, target_state: CudaCheckpointState, start_time: float, refresh: bool = True
  ) -> bool:
  """Check if toggle operation should continue based on current state and timeout."""
- self.refresh_state()
+ if refresh:
+ self.refresh_state()

  if self.state == target_state:
  return False
@@ -65,7 +96,7 @@ class CudaCheckpointProcess:
  raise CudaCheckpointException(f"PID: {self.pid} CUDA process state is {self.state}")

  elapsed = time.monotonic() - start_time
- if elapsed >= timeout_secs:
+ if elapsed >= CUDA_CHECKPOINT_TOGGLE_TIMEOUT:
  raise CudaCheckpointException(
  f"PID: {self.pid} Timeout after {elapsed:.2f}s waiting for state {target_state.value}. "
  f"Current state: {self.state}"
@@ -73,19 +104,25 @@ class CudaCheckpointProcess:

  return True

- def _execute_toggle_command(self):
+ def _execute_toggle_command(self) -> None:
  """Execute the cuda-checkpoint toggle command."""
  try:
- subprocess.run(
+ _ = subprocess.run(
  [CUDA_CHECKPOINT_PATH, "--toggle", "--pid", str(self.pid)],
  check=True,
  capture_output=True,
  text=True,
+ timeout=CUDA_CHECKPOINT_TIMEOUT,
  )
  logger.debug(f"PID: {self.pid} Successfully toggled CUDA checkpoint state")
  except subprocess.CalledProcessError as e:
- logger.debug(f"PID: {self.pid} Failed to toggle CUDA checkpoint state: {e.stderr}")
- raise CudaCheckpointException(e.stderr)
+ error_msg = f"PID: {self.pid} Failed to toggle CUDA checkpoint state: {e.stderr}"
+ logger.debug(error_msg)
+ raise CudaCheckpointException(error_msg)
+ except subprocess.TimeoutExpired:
+ error_msg = f"PID: {self.pid} Toggle command timed out"
+ logger.debug(error_msg)
+ raise CudaCheckpointException(error_msg)

  def refresh_state(self) -> None:
  """Refreshes the current CUDA checkpoint state for this process."""
@@ -95,15 +132,20 @@ class CudaCheckpointProcess:
  check=True,
  capture_output=True,
  text=True,
- timeout=5,
+ timeout=CUDA_CHECKPOINT_TIMEOUT,
  )

  state_str = result.stdout.strip().lower()
  self.state = CudaCheckpointState(state_str)

  except subprocess.CalledProcessError as e:
- logger.debug(f"PID: {self.pid} Failed to get CUDA checkpoint state: {e.stderr}")
- raise CudaCheckpointException(e.stderr)
+ error_msg = f"PID: {self.pid} Failed to get CUDA checkpoint state: {e.stderr}"
+ logger.debug(error_msg)
+ raise CudaCheckpointException(error_msg)
+ except subprocess.TimeoutExpired:
+ error_msg = f"PID: {self.pid} Get state command timed out"
+ logger.debug(error_msg)
+ raise CudaCheckpointException(error_msg)


  class CudaCheckpointSession:
@@ -111,12 +153,17 @@ class CudaCheckpointSession:

  def __init__(self):
  self.cuda_processes = self._get_cuda_pids()
- logger.debug(f"PIDs with CUDA sessions: {[c.pid for c in self.cuda_processes]}")
+ if self.cuda_processes:
+ logger.debug(
+ f"Found {len(self.cuda_processes)} PID(s) with CUDA sessions: {[c.pid for c in self.cuda_processes]}"
+ )
+ else:
+ logger.debug("No CUDA sessions found.")

- def _get_cuda_pids(self) -> list[CudaCheckpointProcess]:
+ def _get_cuda_pids(self) -> List[CudaCheckpointProcess]:
  """Iterates over all PIDs and identifies the ones that have running
  CUDA sessions."""
- cuda_pids: list[CudaCheckpointProcess] = []
+ cuda_pids: List[CudaCheckpointProcess] = []

  # Get all active process IDs from /proc directory
  proc_dir = Path("/proc")
@@ -125,75 +172,135 @@ class CudaCheckpointSession:
  "OS does not have /proc path rendering it incompatible with GPU memory snapshots."
  )

- for entry in proc_dir.iterdir():
- if not entry.name.isdigit():
- continue
+ # Get all numeric directories (PIDs) from /proc
+ pid_dirs = [entry for entry in proc_dir.iterdir() if entry.name.isdigit()]

- pid = int(entry.name)
- try:
- # Call cuda-checkpoint to check if this PID has a CUDA session
- result = subprocess.run(
- [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
- capture_output=True,
- text=True,
- timeout=10,
- )
-
- # If the command succeeds (return code 0), this PID has a CUDA session
- if result.returncode == 0:
- state_str = result.stdout.strip().lower()
- state = CudaCheckpointState(state_str)
-
- cuda_checkpoint_process = CudaCheckpointProcess(pid=pid, state=state)
- cuda_pids.append(cuda_checkpoint_process)
-
- # Command failed, which is expected for PIDs without CUDA sessions
- except subprocess.CalledProcessError:
- continue
+ # Use ThreadPoolExecutor to check PIDs in parallel for better performance
+ with ThreadPoolExecutor(max_workers=min(50, len(pid_dirs))) as executor:
+ future_to_pid = {
+ executor.submit(self._check_cuda_session, int(entry.name)): int(entry.name) for entry in pid_dirs
+ }

- # Raise other exceptions
- except subprocess.TimeoutExpired:
- raise CudaCheckpointException(f"Failed to get CUDA state for PID {pid}")
- except Exception as e:
- raise CudaCheckpointException(e)
+ for future in as_completed(future_to_pid):
+ pid = future_to_pid[future]
+ try:
+ cuda_process = future.result()
+ if cuda_process:
+ cuda_pids.append(cuda_process)
+ except Exception as e:
+ logger.debug(f"Error checking PID {pid}: {e}")

  # Sort PIDs for ordered checkpointing
  cuda_pids.sort(key=lambda x: x.pid)
  return cuda_pids

+ def _check_cuda_session(self, pid: int) -> Optional[CudaCheckpointProcess]:
+ """Check if a specific PID has a CUDA session."""
+ try:
+ result = subprocess.run(
+ [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
+ capture_output=True,
+ text=True,
+ # This should be quick since no checkpoint has taken place yet
+ timeout=5,
+ )
+
+ # If the command succeeds (return code 0), this PID has a CUDA session
+ if result.returncode == 0:
+ state_str = result.stdout.strip().lower()
+ state = CudaCheckpointState(state_str)
+ return CudaCheckpointProcess(pid=pid, state=state)
+
+ except subprocess.CalledProcessError:
+ # Command failed, which is expected for PIDs without CUDA sessions
+ pass
+ except subprocess.TimeoutExpired:
+ logger.debug(f"Timeout checking CUDA state for PID {pid}")
+ except Exception as e:
+ logger.debug(f"Error checking PID {pid}: {e}")
+
+ return None
+
  def checkpoint(self) -> None:
+ """Checkpoint all CUDA processes, moving GPU memory to CPU."""
+ if not self.cuda_processes:
+ logger.debug("No CUDA processes to checkpoint.")
+ return
+
  # Validate all states first
  for proc in self.cuda_processes:
+ proc.refresh_state() # Refresh state before validation
  if proc.state != CudaCheckpointState.RUNNING:
- raise CudaCheckpointException(f"CUDA session not in {CudaCheckpointState.RUNNING} state.")
+ raise CudaCheckpointException(
+ f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.RUNNING.value} state. "
+ f"Current state: {proc.state.value}"
+ )

  # Moving state from GPU to CPU can take several seconds per CUDA session.
  # Make a parallel call per CUDA session.
  start = time.perf_counter()

- def checkpoint_impl(proc: CudaCheckpointProcess):
+ def checkpoint_impl(proc: CudaCheckpointProcess) -> None:
  proc.toggle(CudaCheckpointState.CHECKPOINTED)

  with ThreadPoolExecutor() as executor:
- list(executor.map(checkpoint_impl, self.cuda_processes))
+ futures = [executor.submit(checkpoint_impl, proc) for proc in self.cuda_processes]
+
+ # Wait for all futures and collect any exceptions
+ exceptions = []
+ for future in as_completed(futures):
+ try:
+ future.result()
+ except Exception as e:
+ exceptions.append(e)
+
+ if exceptions:
+ raise CudaCheckpointException(
+ f"Failed to checkpoint {len(exceptions)} processes: {'; '.join(str(e) for e in exceptions)}"
+ )

  elapsed = time.perf_counter() - start
- logger.debug(f"Checkpointing CUDA sessions took => {elapsed:.3f}s")
+ logger.debug(f"Checkpointing {len(self.cuda_processes)} CUDA sessions took => {elapsed:.3f}s")

  def restore(self) -> None:
- # Validate all states first
- for proc in self.cuda_processes:
- if proc.state != CudaCheckpointState.CHECKPOINTED:
- raise CudaCheckpointException(f"CUDA session not in {CudaCheckpointState.CHECKPOINTED} state.")
+ """Restore all CUDA processes, moving memory back from CPU to GPU."""
+ if not self.cuda_processes:
+ logger.debug("No CUDA sessions to restore.")
+ return

  # See checkpoint() for rationale about parallelism.
  start = time.perf_counter()

- def restore_process(proc: CudaCheckpointProcess):
- proc.toggle(CudaCheckpointState.RUNNING)
+ def restore_process(proc: CudaCheckpointProcess) -> None:
+ proc.toggle(CudaCheckpointState.RUNNING, skip_first_refresh=True)

  with ThreadPoolExecutor() as executor:
- list(executor.map(restore_process, self.cuda_processes))
+ futures = [executor.submit(restore_process, proc) for proc in self.cuda_processes]
+
+ # Wait for all futures and collect any exceptions
+ exceptions = []
+ for future in as_completed(futures):
+ try:
+ future.result()
+ except Exception as e:
+ exceptions.append(e)
+
+ if exceptions:
+ raise CudaCheckpointException(
+ f"Failed to restore {len(exceptions)} processes: {'; '.join(str(e) for e in exceptions)}"
+ )

  elapsed = time.perf_counter() - start
- logger.debug(f"Restoring CUDA sessions took => {elapsed:.3f}s")
+ logger.debug(f"Restoring {len(self.cuda_processes)} CUDA session(s) took => {elapsed:.3f}s")
+
+ def get_process_count(self) -> int:
+ """Get the number of CUDA processes managed by this session."""
+ return len(self.cuda_processes)
+
+ def get_process_states(self) -> List[tuple[int, CudaCheckpointState]]:
+ """Get current states of all managed processes."""
+ states = []
+ for proc in self.cuda_processes:
+ proc.refresh_state()
+ states.append((proc.pid, proc.state))
+ return states
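Taken together, the rewrite scans `/proc` in parallel, adds per-invocation timeouts and retry with exponential backoff to `cuda-checkpoint` calls, and aggregates worker failures. A hypothetical usage sketch of the resulting `CudaCheckpointSession` API (requires the `cuda-checkpoint` binary and a Linux `/proc` filesystem; these are internal modal runtime helpers, not public API):

```python
from modal._runtime.gpu_memory_snapshot import (
    CudaCheckpointSession,
    CudaCheckpointState,
)

session = CudaCheckpointSession()  # scans /proc for CUDA PIDs in parallel
print(f"{session.get_process_count()} CUDA process(es) found")

# Move GPU memory to the CPU (one worker thread per CUDA process).
session.checkpoint()
assert all(
    state == CudaCheckpointState.CHECKPOINTED
    for _, state in session.get_process_states()
)

# ...take the memory snapshot here...

# Move memory back to the GPU; restore() skips the first state refresh.
session.restore()
```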