PyPI - modal - Versions diffs - 0.73.8__py3-none-any.whl → 0.73.9__py3-none-any.whl - Mend

modal 0.73.8py3-none-any.whl → 0.73.9py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

modal/_runtime/container_io_manager.py +24 -0
modal/_runtime/gpu_memory_snapshot.py +104 -0
modal/client.pyi +2 -2
modal/config.py +1 -0
modal/functions.pyi +6 -6
{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/METADATA +1 -1
{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/RECORD +12 -11
modal_version/_version_generated.py +1 -1
{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/LICENSE +0 -0
{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/WHEEL +0 -0
{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/entry_points.txt +0 -0
{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/top_level.txt +0 -0

modal/_runtime/container_io_manager.py CHANGED Viewed

@@ -25,6 +25,7 @@ from grpclib import Status
 from synchronicity.async_wrap import asynccontextmanager
 import modal_proto.api_pb2
+from modal._runtime import gpu_memory_snapshot
 from modal._serialization import deserialize, serialize, serialize_data_format
 from modal._traceback import extract_traceback, print_exception
 from modal._utils.async_utils import TaskContext, asyncify, synchronize_api, synchronizer
@@ -877,6 +878,17 @@ class _ContainerIOManager:
             if value != "":
                 config.override_locally(key, value)
+        # Restore GPU memory.
+        if self.function_def._experimental_enable_gpu_snapshot and self.function_def.resources.gpu_config.gpu_type:
+            logger.debug("GPU memory snapshot enabled. Attempting to restore GPU memory.")
+            gpu_process_state = gpu_memory_snapshot.get_state()
+            if gpu_process_state != gpu_memory_snapshot.CudaCheckpointState.CHECKPOINTED:
+                raise ValueError(
+                    "Cannot restore GPU state if GPU isn't in a 'checkpointed' state. "
+                    f"Current GPU state: {gpu_process_state}"
+                )
+            gpu_memory_snapshot.toggle()
         # Restore input to default state.
         self.current_input_id = None
         self.current_inputs = {}
@@ -892,6 +904,18 @@ class _ContainerIOManager:
         # Pause heartbeats since they keep the client connection open which causes the snapshotter to crash
         async with self.heartbeat_condition:
+            # Snapshot GPU memory.
+            if self.function_def._experimental_enable_gpu_snapshot and self.function_def.resources.gpu_config.gpu_type:
+                logger.debug("GPU memory snapshot enabled. Attempting to snapshot GPU memory.")
+                gpu_process_state = gpu_memory_snapshot.get_state()
+                if gpu_process_state != gpu_memory_snapshot.CudaCheckpointState.RUNNING:
+                    raise ValueError(
+                        "Cannot snapshot GPU state if it isn't running. " f"Current GPU state: {gpu_process_state}"
+                    )
+                gpu_memory_snapshot.toggle()
+                gpu_memory_snapshot.wait_for_state(gpu_memory_snapshot.CudaCheckpointState.CHECKPOINTED)
             # Notify the heartbeat loop that the snapshot phase has begun in order to
             # prevent it from sending heartbeat RPCs
             self._waiting_for_memory_snapshot = True

modal/_runtime/gpu_memory_snapshot.py ADDED Viewed

@@ -0,0 +1,104 @@
+# Copyright Modal Labs 2022
+#
+# This module provides a simple interface for creating GPU memory snapshots,
+# providing a convenient interface to `cuda-checkpoint` [1]. This is intended
+# to be used in conjunction with memory snapshots.
+#
+# [1] https://github.com/NVIDIA/cuda-checkpoint
+import os
+import subprocess
+import time
+from enum import Enum
+from modal.config import config, logger
+# Executable invoked by `toggle()` and `get_state()`; read once at import time
+# from the `cuda_checkpoint_path` config setting.
+CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
+class CudaCheckpointState(Enum):
+    """State representation from the CUDA API: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc96cdda177a2b8c296144567cbea4f23"""
+    # Member values are the lowercase strings that `get_state()` matches against
+    # the trimmed stdout of `cuda-checkpoint --get-state`.
+    RUNNING = "running"
+    LOCKED = "locked"
+    CHECKPOINTED = "checkpointed"
+    # `wait_for_state()` raises as soon as this state is observed (unless it is the target).
+    FAILED = "failed"
+class CudaCheckpointException(Exception):
+    """Raised by this module when a `cuda-checkpoint` command exits with an error,
+    or when waiting for a checkpoint state fails or times out."""
+    pass
+def toggle():
+    """Toggle CUDA checkpoint state for current process, moving GPU memory to the
+    CPU and back depending on the current process state when called."""
+    pid = get_own_pid()
+    logger.debug(f"Toggling CUDA checkpoint state for PID {pid}")
+    try:
+        cuda_checkpoint_lock_timeout_ms = 5 * 1000
+        subprocess.run(
+            [
+                CUDA_CHECKPOINT_PATH,
+                "--toggle",
+                "--pid",
+                str(pid),
+                "--timeout",
+                str(cuda_checkpoint_lock_timeout_ms),
+            ],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+        logger.debug("Successfully toggled CUDA checkpoint state")
+    except subprocess.CalledProcessError as e:
+        logger.debug(f"Failed to toggle CUDA checkpoint state: {e.stderr}")
+        raise CudaCheckpointException(e.stderr)
+def get_state() -> CudaCheckpointState:
+    """Get current CUDA checkpoint state for this process."""
+    pid = get_own_pid()
+    try:
+        result = subprocess.run(
+            [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)], check=True, capture_output=True, text=True
+        )
+        # Parse output to get state
+        state_str = result.stdout.strip().lower()
+        logger.debug(f"Raw state output: {state_str}")
+        return CudaCheckpointState(state_str)
+    except subprocess.CalledProcessError as e:
+        logger.debug(f"Failed to get CUDA checkpoint state: {e.stderr}")
+        raise CudaCheckpointException(e.stderr)
+def wait_for_state(target_state: CudaCheckpointState, timeout_secs: float = 5.0):
+    """Wait for CUDA checkpoint to reach a specific state."""
+    logger.debug(f"Waiting for CUDA checkpoint state {target_state.value}")
+    start_time = time.monotonic()
+    while True:
+        current_state = get_state()
+        if current_state == target_state:
+            logger.debug(f"Target state {target_state.value} reached")
+        if current_state == CudaCheckpointState.FAILED:
+            raise CudaCheckpointException(f"CUDA process state is {current_state}")
+        elapsed = time.monotonic() - start_time
+        if elapsed >= timeout_secs:
+            raise CudaCheckpointException(f"Timeout after {elapsed:.2f}s waiting for state {target_state.value}")
+        time.sleep(0.1)
+def get_own_pid() -> int:
+    """Return the process ID (PID) of the current Python process.
+    Thin wrapper around `os.getpid()`; uses only the standard library.
+    """
+    return os.getpid()

modal/client.pyi CHANGED Viewed

@@ -27,7 +27,7 @@ class _Client:
     _snapshotted: bool
     def __init__(
-        self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.8"
+        self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.9"
     ): ...
     def is_closed(self) -> bool: ...
     @property
@@ -85,7 +85,7 @@ class Client:
     _snapshotted: bool
     def __init__(
-        self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.8"
+        self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.9"
     ): ...
     def is_closed(self) -> bool: ...
     @property

modal/config.py CHANGED Viewed

@@ -223,6 +223,7 @@ _SETTINGS = {
     "strict_parameters": _Setting(False, transform=_to_boolean),  # For internal/experimental use
     "snapshot_debug": _Setting(False, transform=_to_boolean),
     "client_retries": _Setting(False, transform=_to_boolean),  # For internal testing.
+    "cuda_checkpoint_path": _Setting("/__modal/.bin/cuda-checkpoint"),  # Used for snapshotting GPU memory.
 }

modal/functions.pyi CHANGED Viewed

@@ -200,11 +200,11 @@ class Function(
     _call_generator_nowait: ___call_generator_nowait_spec[typing_extensions.Self]
-    class __remote_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
+    class __remote_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
         def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER: ...
         async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER: ...
-    remote: __remote_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
+    remote: __remote_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
     class __remote_gen_spec(typing_extensions.Protocol[SUPERSELF]):
         def __call__(self, *args, **kwargs) -> typing.Generator[typing.Any, None, None]: ...
@@ -219,19 +219,19 @@ class Function(
         self, *args: modal._functions.P.args, **kwargs: modal._functions.P.kwargs
     ) -> modal._functions.OriginalReturnType: ...
-    class ___experimental_spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
+    class ___experimental_spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
         def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
         async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
     _experimental_spawn: ___experimental_spawn_spec[
-        modal._functions.ReturnType, modal._functions.P, typing_extensions.Self
+        modal._functions.P, modal._functions.ReturnType, typing_extensions.Self
     ]
-    class __spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
+    class __spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
         def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
         async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
-    spawn: __spawn_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
+    spawn: __spawn_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
     def get_raw_f(self) -> collections.abc.Callable[..., typing.Any]: ...

{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: modal
-Version: 0.73.8
+Version: 0.73.9
 Summary: Python client library for Modal
 Author: Modal Labs
 Author-email: support@modal.com

{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/RECORD RENAMED Viewed

@@ -21,12 +21,12 @@ modal/app.py,sha256=wRygVSrWH8iIqhDAAl2Ww_RAkz8MCJZ0Jt9qYZCF6SA,44626
 modal/app.pyi,sha256=lxiuWzE_OLb3WHg-H7Pek9DGBuCUzZ55P594VhJL5LA,26113
 modal/call_graph.py,sha256=1g2DGcMIJvRy-xKicuf63IVE98gJSnQsr8R_NVMptNc,2581
 modal/client.py,sha256=8SQawr7P1PNUCq1UmJMUQXG2jIo4Nmdcs311XqrNLRE,15276
-modal/client.pyi,sha256=kOt17KK6b51zHs9T5vme0cCrLzJu-nw1DTE1kSuxXcQ,7591
+modal/client.pyi,sha256=UOfACeLizRMYzf2NDFzalr9sLhXAy447M3jSwPxPCbc,7591
 modal/cloud_bucket_mount.py,sha256=YOe9nnvSr4ZbeCn587d7_VhE9IioZYRvF9VYQTQux08,5914
 modal/cloud_bucket_mount.pyi,sha256=30T3K1a89l6wzmEJ_J9iWv9SknoGqaZDx59Xs-ZQcmk,1607
 modal/cls.py,sha256=kNnZrBYVXOhgEXU0rDWk2Hr-bQRrsZkMKDgC-TD_6Bs,31063
 modal/cls.pyi,sha256=gb6QNwfX3HSJfcZXPY36N9ywF7aBJTwwtoARnf3G1HQ,8877
-modal/config.py,sha256=BzhZYUUwOmvVwf6x5kf0ywMC257s648dmuhsnB6g3gk,11041
+modal/config.py,sha256=XT1W4Y9PVkbYMAXjJRshvQEPDhZmnfW_ZRMwl8XKoqA,11149
 modal/container_process.py,sha256=WTqLn01dJPVkPpwR_0w_JH96ceN5mV4TGtiu1ZR2RRA,6108
 modal/container_process.pyi,sha256=Hf0J5JyDdCCXBJSKx6gvkPOo0XrztCm78xzxamtzUjQ,2828
 modal/dict.py,sha256=vc5lQVqzeDUCb4fRjnOlqYK2GmBb0fIhZmvB0xIBG0U,12921
@@ -40,7 +40,7 @@ modal/file_io.py,sha256=lcMs_E9Xfm0YX1t9U2wNIBPnqHRxmImqjLW1GHqVmyg,20945
 modal/file_io.pyi,sha256=NTRft1tbPSWf9TlWVeZmTlgB5AZ_Zhu2srWIrWr7brk,9445
 modal/file_pattern_matcher.py,sha256=1cZ4V2wSLiaXqAqStETSwp3bzDD6QZOt6pmmjk3Okz4,6505
 modal/functions.py,sha256=kcNHvqeGBxPI7Cgd57NIBBghkfbeFJzXO44WW0jSmao,325
-modal/functions.pyi,sha256=mtngzj8VlzMOQATe6muBN5oH_Gw9zGKxMKZ56Z-41kU,14288
+modal/functions.pyi,sha256=QYZy3BCjA2y3UC217e3YG-omyG0E1Jx-Uc-sonsyQsE,14288
 modal/gpu.py,sha256=2qZMNnoMrjU-5Bu7fx68pANUAKTtZq0EWEEeBA9OUVQ,7426
 modal/image.py,sha256=Vjsi7wS9dEcoj-7m7_LmvbK5iqEuFz-SHKl2K-qWcew,90952
 modal/image.pyi,sha256=A5mW2dBguEhmRo815Ax1rBIMXTCriu7PqLMHoUPsez8,26372
@@ -83,8 +83,9 @@ modal/volume.py,sha256=JAWeDvoAG95tMBv-fYIERyHsJPS_X_xGpxRRmYtb6j0,30096
 modal/volume.pyi,sha256=kTsXarphjZILXci84LQy7EyC84eXUs5-7D62IM5q3eE,12491
 modal/_runtime/__init__.py,sha256=MIEP8jhXUeGq_eCjYFcqN5b1bxBM4fdk0VESpjWR0fc,28
 modal/_runtime/asgi.py,sha256=c4hmaMW1pLo-cm7ouriJjieuFm4ZF6D2LMy0638sfOs,22139
-modal/_runtime/container_io_manager.py,sha256=QVWMCvJatd2696wsauzXl20psxCYsR0d_CHeS5ceTsU,43201
+modal/_runtime/container_io_manager.py,sha256=L6qv-Mo3mN3ttR5GX-G36cUhH_oz8wdP5WG0HT5FFzg,44619
 modal/_runtime/execution_context.py,sha256=E6ofm6j1POXGPxS841X3V7JU6NheVb8OkQc7JpLq4Kg,2712
+modal/_runtime/gpu_memory_snapshot.py,sha256=vV6igsqN9CxOoH91kUkuaZQ32QfX5wdoXIS-6MIYX2Y,3315
 modal/_runtime/telemetry.py,sha256=T1RoAGyjBDr1swiM6pPsGRSITm7LI5FDK18oNXxY08U,5163
 modal/_runtime/user_code_imports.py,sha256=zl_Mq9dsrVF62x3w-iNK1YAhZKYAXeFaGpd4G7AySTc,14746
 modal/_utils/__init__.py,sha256=waLjl5c6IPDhSsdWAm9Bji4e2PVxamYABKAze6CHVXY,28
@@ -170,10 +171,10 @@ modal_proto/options_pb2_grpc.pyi,sha256=CImmhxHsYnF09iENPoe8S4J-n93jtgUYD2JPAc0y
 modal_proto/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 modal_version/__init__.py,sha256=wiJQ53c-OMs0Xf1UeXOxQ7FwlV1VzIjnX6o-pRYZ_Pk,470
 modal_version/__main__.py,sha256=2FO0yYQQwDTh6udt1h-cBnGd1c4ZyHnHSI4BksxzVac,105
-modal_version/_version_generated.py,sha256=a8LMamoFRZKpp6SoTp5bDH1HRWGljeWLZesEIHf3WIE,148
-modal-0.73.8.dist-info/LICENSE,sha256=psuoW8kuDP96RQsdhzwOqi6fyWv0ct8CR6Jr7He_P_k,10173
-modal-0.73.8.dist-info/METADATA,sha256=wsn98lBqsvSVbab7C3DqlgtiyUF8vmevqqGNf7Ir6WE,2329
-modal-0.73.8.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
-modal-0.73.8.dist-info/entry_points.txt,sha256=An-wYgeEUnm6xzrAP9_NTSTSciYvvEWsMZILtYrvpAI,46
-modal-0.73.8.dist-info/top_level.txt,sha256=1nvYbOSIKcmU50fNrpnQnrrOpj269ei3LzgB6j9xGqg,64
-modal-0.73.8.dist-info/RECORD,,
+modal_version/_version_generated.py,sha256=RgDn253uA_DeV6d6P40AOPpDJqMMVupp6gXgLMcak24,148
+modal-0.73.9.dist-info/LICENSE,sha256=psuoW8kuDP96RQsdhzwOqi6fyWv0ct8CR6Jr7He_P_k,10173
+modal-0.73.9.dist-info/METADATA,sha256=BYeidDOAvKaxgE_yMfAun3xjsRB6Q0jGKR8fcs9a0jU,2329
+modal-0.73.9.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+modal-0.73.9.dist-info/entry_points.txt,sha256=An-wYgeEUnm6xzrAP9_NTSTSciYvvEWsMZILtYrvpAI,46
+modal-0.73.9.dist-info/top_level.txt,sha256=1nvYbOSIKcmU50fNrpnQnrrOpj269ei3LzgB6j9xGqg,64
+modal-0.73.9.dist-info/RECORD,,

modal_version/_version_generated.py CHANGED Viewed

@@ -1,4 +1,4 @@
 # Copyright Modal Labs 2025
 # Note: Reset this value to -1 whenever you make a minor `0.X` release of the client.
-build_number = 8  # git: 6a2791f
+build_number = 9  # git: 76a3f59

{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/LICENSE RENAMED Viewed

File without changes

{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/WHEEL RENAMED Viewed

File without changes

{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{modal-0.73.8.dist-info → modal-0.73.9.dist-info}/top_level.txt RENAMED Viewed

File without changes

modal 0.73.8__py3-none-any.whl → 0.73.9__py3-none-any.whl

modal 0.73.8py3-none-any.whl → 0.73.9py3-none-any.whl