modal 0.73.8__py3-none-any.whl → 0.73.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,6 +25,7 @@ from grpclib import Status
25
25
  from synchronicity.async_wrap import asynccontextmanager
26
26
 
27
27
  import modal_proto.api_pb2
28
+ from modal._runtime import gpu_memory_snapshot
28
29
  from modal._serialization import deserialize, serialize, serialize_data_format
29
30
  from modal._traceback import extract_traceback, print_exception
30
31
  from modal._utils.async_utils import TaskContext, asyncify, synchronize_api, synchronizer
@@ -877,6 +878,17 @@ class _ContainerIOManager:
877
878
  if value != "":
878
879
  config.override_locally(key, value)
879
880
 
881
+ # Restore GPU memory.
882
+ if self.function_def._experimental_enable_gpu_snapshot and self.function_def.resources.gpu_config.gpu_type:
883
+ logger.debug("GPU memory snapshot enabled. Attempting to restore GPU memory.")
884
+ gpu_process_state = gpu_memory_snapshot.get_state()
885
+ if gpu_process_state != gpu_memory_snapshot.CudaCheckpointState.CHECKPOINTED:
886
+ raise ValueError(
887
+ "Cannot restore GPU state if GPU isn't in a 'checkpointed' state. "
888
+ f"Current GPU state: {gpu_process_state}"
889
+ )
890
+ gpu_memory_snapshot.toggle()
891
+
880
892
  # Restore input to default state.
881
893
  self.current_input_id = None
882
894
  self.current_inputs = {}
@@ -892,6 +904,18 @@ class _ContainerIOManager:
892
904
 
893
905
  # Pause heartbeats since they keep the client connection open which causes the snapshotter to crash
894
906
  async with self.heartbeat_condition:
907
+ # Snapshot GPU memory.
908
+ if self.function_def._experimental_enable_gpu_snapshot and self.function_def.resources.gpu_config.gpu_type:
909
+ logger.debug("GPU memory snapshot enabled. Attempting to snapshot GPU memory.")
910
+ gpu_process_state = gpu_memory_snapshot.get_state()
911
+ if gpu_process_state != gpu_memory_snapshot.CudaCheckpointState.RUNNING:
912
+ raise ValueError(
913
+ "Cannot snapshot GPU state if it isn't running. " f"Current GPU state: {gpu_process_state}"
914
+ )
915
+
916
+ gpu_memory_snapshot.toggle()
917
+ gpu_memory_snapshot.wait_for_state(gpu_memory_snapshot.CudaCheckpointState.CHECKPOINTED)
918
+
895
919
  # Notify the heartbeat loop that the snapshot phase has begun in order to
896
920
  # prevent it from sending heartbeat RPCs
897
921
  self._waiting_for_memory_snapshot = True
@@ -0,0 +1,104 @@
1
+ # Copyright Modal Labs 2022
2
+ #
3
+ # This module provides a simple interface for creating GPU memory snapshots,
4
+ # providing a convenient interface to `cuda-checkpoint` [1]. This is intended
5
+ # to be used in conjunction with memory snapshots.
6
+ #
7
+ # [1] https://github.com/NVIDIA/cuda-checkpoint
8
+
9
+ import os
10
+ import subprocess
11
+ import time
12
+ from enum import Enum
13
+
14
+ from modal.config import config, logger
15
+
16
+ CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
17
+
18
+
19
class CudaCheckpointState(Enum):
    """Checkpoint states of a CUDA process, mirroring the CUDA driver API enum.

    Reference: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc96cdda177a2b8c296144567cbea4f23

    Each value is the lowercase string that `get_state` obtains after
    stripping and lower-casing the `cuda-checkpoint --get-state` output.
    """

    RUNNING = "running"
    LOCKED = "locked"
    CHECKPOINTED = "checkpointed"
    FAILED = "failed"
26
+
27
+
28
class CudaCheckpointException(Exception):
    """Error raised by this module when a `cuda-checkpoint` operation fails."""
30
+
31
+
32
def toggle():
    """Flip the CUDA checkpoint state of the current process.

    Runs the `cuda-checkpoint` binary at `CUDA_CHECKPOINT_PATH` with
    `--toggle` against this process's PID. Depending on the state the
    process is in when called, this moves GPU memory to the CPU or back.

    Raises:
        CudaCheckpointException: If the binary exits with a non-zero status.
            The exception message is the captured stderr.
    """
    pid = get_own_pid()
    logger.debug(f"Toggling CUDA checkpoint state for PID {pid}")

    # Value handed to `--timeout`, expressed in milliseconds (5 seconds).
    lock_timeout_ms = 5 * 1000
    command = [
        CUDA_CHECKPOINT_PATH,
        "--toggle",
        "--pid",
        str(pid),
        "--timeout",
        str(lock_timeout_ms),
    ]

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.debug(f"Failed to toggle CUDA checkpoint state: {e.stderr}")
        raise CudaCheckpointException(e.stderr)
    else:
        logger.debug("Successfully toggled CUDA checkpoint state")
58
+
59
+
60
def get_state() -> CudaCheckpointState:
    """Query `cuda-checkpoint` for the checkpoint state of this process.

    Returns:
        The `CudaCheckpointState` member whose value equals the binary's
        stdout after stripping whitespace and lower-casing.

    Raises:
        CudaCheckpointException: If the binary exits with a non-zero status.
            The exception message is the captured stderr.
        ValueError: If the reported state is not a `CudaCheckpointState` value.
    """
    command = [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(get_own_pid())]

    try:
        completed = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.debug(f"Failed to get CUDA checkpoint state: {e.stderr}")
        raise CudaCheckpointException(e.stderr)

    # Normalise the raw output so it can be matched against the enum values.
    state_str = completed.stdout.strip().lower()
    logger.debug(f"Raw state output: {state_str}")
    return CudaCheckpointState(state_str)
77
+
78
+
79
def wait_for_state(target_state: CudaCheckpointState, timeout_secs: float = 5.0):
    """Block until this process's CUDA checkpoint state equals `target_state`.

    Polls `get_state()` every 100 ms and returns as soon as the target state
    is observed.

    Args:
        target_state: The state to wait for.
        timeout_secs: Maximum time to keep polling, in seconds.

    Raises:
        CudaCheckpointException: If the process enters the FAILED state while
            a different state is awaited, if `timeout_secs` elapses first, or
            if `get_state()` itself fails.
    """
    logger.debug(f"Waiting for CUDA checkpoint state {target_state.value}")
    start_time = time.monotonic()

    while True:
        current_state = get_state()

        if current_state == target_state:
            logger.debug(f"Target state {target_state.value} reached")
            # Stop polling once the target is reached; without this exit the
            # loop would keep running until the timeout below raised.
            return

        if current_state == CudaCheckpointState.FAILED:
            raise CudaCheckpointException(f"CUDA process state is {current_state}")

        elapsed = time.monotonic() - start_time
        if elapsed >= timeout_secs:
            raise CudaCheckpointException(f"Timeout after {elapsed:.2f}s waiting for state {target_state.value}")

        # Short pause between polls to avoid spawning the binary in a tight loop.
        time.sleep(0.1)
98
+
99
+
100
def get_own_pid():
    """Return the process ID (PID) of the running Python process.

    Relies solely on the standard library (`os.getpid`).
    """
    own_pid = os.getpid()
    return own_pid
modal/client.pyi CHANGED
@@ -27,7 +27,7 @@ class _Client:
27
27
  _snapshotted: bool
28
28
 
29
29
  def __init__(
30
- self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.8"
30
+ self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.9"
31
31
  ): ...
32
32
  def is_closed(self) -> bool: ...
33
33
  @property
@@ -85,7 +85,7 @@ class Client:
85
85
  _snapshotted: bool
86
86
 
87
87
  def __init__(
88
- self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.8"
88
+ self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.9"
89
89
  ): ...
90
90
  def is_closed(self) -> bool: ...
91
91
  @property
modal/config.py CHANGED
@@ -223,6 +223,7 @@ _SETTINGS = {
223
223
  "strict_parameters": _Setting(False, transform=_to_boolean), # For internal/experimental use
224
224
  "snapshot_debug": _Setting(False, transform=_to_boolean),
225
225
  "client_retries": _Setting(False, transform=_to_boolean), # For internal testing.
226
+ "cuda_checkpoint_path": _Setting("/__modal/.bin/cuda-checkpoint"), # Used for snapshotting GPU memory.
226
227
  }
227
228
 
228
229
 
modal/functions.pyi CHANGED
@@ -200,11 +200,11 @@ class Function(
200
200
 
201
201
  _call_generator_nowait: ___call_generator_nowait_spec[typing_extensions.Self]
202
202
 
203
- class __remote_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
203
+ class __remote_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
204
204
  def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER: ...
205
205
  async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER: ...
206
206
 
207
- remote: __remote_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
207
+ remote: __remote_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
208
208
 
209
209
  class __remote_gen_spec(typing_extensions.Protocol[SUPERSELF]):
210
210
  def __call__(self, *args, **kwargs) -> typing.Generator[typing.Any, None, None]: ...
@@ -219,19 +219,19 @@ class Function(
219
219
  self, *args: modal._functions.P.args, **kwargs: modal._functions.P.kwargs
220
220
  ) -> modal._functions.OriginalReturnType: ...
221
221
 
222
- class ___experimental_spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
222
+ class ___experimental_spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
223
223
  def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
224
224
  async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
225
225
 
226
226
  _experimental_spawn: ___experimental_spawn_spec[
227
- modal._functions.ReturnType, modal._functions.P, typing_extensions.Self
227
+ modal._functions.P, modal._functions.ReturnType, typing_extensions.Self
228
228
  ]
229
229
 
230
- class __spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
230
+ class __spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
231
231
  def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
232
232
  async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
233
233
 
234
- spawn: __spawn_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
234
+ spawn: __spawn_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
235
235
 
236
236
  def get_raw_f(self) -> collections.abc.Callable[..., typing.Any]: ...
237
237
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: modal
3
- Version: 0.73.8
3
+ Version: 0.73.9
4
4
  Summary: Python client library for Modal
5
5
  Author: Modal Labs
6
6
  Author-email: support@modal.com
@@ -21,12 +21,12 @@ modal/app.py,sha256=wRygVSrWH8iIqhDAAl2Ww_RAkz8MCJZ0Jt9qYZCF6SA,44626
21
21
  modal/app.pyi,sha256=lxiuWzE_OLb3WHg-H7Pek9DGBuCUzZ55P594VhJL5LA,26113
22
22
  modal/call_graph.py,sha256=1g2DGcMIJvRy-xKicuf63IVE98gJSnQsr8R_NVMptNc,2581
23
23
  modal/client.py,sha256=8SQawr7P1PNUCq1UmJMUQXG2jIo4Nmdcs311XqrNLRE,15276
24
- modal/client.pyi,sha256=kOt17KK6b51zHs9T5vme0cCrLzJu-nw1DTE1kSuxXcQ,7591
24
+ modal/client.pyi,sha256=UOfACeLizRMYzf2NDFzalr9sLhXAy447M3jSwPxPCbc,7591
25
25
  modal/cloud_bucket_mount.py,sha256=YOe9nnvSr4ZbeCn587d7_VhE9IioZYRvF9VYQTQux08,5914
26
26
  modal/cloud_bucket_mount.pyi,sha256=30T3K1a89l6wzmEJ_J9iWv9SknoGqaZDx59Xs-ZQcmk,1607
27
27
  modal/cls.py,sha256=kNnZrBYVXOhgEXU0rDWk2Hr-bQRrsZkMKDgC-TD_6Bs,31063
28
28
  modal/cls.pyi,sha256=gb6QNwfX3HSJfcZXPY36N9ywF7aBJTwwtoARnf3G1HQ,8877
29
- modal/config.py,sha256=BzhZYUUwOmvVwf6x5kf0ywMC257s648dmuhsnB6g3gk,11041
29
+ modal/config.py,sha256=XT1W4Y9PVkbYMAXjJRshvQEPDhZmnfW_ZRMwl8XKoqA,11149
30
30
  modal/container_process.py,sha256=WTqLn01dJPVkPpwR_0w_JH96ceN5mV4TGtiu1ZR2RRA,6108
31
31
  modal/container_process.pyi,sha256=Hf0J5JyDdCCXBJSKx6gvkPOo0XrztCm78xzxamtzUjQ,2828
32
32
  modal/dict.py,sha256=vc5lQVqzeDUCb4fRjnOlqYK2GmBb0fIhZmvB0xIBG0U,12921
@@ -40,7 +40,7 @@ modal/file_io.py,sha256=lcMs_E9Xfm0YX1t9U2wNIBPnqHRxmImqjLW1GHqVmyg,20945
40
40
  modal/file_io.pyi,sha256=NTRft1tbPSWf9TlWVeZmTlgB5AZ_Zhu2srWIrWr7brk,9445
41
41
  modal/file_pattern_matcher.py,sha256=1cZ4V2wSLiaXqAqStETSwp3bzDD6QZOt6pmmjk3Okz4,6505
42
42
  modal/functions.py,sha256=kcNHvqeGBxPI7Cgd57NIBBghkfbeFJzXO44WW0jSmao,325
43
- modal/functions.pyi,sha256=mtngzj8VlzMOQATe6muBN5oH_Gw9zGKxMKZ56Z-41kU,14288
43
+ modal/functions.pyi,sha256=QYZy3BCjA2y3UC217e3YG-omyG0E1Jx-Uc-sonsyQsE,14288
44
44
  modal/gpu.py,sha256=2qZMNnoMrjU-5Bu7fx68pANUAKTtZq0EWEEeBA9OUVQ,7426
45
45
  modal/image.py,sha256=Vjsi7wS9dEcoj-7m7_LmvbK5iqEuFz-SHKl2K-qWcew,90952
46
46
  modal/image.pyi,sha256=A5mW2dBguEhmRo815Ax1rBIMXTCriu7PqLMHoUPsez8,26372
@@ -83,8 +83,9 @@ modal/volume.py,sha256=JAWeDvoAG95tMBv-fYIERyHsJPS_X_xGpxRRmYtb6j0,30096
83
83
  modal/volume.pyi,sha256=kTsXarphjZILXci84LQy7EyC84eXUs5-7D62IM5q3eE,12491
84
84
  modal/_runtime/__init__.py,sha256=MIEP8jhXUeGq_eCjYFcqN5b1bxBM4fdk0VESpjWR0fc,28
85
85
  modal/_runtime/asgi.py,sha256=c4hmaMW1pLo-cm7ouriJjieuFm4ZF6D2LMy0638sfOs,22139
86
- modal/_runtime/container_io_manager.py,sha256=QVWMCvJatd2696wsauzXl20psxCYsR0d_CHeS5ceTsU,43201
86
+ modal/_runtime/container_io_manager.py,sha256=L6qv-Mo3mN3ttR5GX-G36cUhH_oz8wdP5WG0HT5FFzg,44619
87
87
  modal/_runtime/execution_context.py,sha256=E6ofm6j1POXGPxS841X3V7JU6NheVb8OkQc7JpLq4Kg,2712
88
+ modal/_runtime/gpu_memory_snapshot.py,sha256=vV6igsqN9CxOoH91kUkuaZQ32QfX5wdoXIS-6MIYX2Y,3315
88
89
  modal/_runtime/telemetry.py,sha256=T1RoAGyjBDr1swiM6pPsGRSITm7LI5FDK18oNXxY08U,5163
89
90
  modal/_runtime/user_code_imports.py,sha256=zl_Mq9dsrVF62x3w-iNK1YAhZKYAXeFaGpd4G7AySTc,14746
90
91
  modal/_utils/__init__.py,sha256=waLjl5c6IPDhSsdWAm9Bji4e2PVxamYABKAze6CHVXY,28
@@ -170,10 +171,10 @@ modal_proto/options_pb2_grpc.pyi,sha256=CImmhxHsYnF09iENPoe8S4J-n93jtgUYD2JPAc0y
170
171
  modal_proto/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
172
  modal_version/__init__.py,sha256=wiJQ53c-OMs0Xf1UeXOxQ7FwlV1VzIjnX6o-pRYZ_Pk,470
172
173
  modal_version/__main__.py,sha256=2FO0yYQQwDTh6udt1h-cBnGd1c4ZyHnHSI4BksxzVac,105
173
- modal_version/_version_generated.py,sha256=a8LMamoFRZKpp6SoTp5bDH1HRWGljeWLZesEIHf3WIE,148
174
- modal-0.73.8.dist-info/LICENSE,sha256=psuoW8kuDP96RQsdhzwOqi6fyWv0ct8CR6Jr7He_P_k,10173
175
- modal-0.73.8.dist-info/METADATA,sha256=wsn98lBqsvSVbab7C3DqlgtiyUF8vmevqqGNf7Ir6WE,2329
176
- modal-0.73.8.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
177
- modal-0.73.8.dist-info/entry_points.txt,sha256=An-wYgeEUnm6xzrAP9_NTSTSciYvvEWsMZILtYrvpAI,46
178
- modal-0.73.8.dist-info/top_level.txt,sha256=1nvYbOSIKcmU50fNrpnQnrrOpj269ei3LzgB6j9xGqg,64
179
- modal-0.73.8.dist-info/RECORD,,
174
+ modal_version/_version_generated.py,sha256=RgDn253uA_DeV6d6P40AOPpDJqMMVupp6gXgLMcak24,148
175
+ modal-0.73.9.dist-info/LICENSE,sha256=psuoW8kuDP96RQsdhzwOqi6fyWv0ct8CR6Jr7He_P_k,10173
176
+ modal-0.73.9.dist-info/METADATA,sha256=BYeidDOAvKaxgE_yMfAun3xjsRB6Q0jGKR8fcs9a0jU,2329
177
+ modal-0.73.9.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
178
+ modal-0.73.9.dist-info/entry_points.txt,sha256=An-wYgeEUnm6xzrAP9_NTSTSciYvvEWsMZILtYrvpAI,46
179
+ modal-0.73.9.dist-info/top_level.txt,sha256=1nvYbOSIKcmU50fNrpnQnrrOpj269ei3LzgB6j9xGqg,64
180
+ modal-0.73.9.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  # Copyright Modal Labs 2025
2
2
 
3
3
  # Note: Reset this value to -1 whenever you make a minor `0.X` release of the client.
4
- build_number = 8 # git: 6a2791f
4
+ build_number = 9 # git: 76a3f59
File without changes