modal 0.73.8__py3-none-any.whl → 0.73.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modal/_runtime/container_io_manager.py +24 -0
- modal/_runtime/gpu_memory_snapshot.py +104 -0
- modal/client.pyi +2 -2
- modal/config.py +1 -0
- modal/functions.pyi +6 -6
- {modal-0.73.8.dist-info → modal-0.73.9.dist-info}/METADATA +1 -1
- {modal-0.73.8.dist-info → modal-0.73.9.dist-info}/RECORD +12 -11
- modal_version/_version_generated.py +1 -1
- {modal-0.73.8.dist-info → modal-0.73.9.dist-info}/LICENSE +0 -0
- {modal-0.73.8.dist-info → modal-0.73.9.dist-info}/WHEEL +0 -0
- {modal-0.73.8.dist-info → modal-0.73.9.dist-info}/entry_points.txt +0 -0
- {modal-0.73.8.dist-info → modal-0.73.9.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,7 @@ from grpclib import Status
|
|
25
25
|
from synchronicity.async_wrap import asynccontextmanager
|
26
26
|
|
27
27
|
import modal_proto.api_pb2
|
28
|
+
from modal._runtime import gpu_memory_snapshot
|
28
29
|
from modal._serialization import deserialize, serialize, serialize_data_format
|
29
30
|
from modal._traceback import extract_traceback, print_exception
|
30
31
|
from modal._utils.async_utils import TaskContext, asyncify, synchronize_api, synchronizer
|
@@ -877,6 +878,17 @@ class _ContainerIOManager:
|
|
877
878
|
if value != "":
|
878
879
|
config.override_locally(key, value)
|
879
880
|
|
881
|
+
# Restore GPU memory.
|
882
|
+
if self.function_def._experimental_enable_gpu_snapshot and self.function_def.resources.gpu_config.gpu_type:
|
883
|
+
logger.debug("GPU memory snapshot enabled. Attempting to restore GPU memory.")
|
884
|
+
gpu_process_state = gpu_memory_snapshot.get_state()
|
885
|
+
if gpu_process_state != gpu_memory_snapshot.CudaCheckpointState.CHECKPOINTED:
|
886
|
+
raise ValueError(
|
887
|
+
"Cannot restore GPU state if GPU isn't in a 'checkpointed' state. "
|
888
|
+
f"Current GPU state: {gpu_process_state}"
|
889
|
+
)
|
890
|
+
gpu_memory_snapshot.toggle()
|
891
|
+
|
880
892
|
# Restore input to default state.
|
881
893
|
self.current_input_id = None
|
882
894
|
self.current_inputs = {}
|
@@ -892,6 +904,18 @@ class _ContainerIOManager:
|
|
892
904
|
|
893
905
|
# Pause heartbeats since they keep the client connection open which causes the snapshotter to crash
|
894
906
|
async with self.heartbeat_condition:
|
907
|
+
# Snapshot GPU memory.
|
908
|
+
if self.function_def._experimental_enable_gpu_snapshot and self.function_def.resources.gpu_config.gpu_type:
|
909
|
+
logger.debug("GPU memory snapshot enabled. Attempting to snapshot GPU memory.")
|
910
|
+
gpu_process_state = gpu_memory_snapshot.get_state()
|
911
|
+
if gpu_process_state != gpu_memory_snapshot.CudaCheckpointState.RUNNING:
|
912
|
+
raise ValueError(
|
913
|
+
"Cannot snapshot GPU state if it isn't running. " f"Current GPU state: {gpu_process_state}"
|
914
|
+
)
|
915
|
+
|
916
|
+
gpu_memory_snapshot.toggle()
|
917
|
+
gpu_memory_snapshot.wait_for_state(gpu_memory_snapshot.CudaCheckpointState.CHECKPOINTED)
|
918
|
+
|
895
919
|
# Notify the heartbeat loop that the snapshot phase has begun in order to
|
896
920
|
# prevent it from sending heartbeat RPCs
|
897
921
|
self._waiting_for_memory_snapshot = True
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# Copyright Modal Labs 2022
|
2
|
+
#
|
3
|
+
# This module provides a simple interface for creating GPU memory snapshots,
|
4
|
+
# providing a convenient interface to `cuda-checkpoint` [1]. This is intended
|
5
|
+
# to be used in conjunction with memory snapshots.
|
6
|
+
#
|
7
|
+
# [1] https://github.com/NVIDIA/cuda-checkpoint
|
8
|
+
|
9
|
+
import os
|
10
|
+
import subprocess
|
11
|
+
import time
|
12
|
+
from enum import Enum
|
13
|
+
|
14
|
+
from modal.config import config, logger
|
15
|
+
|
16
|
+
# Path of the `cuda-checkpoint` executable, read once from the Modal config
# (setting "cuda_checkpoint_path") when this module is imported. It is used as
# the program name for every subprocess call in this module.
CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
|
17
|
+
|
18
|
+
|
19
|
+
class CudaCheckpointState(Enum):
    """Checkpoint states of a CUDA process, mirroring the CUDA driver API.

    Each member's value is the lowercase state name; `get_state()` builds a
    member directly from the lowercased output of `cuda-checkpoint --get-state`.

    Reference:
    https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc96cdda177a2b8c296144567cbea4f23
    """

    RUNNING = "running"
    LOCKED = "locked"
    CHECKPOINTED = "checkpointed"
    FAILED = "failed"
|
26
|
+
|
27
|
+
|
28
|
+
class CudaCheckpointException(Exception):
    """Raised when a `cuda-checkpoint` operation fails, reports a failed state, or times out."""
|
30
|
+
|
31
|
+
|
32
|
+
def toggle():
    """Toggle the CUDA checkpoint state of the current process.

    Runs `cuda-checkpoint --toggle` against this process's PID. Depending on
    the state the process is in when called, this moves GPU memory to the CPU
    or back.

    Raises:
        CudaCheckpointException: If the `cuda-checkpoint` command exits with a
            non-zero status. The message is the command's captured stderr and
            the original `CalledProcessError` is attached as the cause.
    """
    pid = get_own_pid()
    logger.debug(f"Toggling CUDA checkpoint state for PID {pid}")

    # Value handed to `cuda-checkpoint --timeout`, expressed in milliseconds.
    cuda_checkpoint_lock_timeout_ms = 5 * 1000

    try:
        subprocess.run(
            [
                CUDA_CHECKPOINT_PATH,
                "--toggle",
                "--pid",
                str(pid),
                "--timeout",
                str(cuda_checkpoint_lock_timeout_ms),
            ],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        logger.debug(f"Failed to toggle CUDA checkpoint state: {e.stderr}")
        # Chain explicitly so the failing command line and exit status remain
        # visible as the direct cause in the traceback.
        raise CudaCheckpointException(e.stderr) from e

    logger.debug("Successfully toggled CUDA checkpoint state")
|
58
|
+
|
59
|
+
|
60
|
+
def get_state() -> CudaCheckpointState:
    """Return the current CUDA checkpoint state of this process.

    Runs `cuda-checkpoint --get-state` against this process's PID and converts
    the stripped, lowercased stdout into a `CudaCheckpointState` member.

    Returns:
        The `CudaCheckpointState` whose value matches the command output.

    Raises:
        CudaCheckpointException: If the `cuda-checkpoint` command exits with a
            non-zero status. The message is the command's captured stderr and
            the original `CalledProcessError` is attached as the cause.
        ValueError: If the command output is not one of the known state names.
    """
    pid = get_own_pid()

    try:
        result = subprocess.run(
            [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        logger.debug(f"Failed to get CUDA checkpoint state: {e.stderr}")
        # Chain explicitly so the failing command line and exit status remain
        # visible as the direct cause in the traceback.
        raise CudaCheckpointException(e.stderr) from e

    # Parse output to get state
    state_str = result.stdout.strip().lower()
    logger.debug(f"Raw state output: {state_str}")
    return CudaCheckpointState(state_str)
|
77
|
+
|
78
|
+
|
79
|
+
def wait_for_state(target_state: CudaCheckpointState, timeout_secs: float = 5.0):
    """Block until this process's CUDA checkpoint state equals `target_state`.

    Polls `get_state()` roughly every 0.1 seconds and returns as soon as the
    target state is observed.

    Args:
        target_state: The state to wait for.
        timeout_secs: Maximum time to keep polling, in seconds, measured with
            a monotonic clock from the moment this function is called.

    Raises:
        CudaCheckpointException: If the process is observed in the `FAILED`
            state (and `FAILED` is not the target), or if `timeout_secs`
            elapses before the target state is reached. Errors raised by
            `get_state()` propagate unchanged.
    """
    logger.debug(f"Waiting for CUDA checkpoint state {target_state.value}")
    start_time = time.monotonic()

    while True:
        current_state = get_state()

        if current_state == target_state:
            logger.debug(f"Target state {target_state.value} reached")
            # Stop polling here. Without an exit the loop would keep spinning
            # until the timeout and raise even though the wait succeeded.
            return

        if current_state == CudaCheckpointState.FAILED:
            raise CudaCheckpointException(f"CUDA process state is {current_state}")

        elapsed = time.monotonic() - start_time
        if elapsed >= timeout_secs:
            raise CudaCheckpointException(f"Timeout after {elapsed:.2f}s waiting for state {target_state.value}")

        time.sleep(0.1)
|
98
|
+
|
99
|
+
|
100
|
+
def get_own_pid():
    """Return the process ID (PID) of the running Python process.

    Relies solely on the standard library (`os.getpid`).
    """
    own_pid = os.getpid()
    return own_pid
|
modal/client.pyi
CHANGED
@@ -27,7 +27,7 @@ class _Client:
|
|
27
27
|
_snapshotted: bool
|
28
28
|
|
29
29
|
def __init__(
|
30
|
-
self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.
|
30
|
+
self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.9"
|
31
31
|
): ...
|
32
32
|
def is_closed(self) -> bool: ...
|
33
33
|
@property
|
@@ -85,7 +85,7 @@ class Client:
|
|
85
85
|
_snapshotted: bool
|
86
86
|
|
87
87
|
def __init__(
|
88
|
-
self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.
|
88
|
+
self, server_url: str, client_type: int, credentials: typing.Optional[tuple[str, str]], version: str = "0.73.9"
|
89
89
|
): ...
|
90
90
|
def is_closed(self) -> bool: ...
|
91
91
|
@property
|
modal/config.py
CHANGED
@@ -223,6 +223,7 @@ _SETTINGS = {
|
|
223
223
|
"strict_parameters": _Setting(False, transform=_to_boolean), # For internal/experimental use
|
224
224
|
"snapshot_debug": _Setting(False, transform=_to_boolean),
|
225
225
|
"client_retries": _Setting(False, transform=_to_boolean), # For internal testing.
|
226
|
+
"cuda_checkpoint_path": _Setting("/__modal/.bin/cuda-checkpoint"), # Used for snapshotting GPU memory.
|
226
227
|
}
|
227
228
|
|
228
229
|
|
modal/functions.pyi
CHANGED
@@ -200,11 +200,11 @@ class Function(
|
|
200
200
|
|
201
201
|
_call_generator_nowait: ___call_generator_nowait_spec[typing_extensions.Self]
|
202
202
|
|
203
|
-
class __remote_spec(typing_extensions.Protocol[
|
203
|
+
class __remote_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
|
204
204
|
def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER: ...
|
205
205
|
async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER: ...
|
206
206
|
|
207
|
-
remote: __remote_spec[modal._functions.
|
207
|
+
remote: __remote_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
|
208
208
|
|
209
209
|
class __remote_gen_spec(typing_extensions.Protocol[SUPERSELF]):
|
210
210
|
def __call__(self, *args, **kwargs) -> typing.Generator[typing.Any, None, None]: ...
|
@@ -219,19 +219,19 @@ class Function(
|
|
219
219
|
self, *args: modal._functions.P.args, **kwargs: modal._functions.P.kwargs
|
220
220
|
) -> modal._functions.OriginalReturnType: ...
|
221
221
|
|
222
|
-
class ___experimental_spawn_spec(typing_extensions.Protocol[
|
222
|
+
class ___experimental_spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
|
223
223
|
def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
|
224
224
|
async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
|
225
225
|
|
226
226
|
_experimental_spawn: ___experimental_spawn_spec[
|
227
|
-
modal._functions.
|
227
|
+
modal._functions.P, modal._functions.ReturnType, typing_extensions.Self
|
228
228
|
]
|
229
229
|
|
230
|
-
class __spawn_spec(typing_extensions.Protocol[
|
230
|
+
class __spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
|
231
231
|
def __call__(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
|
232
232
|
async def aio(self, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
|
233
233
|
|
234
|
-
spawn: __spawn_spec[modal._functions.
|
234
|
+
spawn: __spawn_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
|
235
235
|
|
236
236
|
def get_raw_f(self) -> collections.abc.Callable[..., typing.Any]: ...
|
237
237
|
|
@@ -21,12 +21,12 @@ modal/app.py,sha256=wRygVSrWH8iIqhDAAl2Ww_RAkz8MCJZ0Jt9qYZCF6SA,44626
|
|
21
21
|
modal/app.pyi,sha256=lxiuWzE_OLb3WHg-H7Pek9DGBuCUzZ55P594VhJL5LA,26113
|
22
22
|
modal/call_graph.py,sha256=1g2DGcMIJvRy-xKicuf63IVE98gJSnQsr8R_NVMptNc,2581
|
23
23
|
modal/client.py,sha256=8SQawr7P1PNUCq1UmJMUQXG2jIo4Nmdcs311XqrNLRE,15276
|
24
|
-
modal/client.pyi,sha256=
|
24
|
+
modal/client.pyi,sha256=UOfACeLizRMYzf2NDFzalr9sLhXAy447M3jSwPxPCbc,7591
|
25
25
|
modal/cloud_bucket_mount.py,sha256=YOe9nnvSr4ZbeCn587d7_VhE9IioZYRvF9VYQTQux08,5914
|
26
26
|
modal/cloud_bucket_mount.pyi,sha256=30T3K1a89l6wzmEJ_J9iWv9SknoGqaZDx59Xs-ZQcmk,1607
|
27
27
|
modal/cls.py,sha256=kNnZrBYVXOhgEXU0rDWk2Hr-bQRrsZkMKDgC-TD_6Bs,31063
|
28
28
|
modal/cls.pyi,sha256=gb6QNwfX3HSJfcZXPY36N9ywF7aBJTwwtoARnf3G1HQ,8877
|
29
|
-
modal/config.py,sha256=
|
29
|
+
modal/config.py,sha256=XT1W4Y9PVkbYMAXjJRshvQEPDhZmnfW_ZRMwl8XKoqA,11149
|
30
30
|
modal/container_process.py,sha256=WTqLn01dJPVkPpwR_0w_JH96ceN5mV4TGtiu1ZR2RRA,6108
|
31
31
|
modal/container_process.pyi,sha256=Hf0J5JyDdCCXBJSKx6gvkPOo0XrztCm78xzxamtzUjQ,2828
|
32
32
|
modal/dict.py,sha256=vc5lQVqzeDUCb4fRjnOlqYK2GmBb0fIhZmvB0xIBG0U,12921
|
@@ -40,7 +40,7 @@ modal/file_io.py,sha256=lcMs_E9Xfm0YX1t9U2wNIBPnqHRxmImqjLW1GHqVmyg,20945
|
|
40
40
|
modal/file_io.pyi,sha256=NTRft1tbPSWf9TlWVeZmTlgB5AZ_Zhu2srWIrWr7brk,9445
|
41
41
|
modal/file_pattern_matcher.py,sha256=1cZ4V2wSLiaXqAqStETSwp3bzDD6QZOt6pmmjk3Okz4,6505
|
42
42
|
modal/functions.py,sha256=kcNHvqeGBxPI7Cgd57NIBBghkfbeFJzXO44WW0jSmao,325
|
43
|
-
modal/functions.pyi,sha256=
|
43
|
+
modal/functions.pyi,sha256=QYZy3BCjA2y3UC217e3YG-omyG0E1Jx-Uc-sonsyQsE,14288
|
44
44
|
modal/gpu.py,sha256=2qZMNnoMrjU-5Bu7fx68pANUAKTtZq0EWEEeBA9OUVQ,7426
|
45
45
|
modal/image.py,sha256=Vjsi7wS9dEcoj-7m7_LmvbK5iqEuFz-SHKl2K-qWcew,90952
|
46
46
|
modal/image.pyi,sha256=A5mW2dBguEhmRo815Ax1rBIMXTCriu7PqLMHoUPsez8,26372
|
@@ -83,8 +83,9 @@ modal/volume.py,sha256=JAWeDvoAG95tMBv-fYIERyHsJPS_X_xGpxRRmYtb6j0,30096
|
|
83
83
|
modal/volume.pyi,sha256=kTsXarphjZILXci84LQy7EyC84eXUs5-7D62IM5q3eE,12491
|
84
84
|
modal/_runtime/__init__.py,sha256=MIEP8jhXUeGq_eCjYFcqN5b1bxBM4fdk0VESpjWR0fc,28
|
85
85
|
modal/_runtime/asgi.py,sha256=c4hmaMW1pLo-cm7ouriJjieuFm4ZF6D2LMy0638sfOs,22139
|
86
|
-
modal/_runtime/container_io_manager.py,sha256=
|
86
|
+
modal/_runtime/container_io_manager.py,sha256=L6qv-Mo3mN3ttR5GX-G36cUhH_oz8wdP5WG0HT5FFzg,44619
|
87
87
|
modal/_runtime/execution_context.py,sha256=E6ofm6j1POXGPxS841X3V7JU6NheVb8OkQc7JpLq4Kg,2712
|
88
|
+
modal/_runtime/gpu_memory_snapshot.py,sha256=vV6igsqN9CxOoH91kUkuaZQ32QfX5wdoXIS-6MIYX2Y,3315
|
88
89
|
modal/_runtime/telemetry.py,sha256=T1RoAGyjBDr1swiM6pPsGRSITm7LI5FDK18oNXxY08U,5163
|
89
90
|
modal/_runtime/user_code_imports.py,sha256=zl_Mq9dsrVF62x3w-iNK1YAhZKYAXeFaGpd4G7AySTc,14746
|
90
91
|
modal/_utils/__init__.py,sha256=waLjl5c6IPDhSsdWAm9Bji4e2PVxamYABKAze6CHVXY,28
|
@@ -170,10 +171,10 @@ modal_proto/options_pb2_grpc.pyi,sha256=CImmhxHsYnF09iENPoe8S4J-n93jtgUYD2JPAc0y
|
|
170
171
|
modal_proto/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
171
172
|
modal_version/__init__.py,sha256=wiJQ53c-OMs0Xf1UeXOxQ7FwlV1VzIjnX6o-pRYZ_Pk,470
|
172
173
|
modal_version/__main__.py,sha256=2FO0yYQQwDTh6udt1h-cBnGd1c4ZyHnHSI4BksxzVac,105
|
173
|
-
modal_version/_version_generated.py,sha256=
|
174
|
-
modal-0.73.
|
175
|
-
modal-0.73.
|
176
|
-
modal-0.73.
|
177
|
-
modal-0.73.
|
178
|
-
modal-0.73.
|
179
|
-
modal-0.73.
|
174
|
+
modal_version/_version_generated.py,sha256=RgDn253uA_DeV6d6P40AOPpDJqMMVupp6gXgLMcak24,148
|
175
|
+
modal-0.73.9.dist-info/LICENSE,sha256=psuoW8kuDP96RQsdhzwOqi6fyWv0ct8CR6Jr7He_P_k,10173
|
176
|
+
modal-0.73.9.dist-info/METADATA,sha256=BYeidDOAvKaxgE_yMfAun3xjsRB6Q0jGKR8fcs9a0jU,2329
|
177
|
+
modal-0.73.9.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
|
178
|
+
modal-0.73.9.dist-info/entry_points.txt,sha256=An-wYgeEUnm6xzrAP9_NTSTSciYvvEWsMZILtYrvpAI,46
|
179
|
+
modal-0.73.9.dist-info/top_level.txt,sha256=1nvYbOSIKcmU50fNrpnQnrrOpj269ei3LzgB6j9xGqg,64
|
180
|
+
modal-0.73.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|