modal 1.1.1.dev41__tar.gz → 1.1.1.dev44__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of modal might be problematic. Click here for more details.

Files changed (189)
  1. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/PKG-INFO +1 -1
  2. modal-1.1.1.dev44/modal/_runtime/gpu_memory_snapshot.py +303 -0
  3. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/client.pyi +2 -2
  4. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/functions.pyi +6 -6
  5. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/image.py +8 -2
  6. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal.egg-info/PKG-INFO +1 -1
  7. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/api.proto +1 -0
  8. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/api_pb2.py +174 -174
  9. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/api_pb2.pyi +6 -2
  10. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_version/__init__.py +1 -1
  11. modal-1.1.1.dev41/modal/_runtime/gpu_memory_snapshot.py +0 -199
  12. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/LICENSE +0 -0
  13. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/README.md +0 -0
  14. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/__init__.py +0 -0
  15. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/__main__.py +0 -0
  16. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_clustered_functions.py +0 -0
  17. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_clustered_functions.pyi +0 -0
  18. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_container_entrypoint.py +0 -0
  19. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_functions.py +0 -0
  20. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_ipython.py +0 -0
  21. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_location.py +0 -0
  22. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_object.py +0 -0
  23. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_output.py +0 -0
  24. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_partial_function.py +0 -0
  25. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_pty.py +0 -0
  26. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_resolver.py +0 -0
  27. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_resources.py +0 -0
  28. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_runtime/__init__.py +0 -0
  29. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_runtime/asgi.py +0 -0
  30. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_runtime/container_io_manager.py +0 -0
  31. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_runtime/container_io_manager.pyi +0 -0
  32. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_runtime/execution_context.py +0 -0
  33. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_runtime/execution_context.pyi +0 -0
  34. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_runtime/telemetry.py +0 -0
  35. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_runtime/user_code_imports.py +0 -0
  36. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_serialization.py +0 -0
  37. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_traceback.py +0 -0
  38. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_tunnel.py +0 -0
  39. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_tunnel.pyi +0 -0
  40. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_type_manager.py +0 -0
  41. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/__init__.py +0 -0
  42. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/app_utils.py +0 -0
  43. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/async_utils.py +0 -0
  44. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/auth_token_manager.py +0 -0
  45. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/blob_utils.py +0 -0
  46. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/bytes_io_segment_payload.py +0 -0
  47. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/deprecation.py +0 -0
  48. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/docker_utils.py +0 -0
  49. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/function_utils.py +0 -0
  50. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/git_utils.py +0 -0
  51. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/grpc_testing.py +0 -0
  52. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/grpc_utils.py +0 -0
  53. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/hash_utils.py +0 -0
  54. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/http_utils.py +0 -0
  55. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/jwt_utils.py +0 -0
  56. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/logger.py +0 -0
  57. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/mount_utils.py +0 -0
  58. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/name_utils.py +0 -0
  59. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/package_utils.py +0 -0
  60. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/pattern_utils.py +0 -0
  61. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/rand_pb_testing.py +0 -0
  62. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/shell_utils.py +0 -0
  63. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_utils/time_utils.py +0 -0
  64. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_vendor/__init__.py +0 -0
  65. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_vendor/a2wsgi_wsgi.py +0 -0
  66. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_vendor/cloudpickle.py +0 -0
  67. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_vendor/tblib.py +0 -0
  68. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/_watcher.py +0 -0
  69. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/app.py +0 -0
  70. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/app.pyi +0 -0
  71. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/builder/2023.12.312.txt +0 -0
  72. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/builder/2023.12.txt +0 -0
  73. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/builder/2024.04.txt +0 -0
  74. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/builder/2024.10.txt +0 -0
  75. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/builder/2025.06.txt +0 -0
  76. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/builder/PREVIEW.txt +0 -0
  77. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/builder/README.md +0 -0
  78. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/builder/base-images.json +0 -0
  79. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/call_graph.py +0 -0
  80. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/__init__.py +0 -0
  81. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/_download.py +0 -0
  82. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/_traceback.py +0 -0
  83. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/app.py +0 -0
  84. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/cluster.py +0 -0
  85. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/config.py +0 -0
  86. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/container.py +0 -0
  87. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/dict.py +0 -0
  88. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/entry_point.py +0 -0
  89. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/environment.py +0 -0
  90. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/import_refs.py +0 -0
  91. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/launch.py +0 -0
  92. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/network_file_system.py +0 -0
  93. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/profile.py +0 -0
  94. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/programs/__init__.py +0 -0
  95. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/programs/run_jupyter.py +0 -0
  96. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/programs/vscode.py +0 -0
  97. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/queues.py +0 -0
  98. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/run.py +0 -0
  99. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/secret.py +0 -0
  100. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/token.py +0 -0
  101. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/utils.py +0 -0
  102. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cli/volume.py +0 -0
  103. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/client.py +0 -0
  104. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cloud_bucket_mount.py +0 -0
  105. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cloud_bucket_mount.pyi +0 -0
  106. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cls.py +0 -0
  107. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/cls.pyi +0 -0
  108. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/config.py +0 -0
  109. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/container_process.py +0 -0
  110. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/container_process.pyi +0 -0
  111. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/dict.py +0 -0
  112. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/dict.pyi +0 -0
  113. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/environments.py +0 -0
  114. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/environments.pyi +0 -0
  115. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/exception.py +0 -0
  116. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/experimental/__init__.py +0 -0
  117. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/experimental/flash.py +0 -0
  118. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/experimental/flash.pyi +0 -0
  119. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/experimental/ipython.py +0 -0
  120. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/file_io.py +0 -0
  121. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/file_io.pyi +0 -0
  122. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/file_pattern_matcher.py +0 -0
  123. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/functions.py +0 -0
  124. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/gpu.py +0 -0
  125. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/image.pyi +0 -0
  126. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/io_streams.py +0 -0
  127. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/io_streams.pyi +0 -0
  128. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/mount.py +0 -0
  129. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/mount.pyi +0 -0
  130. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/network_file_system.py +0 -0
  131. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/network_file_system.pyi +0 -0
  132. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/object.py +0 -0
  133. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/object.pyi +0 -0
  134. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/output.py +0 -0
  135. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/parallel_map.py +0 -0
  136. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/parallel_map.pyi +0 -0
  137. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/partial_function.py +0 -0
  138. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/partial_function.pyi +0 -0
  139. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/proxy.py +0 -0
  140. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/proxy.pyi +0 -0
  141. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/py.typed +0 -0
  142. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/queue.py +0 -0
  143. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/queue.pyi +0 -0
  144. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/retries.py +0 -0
  145. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/runner.py +0 -0
  146. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/runner.pyi +0 -0
  147. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/running_app.py +0 -0
  148. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/sandbox.py +0 -0
  149. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/sandbox.pyi +0 -0
  150. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/schedule.py +0 -0
  151. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/scheduler_placement.py +0 -0
  152. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/secret.py +0 -0
  153. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/secret.pyi +0 -0
  154. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/serving.py +0 -0
  155. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/serving.pyi +0 -0
  156. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/snapshot.py +0 -0
  157. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/snapshot.pyi +0 -0
  158. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/stream_type.py +0 -0
  159. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/token_flow.py +0 -0
  160. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/token_flow.pyi +0 -0
  161. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/volume.py +0 -0
  162. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal/volume.pyi +0 -0
  163. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal.egg-info/SOURCES.txt +0 -0
  164. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal.egg-info/dependency_links.txt +0 -0
  165. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal.egg-info/entry_points.txt +0 -0
  166. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal.egg-info/requires.txt +0 -0
  167. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal.egg-info/top_level.txt +0 -0
  168. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_docs/__init__.py +0 -0
  169. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_docs/gen_cli_docs.py +0 -0
  170. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_docs/gen_reference_docs.py +0 -0
  171. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_docs/mdmd/__init__.py +0 -0
  172. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_docs/mdmd/mdmd.py +0 -0
  173. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_docs/mdmd/signatures.py +0 -0
  174. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/__init__.py +0 -0
  175. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/api_grpc.py +0 -0
  176. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/api_pb2_grpc.py +0 -0
  177. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/api_pb2_grpc.pyi +0 -0
  178. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/modal_api_grpc.py +0 -0
  179. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/modal_options_grpc.py +0 -0
  180. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/options.proto +0 -0
  181. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/options_grpc.py +0 -0
  182. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/options_pb2.py +0 -0
  183. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/options_pb2.pyi +0 -0
  184. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/options_pb2_grpc.py +0 -0
  185. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/options_pb2_grpc.pyi +0 -0
  186. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_proto/py.typed +0 -0
  187. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/modal_version/__main__.py +0 -0
  188. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/pyproject.toml +0 -0
  189. {modal-1.1.1.dev41 → modal-1.1.1.dev44}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: modal
3
- Version: 1.1.1.dev41
3
+ Version: 1.1.1.dev44
4
4
  Summary: Python client library for Modal
5
5
  Author-email: Modal Labs <support@modal.com>
6
6
  License: Apache-2.0
@@ -0,0 +1,303 @@
1
+ # Copyright Modal Labs 2022
2
+ #
3
+ # This module provides a simple interface for creating GPU memory snapshots,
4
+ # providing a convenient interface to `cuda-checkpoint` [1]. This is intended
5
+ # to be used in conjunction with memory snapshots.
6
+ #
7
+ # [1] https://github.com/NVIDIA/cuda-checkpoint
8
+
9
+ import subprocess
10
+ import time
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+ from pathlib import Path
15
+ from typing import List, Optional
16
+
17
+ from modal.config import config, logger
18
+
19
+ CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
20
+
21
+
22
+ class CudaCheckpointState(Enum):
23
+ """State representation from the CUDA API [1].
24
+
25
+ [1] https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html"""
26
+
27
+ RUNNING = "running"
28
+ LOCKED = "locked"
29
+ CHECKPOINTED = "checkpointed"
30
+ FAILED = "failed"
31
+
32
+
33
+ class CudaCheckpointException(Exception):
34
+ """Exception raised for CUDA checkpoint operations."""
35
+
36
+ pass
37
+
38
+
39
+ @dataclass
40
+ class CudaCheckpointProcess:
41
+ """Contains a reference to a PID with active CUDA session. This also provides
42
+ methods for checkpointing and restoring GPU memory."""
43
+
44
+ pid: int
45
+ state: CudaCheckpointState
46
+
47
+ def toggle(self, target_state: CudaCheckpointState, timeout_secs: float = 5 * 60.0) -> None:
48
+ """Toggle CUDA checkpoint state for current process, moving GPU memory to the
49
+ CPU and back depending on the current process state when called.
50
+ """
51
+ logger.debug(f"PID: {self.pid} Toggling CUDA checkpoint state to {target_state.value}")
52
+
53
+ start_time = time.monotonic()
54
+ retry_count = 0
55
+ max_retries = 3
56
+
57
+ while self._should_continue_toggle(target_state, start_time, timeout_secs):
58
+ try:
59
+ self._execute_toggle_command()
60
+ # Use exponential backoff for retries
61
+ sleep_time = min(0.1 * (2**retry_count), 1.0)
62
+ time.sleep(sleep_time)
63
+ retry_count = 0
64
+ except CudaCheckpointException as e:
65
+ retry_count += 1
66
+ if retry_count >= max_retries:
67
+ raise CudaCheckpointException(
68
+ f"PID: {self.pid} Failed to toggle state after {max_retries} retries: {e}"
69
+ )
70
+ logger.debug(f"PID: {self.pid} Retry {retry_count}/{max_retries} after error: {e}")
71
+ time.sleep(0.5 * retry_count)
72
+
73
+ logger.debug(f"PID: {self.pid} Target state {target_state.value} reached")
74
+
75
+ def _should_continue_toggle(
76
+ self, target_state: CudaCheckpointState, start_time: float, timeout_secs: float
77
+ ) -> bool:
78
+ """Check if toggle operation should continue based on current state and timeout."""
79
+ self.refresh_state()
80
+
81
+ if self.state == target_state:
82
+ return False
83
+
84
+ if self.state == CudaCheckpointState.FAILED:
85
+ raise CudaCheckpointException(f"PID: {self.pid} CUDA process state is {self.state}")
86
+
87
+ elapsed = time.monotonic() - start_time
88
+ if elapsed >= timeout_secs:
89
+ raise CudaCheckpointException(
90
+ f"PID: {self.pid} Timeout after {elapsed:.2f}s waiting for state {target_state.value}. "
91
+ f"Current state: {self.state}"
92
+ )
93
+
94
+ return True
95
+
96
+ def _execute_toggle_command(self) -> None:
97
+ """Execute the cuda-checkpoint toggle command."""
98
+ try:
99
+ _ = subprocess.run(
100
+ [CUDA_CHECKPOINT_PATH, "--toggle", "--pid", str(self.pid)],
101
+ check=True,
102
+ capture_output=True,
103
+ text=True,
104
+ timeout=30,
105
+ )
106
+ logger.debug(f"PID: {self.pid} Successfully toggled CUDA checkpoint state")
107
+ except subprocess.CalledProcessError as e:
108
+ error_msg = f"PID: {self.pid} Failed to toggle CUDA checkpoint state: {e.stderr}"
109
+ logger.debug(error_msg)
110
+ raise CudaCheckpointException(error_msg)
111
+ except subprocess.TimeoutExpired:
112
+ error_msg = f"PID: {self.pid} Toggle command timed out"
113
+ logger.debug(error_msg)
114
+ raise CudaCheckpointException(error_msg)
115
+
116
+ def refresh_state(self) -> None:
117
+ """Refreshes the current CUDA checkpoint state for this process."""
118
+ try:
119
+ result = subprocess.run(
120
+ [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(self.pid)],
121
+ check=True,
122
+ capture_output=True,
123
+ text=True,
124
+ timeout=10,
125
+ )
126
+
127
+ state_str = result.stdout.strip().lower()
128
+ self.state = CudaCheckpointState(state_str)
129
+
130
+ except subprocess.CalledProcessError as e:
131
+ error_msg = f"PID: {self.pid} Failed to get CUDA checkpoint state: {e.stderr}"
132
+ logger.debug(error_msg)
133
+ raise CudaCheckpointException(error_msg)
134
+ except subprocess.TimeoutExpired:
135
+ error_msg = f"PID: {self.pid} Get state command timed out"
136
+ logger.debug(error_msg)
137
+ raise CudaCheckpointException(error_msg)
138
+
139
+
140
+ class CudaCheckpointSession:
141
+ """Manages the checkpointing state of processes with active CUDA sessions."""
142
+
143
+ def __init__(self):
144
+ self.cuda_processes = self._get_cuda_pids()
145
+ if self.cuda_processes:
146
+ logger.debug(
147
+ f"Found {len(self.cuda_processes)} PID(s) with CUDA sessions: {[c.pid for c in self.cuda_processes]}"
148
+ )
149
+ else:
150
+ logger.debug("No CUDA sessions found.")
151
+
152
+ def _get_cuda_pids(self) -> List[CudaCheckpointProcess]:
153
+ """Iterates over all PIDs and identifies the ones that have running
154
+ CUDA sessions."""
155
+ cuda_pids: List[CudaCheckpointProcess] = []
156
+
157
+ # Get all active process IDs from /proc directory
158
+ proc_dir = Path("/proc")
159
+ if not proc_dir.exists():
160
+ raise CudaCheckpointException(
161
+ "OS does not have /proc path rendering it incompatible with GPU memory snapshots."
162
+ )
163
+
164
+ # Get all numeric directories (PIDs) from /proc
165
+ pid_dirs = [entry for entry in proc_dir.iterdir() if entry.name.isdigit()]
166
+
167
+ # Use ThreadPoolExecutor to check PIDs in parallel for better performance
168
+ with ThreadPoolExecutor(max_workers=min(50, len(pid_dirs))) as executor:
169
+ future_to_pid = {
170
+ executor.submit(self._check_cuda_session, int(entry.name)): int(entry.name) for entry in pid_dirs
171
+ }
172
+
173
+ for future in as_completed(future_to_pid):
174
+ pid = future_to_pid[future]
175
+ try:
176
+ cuda_process = future.result()
177
+ if cuda_process:
178
+ cuda_pids.append(cuda_process)
179
+ except Exception as e:
180
+ logger.debug(f"Error checking PID {pid}: {e}")
181
+
182
+ # Sort PIDs for ordered checkpointing
183
+ cuda_pids.sort(key=lambda x: x.pid)
184
+ return cuda_pids
185
+
186
+ def _check_cuda_session(self, pid: int) -> Optional[CudaCheckpointProcess]:
187
+ """Check if a specific PID has a CUDA session."""
188
+ try:
189
+ result = subprocess.run(
190
+ [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
191
+ capture_output=True,
192
+ text=True,
193
+ timeout=5,
194
+ )
195
+
196
+ # If the command succeeds (return code 0), this PID has a CUDA session
197
+ if result.returncode == 0:
198
+ state_str = result.stdout.strip().lower()
199
+ state = CudaCheckpointState(state_str)
200
+ return CudaCheckpointProcess(pid=pid, state=state)
201
+
202
+ except subprocess.CalledProcessError:
203
+ # Command failed, which is expected for PIDs without CUDA sessions
204
+ pass
205
+ except subprocess.TimeoutExpired:
206
+ logger.debug(f"Timeout checking CUDA state for PID {pid}")
207
+ except Exception as e:
208
+ logger.debug(f"Error checking PID {pid}: {e}")
209
+
210
+ return None
211
+
212
+ def checkpoint(self) -> None:
213
+ """Checkpoint all CUDA processes, moving GPU memory to CPU."""
214
+ if not self.cuda_processes:
215
+ logger.debug("No CUDA processes to checkpoint.")
216
+ return
217
+
218
+ # Validate all states first
219
+ for proc in self.cuda_processes:
220
+ proc.refresh_state() # Refresh state before validation
221
+ if proc.state != CudaCheckpointState.RUNNING:
222
+ raise CudaCheckpointException(
223
+ f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.RUNNING.value} state. "
224
+ f"Current state: {proc.state.value}"
225
+ )
226
+
227
+ # Moving state from GPU to CPU can take several seconds per CUDA session.
228
+ # Make a parallel call per CUDA session.
229
+ start = time.perf_counter()
230
+
231
+ def checkpoint_impl(proc: CudaCheckpointProcess) -> None:
232
+ proc.toggle(CudaCheckpointState.CHECKPOINTED)
233
+
234
+ with ThreadPoolExecutor() as executor:
235
+ futures = [executor.submit(checkpoint_impl, proc) for proc in self.cuda_processes]
236
+
237
+ # Wait for all futures and collect any exceptions
238
+ exceptions = []
239
+ for future in as_completed(futures):
240
+ try:
241
+ future.result()
242
+ except Exception as e:
243
+ exceptions.append(e)
244
+
245
+ if exceptions:
246
+ raise CudaCheckpointException(
247
+ f"Failed to checkpoint {len(exceptions)} processes: {'; '.join(str(e) for e in exceptions)}"
248
+ )
249
+
250
+ elapsed = time.perf_counter() - start
251
+ logger.debug(f"Checkpointing {len(self.cuda_processes)} CUDA sessions took => {elapsed:.3f}s")
252
+
253
+ def restore(self) -> None:
254
+ """Restore all CUDA processes, moving memory back from CPU to GPU."""
255
+ if not self.cuda_processes:
256
+ logger.debug("No CUDA sessions to restore.")
257
+ return
258
+
259
+ # Validate all states first
260
+ for proc in self.cuda_processes:
261
+ proc.refresh_state() # Refresh state before validation
262
+ if proc.state != CudaCheckpointState.CHECKPOINTED:
263
+ raise CudaCheckpointException(
264
+ f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.CHECKPOINTED.value} state. "
265
+ f"Current state: {proc.state.value}"
266
+ )
267
+
268
+ # See checkpoint() for rationale about parallelism.
269
+ start = time.perf_counter()
270
+
271
+ def restore_process(proc: CudaCheckpointProcess) -> None:
272
+ proc.toggle(CudaCheckpointState.RUNNING)
273
+
274
+ with ThreadPoolExecutor() as executor:
275
+ futures = [executor.submit(restore_process, proc) for proc in self.cuda_processes]
276
+
277
+ # Wait for all futures and collect any exceptions
278
+ exceptions = []
279
+ for future in as_completed(futures):
280
+ try:
281
+ future.result()
282
+ except Exception as e:
283
+ exceptions.append(e)
284
+
285
+ if exceptions:
286
+ raise CudaCheckpointException(
287
+ f"Failed to restore {len(exceptions)} processes: {'; '.join(str(e) for e in exceptions)}"
288
+ )
289
+
290
+ elapsed = time.perf_counter() - start
291
+ logger.debug(f"Restoring {len(self.cuda_processes)} CUDA session(s) took => {elapsed:.3f}s")
292
+
293
+ def get_process_count(self) -> int:
294
+ """Get the number of CUDA processes managed by this session."""
295
+ return len(self.cuda_processes)
296
+
297
+ def get_process_states(self) -> List[tuple[int, CudaCheckpointState]]:
298
+ """Get current states of all managed processes."""
299
+ states = []
300
+ for proc in self.cuda_processes:
301
+ proc.refresh_state()
302
+ states.append((proc.pid, proc.state))
303
+ return states
@@ -33,7 +33,7 @@ class _Client:
33
33
  server_url: str,
34
34
  client_type: int,
35
35
  credentials: typing.Optional[tuple[str, str]],
36
- version: str = "1.1.1.dev41",
36
+ version: str = "1.1.1.dev44",
37
37
  ):
38
38
  """mdmd:hidden
39
39
  The Modal client object is not intended to be instantiated directly by users.
@@ -164,7 +164,7 @@ class Client:
164
164
  server_url: str,
165
165
  client_type: int,
166
166
  credentials: typing.Optional[tuple[str, str]],
167
- version: str = "1.1.1.dev41",
167
+ version: str = "1.1.1.dev44",
168
168
  ):
169
169
  """mdmd:hidden
170
170
  The Modal client object is not intended to be instantiated directly by users.
@@ -427,7 +427,7 @@ class Function(
427
427
 
428
428
  _call_generator: ___call_generator_spec[typing_extensions.Self]
429
429
 
430
- class __remote_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
430
+ class __remote_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
431
431
  def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER:
432
432
  """Calls the function remotely, executing it with the given arguments and returning the execution's result."""
433
433
  ...
@@ -436,7 +436,7 @@ class Function(
436
436
  """Calls the function remotely, executing it with the given arguments and returning the execution's result."""
437
437
  ...
438
438
 
439
- remote: __remote_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
439
+ remote: __remote_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
440
440
 
441
441
  class __remote_gen_spec(typing_extensions.Protocol[SUPERSELF]):
442
442
  def __call__(self, /, *args, **kwargs) -> typing.Generator[typing.Any, None, None]:
@@ -463,7 +463,7 @@ class Function(
463
463
  """
464
464
  ...
465
465
 
466
- class ___experimental_spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
466
+ class ___experimental_spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
467
467
  def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]:
468
468
  """[Experimental] Calls the function with the given arguments, without waiting for the results.
469
469
 
@@ -487,7 +487,7 @@ class Function(
487
487
  ...
488
488
 
489
489
  _experimental_spawn: ___experimental_spawn_spec[
490
- modal._functions.P, modal._functions.ReturnType, typing_extensions.Self
490
+ modal._functions.ReturnType, modal._functions.P, typing_extensions.Self
491
491
  ]
492
492
 
493
493
  class ___spawn_map_inner_spec(typing_extensions.Protocol[P_INNER, SUPERSELF]):
@@ -496,7 +496,7 @@ class Function(
496
496
 
497
497
  _spawn_map_inner: ___spawn_map_inner_spec[modal._functions.P, typing_extensions.Self]
498
498
 
499
- class __spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
499
+ class __spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
500
500
  def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]:
501
501
  """Calls the function with the given arguments, without waiting for the results.
502
502
 
@@ -517,7 +517,7 @@ class Function(
517
517
  """
518
518
  ...
519
519
 
520
- spawn: __spawn_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
520
+ spawn: __spawn_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
521
521
 
522
522
  def get_raw_f(self) -> collections.abc.Callable[..., typing.Any]:
523
523
  """Return the inner Python object wrapped by this Modal Function."""
@@ -1393,7 +1393,13 @@ class _Image(_Object, type_prefix="im"):
1393
1393
  # a requirement in `uv.lock`
1394
1394
  return
1395
1395
 
1396
- dependencies = pyproject_toml_content["project"]["dependencies"]
1396
+ try:
1397
+ dependencies = pyproject_toml_content["project"]["dependencies"]
1398
+ except KeyError as e:
1399
+ raise InvalidError(
1400
+ f"Invalid pyproject.toml file: missing key {e} in {pyproject_toml}. "
1401
+ "See https://packaging.python.org/en/latest/guides/writing-pyproject-toml for guidelines."
1402
+ )
1397
1403
 
1398
1404
  for group in groups:
1399
1405
  if (
@@ -1459,7 +1465,7 @@ class _Image(_Object, type_prefix="im"):
1459
1465
  commands.append(f"COPY /.uv.lock {UV_ROOT}/uv.lock")
1460
1466
 
1461
1467
  if frozen:
1462
- # Do not update `uv.lock` when we have one when `frozen=True`. This it ehd efault because this
1468
+ # Do not update `uv.lock` when we have one when `frozen=True`. This is the default because this
1463
1469
  # ensures that the runtime environment matches the local `uv.lock`.
1464
1470
  #
1465
1471
  # If `frozen=False`, then `uv sync` will update the the dependencies in the `uv.lock` file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: modal
3
- Version: 1.1.1.dev41
3
+ Version: 1.1.1.dev44
4
4
  Summary: Python client library for Modal
5
5
  Author-email: Modal Labs <support@modal.com>
6
6
  License: Apache-2.0
@@ -3008,6 +3008,7 @@ message TaskInfo {
3008
3008
  string gpu_type = 6;
3009
3009
  string sandbox_id = 7;
3010
3010
  TaskSnapshotBehavior snapshot_behavior = 8;
3011
+ GPUConfig gpu_config = 9;
3011
3012
  }
3012
3013
 
3013
3014
  message TaskListRequest {