modal 1.0.3.dev10__tar.gz → 1.0.3.dev12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (185)
  1. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/PKG-INFO +1 -1
  2. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_runtime/container_io_manager.py +8 -14
  3. modal-1.0.3.dev12/modal/_runtime/gpu_memory_snapshot.py +199 -0
  4. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/client.pyi +2 -2
  5. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/functions.pyi +6 -6
  6. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal.egg-info/PKG-INFO +1 -1
  7. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_version/__init__.py +1 -1
  8. modal-1.0.3.dev10/modal/_runtime/gpu_memory_snapshot.py +0 -101
  9. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/LICENSE +0 -0
  10. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/README.md +0 -0
  11. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/__init__.py +0 -0
  12. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/__main__.py +0 -0
  13. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_clustered_functions.py +0 -0
  14. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_clustered_functions.pyi +0 -0
  15. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_container_entrypoint.py +0 -0
  16. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_functions.py +0 -0
  17. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_ipython.py +0 -0
  18. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_location.py +0 -0
  19. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_object.py +0 -0
  20. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_output.py +0 -0
  21. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_partial_function.py +0 -0
  22. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_pty.py +0 -0
  23. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_resolver.py +0 -0
  24. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_resources.py +0 -0
  25. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_runtime/__init__.py +0 -0
  26. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_runtime/asgi.py +0 -0
  27. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_runtime/container_io_manager.pyi +0 -0
  28. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_runtime/execution_context.py +0 -0
  29. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_runtime/execution_context.pyi +0 -0
  30. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_runtime/telemetry.py +0 -0
  31. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_runtime/user_code_imports.py +0 -0
  32. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_serialization.py +0 -0
  33. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_traceback.py +0 -0
  34. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_tunnel.py +0 -0
  35. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_tunnel.pyi +0 -0
  36. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_type_manager.py +0 -0
  37. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/__init__.py +0 -0
  38. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/app_utils.py +0 -0
  39. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/async_utils.py +0 -0
  40. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/blob_utils.py +0 -0
  41. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/bytes_io_segment_payload.py +0 -0
  42. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/deprecation.py +0 -0
  43. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/docker_utils.py +0 -0
  44. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/function_utils.py +0 -0
  45. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/git_utils.py +0 -0
  46. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/grpc_testing.py +0 -0
  47. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/grpc_utils.py +0 -0
  48. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/hash_utils.py +0 -0
  49. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/http_utils.py +0 -0
  50. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/jwt_utils.py +0 -0
  51. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/logger.py +0 -0
  52. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/mount_utils.py +0 -0
  53. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/name_utils.py +0 -0
  54. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/package_utils.py +0 -0
  55. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/pattern_utils.py +0 -0
  56. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/rand_pb_testing.py +0 -0
  57. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/shell_utils.py +0 -0
  58. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_utils/time_utils.py +0 -0
  59. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_vendor/__init__.py +0 -0
  60. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_vendor/a2wsgi_wsgi.py +0 -0
  61. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_vendor/cloudpickle.py +0 -0
  62. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_vendor/tblib.py +0 -0
  63. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/_watcher.py +0 -0
  64. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/app.py +0 -0
  65. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/app.pyi +0 -0
  66. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/call_graph.py +0 -0
  67. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/__init__.py +0 -0
  68. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/_download.py +0 -0
  69. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/_traceback.py +0 -0
  70. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/app.py +0 -0
  71. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/cluster.py +0 -0
  72. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/config.py +0 -0
  73. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/container.py +0 -0
  74. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/dict.py +0 -0
  75. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/entry_point.py +0 -0
  76. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/environment.py +0 -0
  77. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/import_refs.py +0 -0
  78. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/launch.py +0 -0
  79. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/network_file_system.py +0 -0
  80. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/profile.py +0 -0
  81. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/programs/__init__.py +0 -0
  82. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/programs/run_jupyter.py +0 -0
  83. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/programs/vscode.py +0 -0
  84. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/queues.py +0 -0
  85. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/run.py +0 -0
  86. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/secret.py +0 -0
  87. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/token.py +0 -0
  88. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/utils.py +0 -0
  89. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cli/volume.py +0 -0
  90. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/client.py +0 -0
  91. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cloud_bucket_mount.py +0 -0
  92. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cloud_bucket_mount.pyi +0 -0
  93. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cls.py +0 -0
  94. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/cls.pyi +0 -0
  95. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/config.py +0 -0
  96. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/container_process.py +0 -0
  97. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/container_process.pyi +0 -0
  98. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/dict.py +0 -0
  99. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/dict.pyi +0 -0
  100. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/environments.py +0 -0
  101. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/environments.pyi +0 -0
  102. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/exception.py +0 -0
  103. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/experimental/__init__.py +0 -0
  104. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/experimental/ipython.py +0 -0
  105. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/file_io.py +0 -0
  106. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/file_io.pyi +0 -0
  107. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/file_pattern_matcher.py +0 -0
  108. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/functions.py +0 -0
  109. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/gpu.py +0 -0
  110. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/image.py +0 -0
  111. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/image.pyi +0 -0
  112. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/io_streams.py +0 -0
  113. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/io_streams.pyi +0 -0
  114. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/mount.py +0 -0
  115. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/mount.pyi +0 -0
  116. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/network_file_system.py +0 -0
  117. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/network_file_system.pyi +0 -0
  118. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/object.py +0 -0
  119. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/object.pyi +0 -0
  120. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/output.py +0 -0
  121. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/parallel_map.py +0 -0
  122. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/parallel_map.pyi +0 -0
  123. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/partial_function.py +0 -0
  124. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/partial_function.pyi +0 -0
  125. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/proxy.py +0 -0
  126. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/proxy.pyi +0 -0
  127. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/py.typed +0 -0
  128. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/queue.py +0 -0
  129. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/queue.pyi +0 -0
  130. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/requirements/2023.12.312.txt +0 -0
  131. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/requirements/2023.12.txt +0 -0
  132. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/requirements/2024.04.txt +0 -0
  133. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/requirements/2024.10.txt +0 -0
  134. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/requirements/PREVIEW.txt +0 -0
  135. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/requirements/README.md +0 -0
  136. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/requirements/base-images.json +0 -0
  137. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/retries.py +0 -0
  138. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/runner.py +0 -0
  139. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/runner.pyi +0 -0
  140. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/running_app.py +0 -0
  141. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/sandbox.py +0 -0
  142. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/sandbox.pyi +0 -0
  143. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/schedule.py +0 -0
  144. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/scheduler_placement.py +0 -0
  145. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/secret.py +0 -0
  146. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/secret.pyi +0 -0
  147. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/serving.py +0 -0
  148. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/serving.pyi +0 -0
  149. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/snapshot.py +0 -0
  150. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/snapshot.pyi +0 -0
  151. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/stream_type.py +0 -0
  152. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/token_flow.py +0 -0
  153. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/token_flow.pyi +0 -0
  154. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/volume.py +0 -0
  155. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal/volume.pyi +0 -0
  156. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal.egg-info/SOURCES.txt +0 -0
  157. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal.egg-info/dependency_links.txt +0 -0
  158. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal.egg-info/entry_points.txt +0 -0
  159. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal.egg-info/requires.txt +0 -0
  160. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal.egg-info/top_level.txt +0 -0
  161. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_docs/__init__.py +0 -0
  162. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_docs/gen_cli_docs.py +0 -0
  163. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_docs/gen_reference_docs.py +0 -0
  164. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_docs/mdmd/__init__.py +0 -0
  165. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_docs/mdmd/mdmd.py +0 -0
  166. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_docs/mdmd/signatures.py +0 -0
  167. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/__init__.py +0 -0
  168. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/api.proto +0 -0
  169. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/api_grpc.py +0 -0
  170. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/api_pb2.py +0 -0
  171. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/api_pb2.pyi +0 -0
  172. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/api_pb2_grpc.py +0 -0
  173. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/api_pb2_grpc.pyi +0 -0
  174. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/modal_api_grpc.py +0 -0
  175. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/modal_options_grpc.py +0 -0
  176. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/options.proto +0 -0
  177. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/options_grpc.py +0 -0
  178. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/options_pb2.py +0 -0
  179. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/options_pb2.pyi +0 -0
  180. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/options_pb2_grpc.py +0 -0
  181. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/options_pb2_grpc.pyi +0 -0
  182. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_proto/py.typed +0 -0
  183. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/modal_version/__main__.py +0 -0
  184. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/pyproject.toml +0 -0
  185. {modal-1.0.3.dev10 → modal-1.0.3.dev12}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: modal
3
- Version: 1.0.3.dev10
3
+ Version: 1.0.3.dev12
4
4
  Summary: Python client library for Modal
5
5
  Author-email: Modal Labs <support@modal.com>
6
6
  License: Apache-2.0
@@ -323,6 +323,7 @@ class _ContainerIOManager:
323
323
  self._heartbeat_loop = None
324
324
  self._heartbeat_condition = None
325
325
  self._waiting_for_memory_snapshot = False
326
+ self._cuda_checkpoint_session = None
326
327
 
327
328
  self._is_interactivity_enabled = False
328
329
  self._fetching_inputs = True
@@ -881,13 +882,11 @@ class _ContainerIOManager:
881
882
  # Restore GPU memory.
882
883
  if self.function_def._experimental_enable_gpu_snapshot and self.function_def.resources.gpu_config.gpu_type:
883
884
  logger.debug("GPU memory snapshot enabled. Attempting to restore GPU memory.")
884
- gpu_process_state = gpu_memory_snapshot.get_state()
885
- if gpu_process_state != gpu_memory_snapshot.CudaCheckpointState.CHECKPOINTED:
886
- raise ValueError(
887
- "Cannot restore GPU state if GPU isn't in a 'checkpointed' state. "
888
- f"Current GPU state: {gpu_process_state}"
889
- )
890
- gpu_memory_snapshot.toggle()
885
+
886
+ assert self._cuda_checkpoint_session, (
887
+ "CudaCheckpointSession not found when attempting to restore GPU memory"
888
+ )
889
+ self._cuda_checkpoint_session.restore()
891
890
 
892
891
  # Restore input to default state.
893
892
  self.current_input_id = None
@@ -907,14 +906,9 @@ class _ContainerIOManager:
907
906
  # Snapshot GPU memory.
908
907
  if self.function_def._experimental_enable_gpu_snapshot and self.function_def.resources.gpu_config.gpu_type:
909
908
  logger.debug("GPU memory snapshot enabled. Attempting to snapshot GPU memory.")
910
- gpu_process_state = gpu_memory_snapshot.get_state()
911
- if gpu_process_state != gpu_memory_snapshot.CudaCheckpointState.RUNNING:
912
- raise ValueError(
913
- f"Cannot snapshot GPU state if it isn't running. Current GPU state: {gpu_process_state}"
914
- )
915
909
 
916
- gpu_memory_snapshot.toggle()
917
- gpu_memory_snapshot.wait_for_state(gpu_memory_snapshot.CudaCheckpointState.CHECKPOINTED)
910
+ self._cuda_checkpoint_session = gpu_memory_snapshot.CudaCheckpointSession()
911
+ self._cuda_checkpoint_session.checkpoint()
918
912
 
919
913
  # Notify the heartbeat loop that the snapshot phase has begun in order to
920
914
  # prevent it from sending heartbeat RPCs
@@ -0,0 +1,199 @@
1
+ # Copyright Modal Labs 2022
2
+ #
3
+ # This module provides a simple interface for creating GPU memory snapshots,
4
+ # providing a convenient interface to `cuda-checkpoint` [1]. This is intended
5
+ # to be used in conjunction with memory snapshots.
6
+ #
7
+ # [1] https://github.com/NVIDIA/cuda-checkpoint
8
+
9
+ import subprocess
10
+ import time
11
+ from concurrent.futures import ThreadPoolExecutor
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+ from pathlib import Path
15
+
16
+ from modal.config import config, logger
17
+
18
+ CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
19
+
20
+
21
+ class CudaCheckpointState(Enum):
22
+ """State representation from the CUDA API: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc96cdda177a2b8c296144567cbea4f23"""
23
+
24
+ RUNNING = "running"
25
+ LOCKED = "locked"
26
+ CHECKPOINTED = "checkpointed"
27
+ FAILED = "failed"
28
+
29
+
30
+ class CudaCheckpointException(Exception):
31
+ pass
32
+
33
+
34
+ @dataclass
35
+ class CudaCheckpointProcess:
36
+ """Contains a reference to a PID with active CUDA session. This also provides
37
+ methods for checkpointing and restoring GPU memory."""
38
+
39
+ pid: int
40
+ state: CudaCheckpointState
41
+
42
+ def toggle(self, target_state: CudaCheckpointState, timeout_secs: float = 5 * 60.0):
43
+ """Toggle CUDA checkpoint state for current process, moving GPU memory to the
44
+ CPU and back depending on the current process state when called."""
45
+ logger.debug(f"PID: {self.pid} Toggling CUDA checkpoint state to {target_state.value}")
46
+
47
+ start_time = time.monotonic()
48
+
49
+ while self._should_continue_toggle(target_state, start_time, timeout_secs):
50
+ self._execute_toggle_command()
51
+ time.sleep(0.1)
52
+
53
+ logger.debug(f"PID: {self.pid} Target state {target_state.value} reached")
54
+
55
+ def _should_continue_toggle(
56
+ self, target_state: CudaCheckpointState, start_time: float, timeout_secs: float
57
+ ) -> bool:
58
+ """Check if toggle operation should continue based on current state and timeout."""
59
+ self.refresh_state()
60
+
61
+ if self.state == target_state:
62
+ return False
63
+
64
+ if self.state == CudaCheckpointState.FAILED:
65
+ raise CudaCheckpointException(f"PID: {self.pid} CUDA process state is {self.state}")
66
+
67
+ elapsed = time.monotonic() - start_time
68
+ if elapsed >= timeout_secs:
69
+ raise CudaCheckpointException(
70
+ f"PID: {self.pid} Timeout after {elapsed:.2f}s waiting for state {target_state.value}. "
71
+ f"Current state: {self.state}"
72
+ )
73
+
74
+ return True
75
+
76
+ def _execute_toggle_command(self):
77
+ """Execute the cuda-checkpoint toggle command."""
78
+ try:
79
+ subprocess.run(
80
+ [CUDA_CHECKPOINT_PATH, "--toggle", "--pid", str(self.pid)],
81
+ check=True,
82
+ capture_output=True,
83
+ text=True,
84
+ )
85
+ logger.debug(f"PID: {self.pid} Successfully toggled CUDA checkpoint state")
86
+ except subprocess.CalledProcessError as e:
87
+ logger.debug(f"PID: {self.pid} Failed to toggle CUDA checkpoint state: {e.stderr}")
88
+ raise CudaCheckpointException(e.stderr)
89
+
90
+ def refresh_state(self) -> None:
91
+ """Refreshes the current CUDA checkpoint state for this process."""
92
+ try:
93
+ result = subprocess.run(
94
+ [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(self.pid)],
95
+ check=True,
96
+ capture_output=True,
97
+ text=True,
98
+ timeout=5,
99
+ )
100
+
101
+ state_str = result.stdout.strip().lower()
102
+ self.state = CudaCheckpointState(state_str)
103
+
104
+ except subprocess.CalledProcessError as e:
105
+ logger.debug(f"PID: {self.pid} Failed to get CUDA checkpoint state: {e.stderr}")
106
+ raise CudaCheckpointException(e.stderr)
107
+
108
+
109
+ class CudaCheckpointSession:
110
+ """Manages the checkpointing state of processes with active CUDA sessions."""
111
+
112
+ def __init__(self):
113
+ self.cuda_processes = self._get_cuda_pids()
114
+ logger.debug(f"PIDs with CUDA sessions: {[c.pid for c in self.cuda_processes]}")
115
+
116
+ def _get_cuda_pids(self) -> list[CudaCheckpointProcess]:
117
+ """Iterates over all PIDs and identifies the ones that have running
118
+ CUDA sessions."""
119
+ cuda_pids: list[CudaCheckpointProcess] = []
120
+
121
+ # Get all active process IDs from /proc directory
122
+ proc_dir = Path("/proc")
123
+ if not proc_dir.exists():
124
+ raise CudaCheckpointException(
125
+ "OS does not have /proc path rendering it incompatible with GPU memory snapshots."
126
+ )
127
+
128
+ for entry in proc_dir.iterdir():
129
+ if not entry.name.isdigit():
130
+ continue
131
+
132
+ pid = int(entry.name)
133
+ try:
134
+ # Call cuda-checkpoint to check if this PID has a CUDA session
135
+ result = subprocess.run(
136
+ [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
137
+ capture_output=True,
138
+ text=True,
139
+ timeout=10,
140
+ )
141
+
142
+ # If the command succeeds (return code 0), this PID has a CUDA session
143
+ if result.returncode == 0:
144
+ state_str = result.stdout.strip().lower()
145
+ state = CudaCheckpointState(state_str)
146
+
147
+ cuda_checkpoint_process = CudaCheckpointProcess(pid=pid, state=state)
148
+ cuda_pids.append(cuda_checkpoint_process)
149
+
150
+ # Command failed, which is expected for PIDs without CUDA sessions
151
+ except subprocess.CalledProcessError:
152
+ continue
153
+
154
+ # Raise other exceptions
155
+ except subprocess.TimeoutExpired:
156
+ raise CudaCheckpointException(f"Failed to get CUDA state for PID {pid}")
157
+ except Exception as e:
158
+ raise CudaCheckpointException(e)
159
+
160
+ # Sort PIDs for ordered checkpointing
161
+ cuda_pids.sort(key=lambda x: x.pid)
162
+ return cuda_pids
163
+
164
+ def checkpoint(self) -> None:
165
+ # Validate all states first
166
+ for proc in self.cuda_processes:
167
+ if proc.state != CudaCheckpointState.RUNNING:
168
+ raise CudaCheckpointException(f"CUDA session not in {CudaCheckpointState.RUNNING} state.")
169
+
170
+ # Moving state from GPU to CPU can take several seconds per CUDA session.
171
+ # Make a parallel call per CUDA session.
172
+ start = time.perf_counter()
173
+
174
+ def checkpoint_impl(proc: CudaCheckpointProcess):
175
+ proc.toggle(CudaCheckpointState.CHECKPOINTED)
176
+
177
+ with ThreadPoolExecutor() as executor:
178
+ list(executor.map(checkpoint_impl, self.cuda_processes))
179
+
180
+ elapsed = time.perf_counter() - start
181
+ logger.debug(f"Checkpointing CUDA sessions took => {elapsed:.3f}s")
182
+
183
+ def restore(self) -> None:
184
+ # Validate all states first
185
+ for proc in self.cuda_processes:
186
+ if proc.state != CudaCheckpointState.CHECKPOINTED:
187
+ raise CudaCheckpointException(f"CUDA session not in {CudaCheckpointState.CHECKPOINTED} state.")
188
+
189
+ # See checkpoint() for rationale about parallelism.
190
+ start = time.perf_counter()
191
+
192
+ def restore_process(proc: CudaCheckpointProcess):
193
+ proc.toggle(CudaCheckpointState.RUNNING)
194
+
195
+ with ThreadPoolExecutor() as executor:
196
+ list(executor.map(restore_process, self.cuda_processes))
197
+
198
+ elapsed = time.perf_counter() - start
199
+ logger.debug(f"Restoring CUDA sessions took => {elapsed:.3f}s")
@@ -31,7 +31,7 @@ class _Client:
31
31
  server_url: str,
32
32
  client_type: int,
33
33
  credentials: typing.Optional[tuple[str, str]],
34
- version: str = "1.0.3.dev10",
34
+ version: str = "1.0.3.dev12",
35
35
  ): ...
36
36
  def is_closed(self) -> bool: ...
37
37
  @property
@@ -94,7 +94,7 @@ class Client:
94
94
  server_url: str,
95
95
  client_type: int,
96
96
  credentials: typing.Optional[tuple[str, str]],
97
- version: str = "1.0.3.dev10",
97
+ version: str = "1.0.3.dev12",
98
98
  ): ...
99
99
  def is_closed(self) -> bool: ...
100
100
  @property
@@ -227,11 +227,11 @@ class Function(
227
227
 
228
228
  _call_generator: ___call_generator_spec[typing_extensions.Self]
229
229
 
230
- class __remote_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
230
+ class __remote_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
231
231
  def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER: ...
232
232
  async def aio(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER: ...
233
233
 
234
- remote: __remote_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
234
+ remote: __remote_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
235
235
 
236
236
  class __remote_gen_spec(typing_extensions.Protocol[SUPERSELF]):
237
237
  def __call__(self, /, *args, **kwargs) -> typing.Generator[typing.Any, None, None]: ...
@@ -246,12 +246,12 @@ class Function(
246
246
  self, *args: modal._functions.P.args, **kwargs: modal._functions.P.kwargs
247
247
  ) -> modal._functions.OriginalReturnType: ...
248
248
 
249
- class ___experimental_spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
249
+ class ___experimental_spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
250
250
  def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
251
251
  async def aio(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
252
252
 
253
253
  _experimental_spawn: ___experimental_spawn_spec[
254
- modal._functions.ReturnType, modal._functions.P, typing_extensions.Self
254
+ modal._functions.P, modal._functions.ReturnType, typing_extensions.Self
255
255
  ]
256
256
 
257
257
  class ___spawn_map_inner_spec(typing_extensions.Protocol[P_INNER, SUPERSELF]):
@@ -260,11 +260,11 @@ class Function(
260
260
 
261
261
  _spawn_map_inner: ___spawn_map_inner_spec[modal._functions.P, typing_extensions.Self]
262
262
 
263
- class __spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
263
+ class __spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
264
264
  def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
265
265
  async def aio(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]: ...
266
266
 
267
- spawn: __spawn_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
267
+ spawn: __spawn_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
268
268
 
269
269
  def get_raw_f(self) -> collections.abc.Callable[..., typing.Any]: ...
270
270
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: modal
3
- Version: 1.0.3.dev10
3
+ Version: 1.0.3.dev12
4
4
  Summary: Python client library for Modal
5
5
  Author-email: Modal Labs <support@modal.com>
6
6
  License: Apache-2.0
@@ -1,4 +1,4 @@
1
1
  # Copyright Modal Labs 2025
2
2
  """Supplies the current version of the modal client library."""
3
3
 
4
- __version__ = "1.0.3.dev10"
4
+ __version__ = "1.0.3.dev12"
@@ -1,101 +0,0 @@
1
- # Copyright Modal Labs 2022
2
- #
3
- # This module provides a simple interface for creating GPU memory snapshots,
4
- # provising a convenient interface to `cuda-checkpoint` [1]. This is intended
5
- # to be used in conjunction with memory snapshots.
6
- #
7
- # [1] https://github.com/NVIDIA/cuda-checkpoint
8
-
9
- import os
10
- import subprocess
11
- import time
12
- from enum import Enum
13
-
14
- from modal.config import config, logger
15
-
16
- CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
17
-
18
-
19
- class CudaCheckpointState(Enum):
20
- """State representation from the CUDA API: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc96cdda177a2b8c296144567cbea4f23"""
21
-
22
- RUNNING = "running"
23
- LOCKED = "locked"
24
- CHECKPOINTED = "checkpointed"
25
- FAILED = "failed"
26
-
27
-
28
- class CudaCheckpointException(Exception):
29
- pass
30
-
31
-
32
- def toggle():
33
- """Toggle CUDA checkpoint state for current process, moving GPU memory to the
34
- CPU and back depending on the current process state when called."""
35
- pid = get_own_pid()
36
- logger.debug(f"Toggling CUDA checkpoint state for PID {pid}")
37
-
38
- try:
39
- subprocess.run(
40
- [
41
- CUDA_CHECKPOINT_PATH,
42
- "--toggle",
43
- "--pid",
44
- str(pid),
45
- ],
46
- check=True,
47
- capture_output=True,
48
- text=True,
49
- )
50
- logger.debug("Successfully toggled CUDA checkpoint state")
51
-
52
- except subprocess.CalledProcessError as e:
53
- logger.debug(f"Failed to toggle CUDA checkpoint state: {e.stderr}")
54
- raise CudaCheckpointException(e.stderr)
55
-
56
-
57
- def get_state() -> CudaCheckpointState:
58
- """Get current CUDA checkpoint state for this process."""
59
- pid = get_own_pid()
60
-
61
- try:
62
- result = subprocess.run(
63
- [CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)], check=True, capture_output=True, text=True
64
- )
65
-
66
- # Parse output to get state
67
- state_str = result.stdout.strip().lower()
68
- return CudaCheckpointState(state_str)
69
-
70
- except subprocess.CalledProcessError as e:
71
- logger.debug(f"Failed to get CUDA checkpoint state: {e.stderr}")
72
- raise CudaCheckpointException(e.stderr)
73
-
74
-
75
- def wait_for_state(target_state: CudaCheckpointState, timeout_secs: float = 5.0):
76
- """Wait for CUDA checkpoint to reach a specific state."""
77
- logger.debug(f"Waiting for CUDA checkpoint state {target_state.value}")
78
- start_time = time.monotonic()
79
-
80
- while True:
81
- current_state = get_state()
82
-
83
- if current_state == target_state:
84
- logger.debug(f"Target state {target_state.value} reached")
85
- break
86
-
87
- if current_state == CudaCheckpointState.FAILED:
88
- raise CudaCheckpointException(f"CUDA process state is {current_state}")
89
-
90
- elapsed = time.monotonic() - start_time
91
- if elapsed >= timeout_secs:
92
- raise CudaCheckpointException(f"Timeout after {elapsed:.2f}s waiting for state {target_state.value}")
93
-
94
- time.sleep(0.1)
95
-
96
-
97
- def get_own_pid():
98
- """Returns the Process ID (PID) of the current Python process
99
- using only the standard library.
100
- """
101
- return os.getpid()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes