modal 1.1.5.dev51__tar.gz → 1.1.5.dev53__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of modal might be problematic. Click here for more details.
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/PKG-INFO +1 -1
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/gpu_memory_snapshot.py +20 -17
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/function_utils.py +6 -1
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/client.pyi +2 -2
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/functions.pyi +6 -6
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal.egg-info/PKG-INFO +1 -1
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_version/__init__.py +1 -1
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/LICENSE +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/README.md +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/__main__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_clustered_functions.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_clustered_functions.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_container_entrypoint.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_functions.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_ipython.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_location.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_object.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_output.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_partial_function.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_pty.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_resolver.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_resources.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/asgi.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/container_io_manager.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/container_io_manager.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/execution_context.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/execution_context.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/telemetry.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_runtime/user_code_imports.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_serialization.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_traceback.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_tunnel.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_tunnel.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_type_manager.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/app_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/async_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/auth_token_manager.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/blob_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/bytes_io_segment_payload.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/deprecation.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/docker_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/git_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/grpc_testing.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/grpc_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/hash_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/http_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/jwt_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/logger.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/mount_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/name_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/package_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/pattern_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/rand_pb_testing.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/shell_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_utils/time_utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_vendor/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_vendor/a2wsgi_wsgi.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_vendor/cloudpickle.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_vendor/tblib.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/_watcher.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/app.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/app.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/builder/2023.12.312.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/builder/2023.12.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/builder/2024.04.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/builder/2024.10.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/builder/2025.06.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/builder/PREVIEW.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/builder/README.md +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/builder/base-images.json +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/call_graph.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/_download.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/_traceback.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/app.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/cluster.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/config.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/container.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/dict.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/entry_point.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/environment.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/import_refs.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/launch.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/network_file_system.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/profile.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/programs/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/programs/launch_instance_ssh.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/programs/run_jupyter.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/programs/run_marimo.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/programs/vscode.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/queues.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/run.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/secret.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/token.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/utils.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cli/volume.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/client.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cloud_bucket_mount.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cloud_bucket_mount.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cls.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/cls.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/config.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/container_process.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/container_process.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/dict.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/dict.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/environments.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/environments.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/exception.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/experimental/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/experimental/flash.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/experimental/flash.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/experimental/ipython.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/file_io.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/file_io.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/file_pattern_matcher.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/functions.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/gpu.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/image.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/image.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/io_streams.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/io_streams.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/mount.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/mount.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/network_file_system.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/network_file_system.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/object.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/object.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/output.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/parallel_map.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/parallel_map.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/partial_function.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/partial_function.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/proxy.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/proxy.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/py.typed +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/queue.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/queue.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/retries.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/runner.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/runner.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/running_app.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/sandbox.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/sandbox.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/schedule.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/scheduler_placement.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/secret.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/secret.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/serving.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/serving.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/snapshot.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/snapshot.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/stream_type.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/token_flow.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/token_flow.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/volume.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal/volume.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal.egg-info/SOURCES.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal.egg-info/dependency_links.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal.egg-info/entry_points.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal.egg-info/requires.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal.egg-info/top_level.txt +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_docs/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_docs/gen_cli_docs.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_docs/gen_reference_docs.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_docs/mdmd/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_docs/mdmd/mdmd.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_docs/mdmd/signatures.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/__init__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/api.proto +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/api_grpc.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/api_pb2.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/api_pb2.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/api_pb2_grpc.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/api_pb2_grpc.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/modal_api_grpc.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/modal_options_grpc.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/options.proto +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/options_grpc.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/options_pb2.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/options_pb2.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/options_pb2_grpc.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/options_pb2_grpc.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/py.typed +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/sandbox_router.proto +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/sandbox_router_grpc.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/sandbox_router_pb2.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/sandbox_router_pb2.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/sandbox_router_pb2_grpc.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_proto/sandbox_router_pb2_grpc.pyi +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/modal_version/__main__.py +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/pyproject.toml +0 -0
- {modal-1.1.5.dev51 → modal-1.1.5.dev53}/setup.cfg +0 -0
|
@@ -18,6 +18,12 @@ from modal.config import config, logger
|
|
|
18
18
|
|
|
19
19
|
CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
|
|
20
20
|
|
|
21
|
+
# Maximum total duration for an entire toggle operation.
|
|
22
|
+
CUDA_CHECKPOINT_TOGGLE_TIMEOUT: float = 5 * 60.0
|
|
23
|
+
|
|
24
|
+
# Maximum total duration for each individual `cuda-checkpoint` invocation.
|
|
25
|
+
CUDA_CHECKPOINT_TIMEOUT: float = 90
|
|
26
|
+
|
|
21
27
|
|
|
22
28
|
class CudaCheckpointState(Enum):
|
|
23
29
|
"""State representation from the CUDA API [1].
|
|
@@ -44,7 +50,7 @@ class CudaCheckpointProcess:
|
|
|
44
50
|
pid: int
|
|
45
51
|
state: CudaCheckpointState
|
|
46
52
|
|
|
47
|
-
def toggle(self, target_state: CudaCheckpointState,
|
|
53
|
+
def toggle(self, target_state: CudaCheckpointState, skip_first_refresh: bool = False) -> None:
|
|
48
54
|
"""Toggle CUDA checkpoint state for current process, moving GPU memory to the
|
|
49
55
|
CPU and back depending on the current process state when called.
|
|
50
56
|
"""
|
|
@@ -54,7 +60,11 @@ class CudaCheckpointProcess:
|
|
|
54
60
|
retry_count = 0
|
|
55
61
|
max_retries = 3
|
|
56
62
|
|
|
57
|
-
|
|
63
|
+
attempts = 0
|
|
64
|
+
while self._should_continue_toggle(
|
|
65
|
+
target_state, start_time, refresh=not (skip_first_refresh and attempts == 0)
|
|
66
|
+
):
|
|
67
|
+
attempts += 1
|
|
58
68
|
try:
|
|
59
69
|
self._execute_toggle_command()
|
|
60
70
|
# Use exponential backoff for retries
|
|
@@ -73,10 +83,11 @@ class CudaCheckpointProcess:
|
|
|
73
83
|
logger.debug(f"PID: {self.pid} Target state {target_state.value} reached")
|
|
74
84
|
|
|
75
85
|
def _should_continue_toggle(
|
|
76
|
-
self, target_state: CudaCheckpointState, start_time: float,
|
|
86
|
+
self, target_state: CudaCheckpointState, start_time: float, refresh: bool = True
|
|
77
87
|
) -> bool:
|
|
78
88
|
"""Check if toggle operation should continue based on current state and timeout."""
|
|
79
|
-
|
|
89
|
+
if refresh:
|
|
90
|
+
self.refresh_state()
|
|
80
91
|
|
|
81
92
|
if self.state == target_state:
|
|
82
93
|
return False
|
|
@@ -85,7 +96,7 @@ class CudaCheckpointProcess:
|
|
|
85
96
|
raise CudaCheckpointException(f"PID: {self.pid} CUDA process state is {self.state}")
|
|
86
97
|
|
|
87
98
|
elapsed = time.monotonic() - start_time
|
|
88
|
-
if elapsed >=
|
|
99
|
+
if elapsed >= CUDA_CHECKPOINT_TOGGLE_TIMEOUT:
|
|
89
100
|
raise CudaCheckpointException(
|
|
90
101
|
f"PID: {self.pid} Timeout after {elapsed:.2f}s waiting for state {target_state.value}. "
|
|
91
102
|
f"Current state: {self.state}"
|
|
@@ -101,7 +112,7 @@ class CudaCheckpointProcess:
|
|
|
101
112
|
check=True,
|
|
102
113
|
capture_output=True,
|
|
103
114
|
text=True,
|
|
104
|
-
timeout=
|
|
115
|
+
timeout=CUDA_CHECKPOINT_TIMEOUT,
|
|
105
116
|
)
|
|
106
117
|
logger.debug(f"PID: {self.pid} Successfully toggled CUDA checkpoint state")
|
|
107
118
|
except subprocess.CalledProcessError as e:
|
|
@@ -121,7 +132,7 @@ class CudaCheckpointProcess:
|
|
|
121
132
|
check=True,
|
|
122
133
|
capture_output=True,
|
|
123
134
|
text=True,
|
|
124
|
-
timeout=
|
|
135
|
+
timeout=CUDA_CHECKPOINT_TIMEOUT,
|
|
125
136
|
)
|
|
126
137
|
|
|
127
138
|
state_str = result.stdout.strip().lower()
|
|
@@ -190,6 +201,7 @@ class CudaCheckpointSession:
|
|
|
190
201
|
[CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
|
|
191
202
|
capture_output=True,
|
|
192
203
|
text=True,
|
|
204
|
+
# This should be quick since no checkpoint has taken place yet
|
|
193
205
|
timeout=5,
|
|
194
206
|
)
|
|
195
207
|
|
|
@@ -256,20 +268,11 @@ class CudaCheckpointSession:
|
|
|
256
268
|
logger.debug("No CUDA sessions to restore.")
|
|
257
269
|
return
|
|
258
270
|
|
|
259
|
-
# Validate all states first
|
|
260
|
-
for proc in self.cuda_processes:
|
|
261
|
-
proc.refresh_state() # Refresh state before validation
|
|
262
|
-
if proc.state != CudaCheckpointState.CHECKPOINTED:
|
|
263
|
-
raise CudaCheckpointException(
|
|
264
|
-
f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.CHECKPOINTED.value} state. "
|
|
265
|
-
f"Current state: {proc.state.value}"
|
|
266
|
-
)
|
|
267
|
-
|
|
268
271
|
# See checkpoint() for rationale about parallelism.
|
|
269
272
|
start = time.perf_counter()
|
|
270
273
|
|
|
271
274
|
def restore_process(proc: CudaCheckpointProcess) -> None:
|
|
272
|
-
proc.toggle(CudaCheckpointState.RUNNING)
|
|
275
|
+
proc.toggle(CudaCheckpointState.RUNNING, skip_first_refresh=True)
|
|
273
276
|
|
|
274
277
|
with ThreadPoolExecutor() as executor:
|
|
275
278
|
futures = [executor.submit(restore_process, proc) for proc in self.cuda_processes]
|
|
@@ -492,7 +492,12 @@ async def _process_result(result: api_pb2.GenericResult, data_format: int, stub,
|
|
|
492
492
|
elif result.status == api_pb2.GenericResult.GENERIC_STATUS_INTERNAL_FAILURE:
|
|
493
493
|
raise InternalFailure(result.exception)
|
|
494
494
|
elif result.status != api_pb2.GenericResult.GENERIC_STATUS_SUCCESS:
|
|
495
|
-
if data and data_format
|
|
495
|
+
if data and data_format in (api_pb2.DATA_FORMAT_PICKLE, api_pb2.DATA_FORMAT_UNSPECIFIED):
|
|
496
|
+
# *Unspecified data format here but data present usually means that the exception
|
|
497
|
+
# was created by the server representing an exception that occurred during container
|
|
498
|
+
# startup (crash looping) that eventually got escalated to input failures.
|
|
499
|
+
# TaskResult doesn't specify data format, so these results don't have that metadata
|
|
500
|
+
# the moment.
|
|
496
501
|
try:
|
|
497
502
|
exc = deserialize(data, client)
|
|
498
503
|
except DeserializationError as deser_exc:
|
|
@@ -33,7 +33,7 @@ class _Client:
|
|
|
33
33
|
server_url: str,
|
|
34
34
|
client_type: int,
|
|
35
35
|
credentials: typing.Optional[tuple[str, str]],
|
|
36
|
-
version: str = "1.1.5.dev51",
|
|
36
|
+
version: str = "1.1.5.dev53",
|
|
37
37
|
):
|
|
38
38
|
"""mdmd:hidden
|
|
39
39
|
The Modal client object is not intended to be instantiated directly by users.
|
|
@@ -164,7 +164,7 @@ class Client:
|
|
|
164
164
|
server_url: str,
|
|
165
165
|
client_type: int,
|
|
166
166
|
credentials: typing.Optional[tuple[str, str]],
|
|
167
|
-
version: str = "1.1.5.dev51",
|
|
167
|
+
version: str = "1.1.5.dev53",
|
|
168
168
|
):
|
|
169
169
|
"""mdmd:hidden
|
|
170
170
|
The Modal client object is not intended to be instantiated directly by users.
|
|
@@ -450,7 +450,7 @@ class Function(
|
|
|
450
450
|
|
|
451
451
|
_call_generator: ___call_generator_spec[typing_extensions.Self]
|
|
452
452
|
|
|
453
|
-
class __remote_spec(typing_extensions.Protocol[
|
|
453
|
+
class __remote_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
|
|
454
454
|
def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER:
|
|
455
455
|
"""Calls the function remotely, executing it with the given arguments and returning the execution's result."""
|
|
456
456
|
...
|
|
@@ -459,7 +459,7 @@ class Function(
|
|
|
459
459
|
"""Calls the function remotely, executing it with the given arguments and returning the execution's result."""
|
|
460
460
|
...
|
|
461
461
|
|
|
462
|
-
remote: __remote_spec[modal._functions.
|
|
462
|
+
remote: __remote_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
|
|
463
463
|
|
|
464
464
|
class __remote_gen_spec(typing_extensions.Protocol[SUPERSELF]):
|
|
465
465
|
def __call__(self, /, *args, **kwargs) -> typing.Generator[typing.Any, None, None]:
|
|
@@ -486,7 +486,7 @@ class Function(
|
|
|
486
486
|
"""
|
|
487
487
|
...
|
|
488
488
|
|
|
489
|
-
class ___experimental_spawn_spec(typing_extensions.Protocol[
|
|
489
|
+
class ___experimental_spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
|
|
490
490
|
def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]:
|
|
491
491
|
"""[Experimental] Calls the function with the given arguments, without waiting for the results.
|
|
492
492
|
|
|
@@ -510,7 +510,7 @@ class Function(
|
|
|
510
510
|
...
|
|
511
511
|
|
|
512
512
|
_experimental_spawn: ___experimental_spawn_spec[
|
|
513
|
-
modal._functions.
|
|
513
|
+
modal._functions.P, modal._functions.ReturnType, typing_extensions.Self
|
|
514
514
|
]
|
|
515
515
|
|
|
516
516
|
class ___spawn_map_inner_spec(typing_extensions.Protocol[P_INNER, SUPERSELF]):
|
|
@@ -519,7 +519,7 @@ class Function(
|
|
|
519
519
|
|
|
520
520
|
_spawn_map_inner: ___spawn_map_inner_spec[modal._functions.P, typing_extensions.Self]
|
|
521
521
|
|
|
522
|
-
class __spawn_spec(typing_extensions.Protocol[
|
|
522
|
+
class __spawn_spec(typing_extensions.Protocol[P_INNER, ReturnType_INNER, SUPERSELF]):
|
|
523
523
|
def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]:
|
|
524
524
|
"""Calls the function with the given arguments, without waiting for the results.
|
|
525
525
|
|
|
@@ -540,7 +540,7 @@ class Function(
|
|
|
540
540
|
"""
|
|
541
541
|
...
|
|
542
542
|
|
|
543
|
-
spawn: __spawn_spec[modal._functions.
|
|
543
|
+
spawn: __spawn_spec[modal._functions.P, modal._functions.ReturnType, typing_extensions.Self]
|
|
544
544
|
|
|
545
545
|
def get_raw_f(self) -> collections.abc.Callable[..., typing.Any]:
|
|
546
546
|
"""Return the inner Python object wrapped by this Modal Function."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|