modal 1.1.5.dev50__tar.gz → 1.1.5.dev52__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of modal might be problematic. Click here for more details.
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/PKG-INFO +1 -1
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/gpu_memory_snapshot.py +20 -17
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/client.pyi +2 -2
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/functions.pyi +6 -6
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal.egg-info/PKG-INFO +1 -1
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/api.proto +9 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/api_grpc.py +16 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/api_pb2.py +671 -651
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/api_pb2.pyi +27 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/api_pb2_grpc.py +33 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/api_pb2_grpc.pyi +10 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/modal_api_grpc.py +1 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_version/__init__.py +1 -1
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/LICENSE +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/README.md +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/__main__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_clustered_functions.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_clustered_functions.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_container_entrypoint.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_functions.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_ipython.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_location.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_object.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_output.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_partial_function.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_pty.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_resolver.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_resources.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/asgi.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/container_io_manager.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/container_io_manager.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/execution_context.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/execution_context.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/telemetry.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_runtime/user_code_imports.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_serialization.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_traceback.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_tunnel.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_tunnel.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_type_manager.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/app_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/async_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/auth_token_manager.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/blob_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/bytes_io_segment_payload.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/deprecation.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/docker_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/function_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/git_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/grpc_testing.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/grpc_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/hash_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/http_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/jwt_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/logger.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/mount_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/name_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/package_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/pattern_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/rand_pb_testing.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/shell_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_utils/time_utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_vendor/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_vendor/a2wsgi_wsgi.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_vendor/cloudpickle.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_vendor/tblib.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/_watcher.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/app.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/app.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/builder/2023.12.312.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/builder/2023.12.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/builder/2024.04.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/builder/2024.10.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/builder/2025.06.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/builder/PREVIEW.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/builder/README.md +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/builder/base-images.json +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/call_graph.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/_download.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/_traceback.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/app.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/cluster.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/config.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/container.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/dict.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/entry_point.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/environment.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/import_refs.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/launch.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/network_file_system.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/profile.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/programs/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/programs/launch_instance_ssh.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/programs/run_jupyter.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/programs/run_marimo.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/programs/vscode.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/queues.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/run.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/secret.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/token.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/utils.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cli/volume.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/client.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cloud_bucket_mount.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cloud_bucket_mount.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cls.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/cls.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/config.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/container_process.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/container_process.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/dict.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/dict.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/environments.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/environments.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/exception.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/experimental/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/experimental/flash.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/experimental/flash.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/experimental/ipython.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/file_io.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/file_io.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/file_pattern_matcher.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/functions.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/gpu.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/image.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/image.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/io_streams.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/io_streams.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/mount.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/mount.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/network_file_system.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/network_file_system.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/object.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/object.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/output.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/parallel_map.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/parallel_map.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/partial_function.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/partial_function.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/proxy.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/proxy.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/py.typed +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/queue.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/queue.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/retries.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/runner.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/runner.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/running_app.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/sandbox.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/sandbox.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/schedule.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/scheduler_placement.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/secret.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/secret.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/serving.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/serving.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/snapshot.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/snapshot.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/stream_type.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/token_flow.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/token_flow.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/volume.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal/volume.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal.egg-info/SOURCES.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal.egg-info/dependency_links.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal.egg-info/entry_points.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal.egg-info/requires.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal.egg-info/top_level.txt +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_docs/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_docs/gen_cli_docs.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_docs/gen_reference_docs.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_docs/mdmd/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_docs/mdmd/mdmd.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_docs/mdmd/signatures.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/__init__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/modal_options_grpc.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/options.proto +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/options_grpc.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/options_pb2.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/options_pb2.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/options_pb2_grpc.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/options_pb2_grpc.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/py.typed +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/sandbox_router.proto +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/sandbox_router_grpc.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/sandbox_router_pb2.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/sandbox_router_pb2.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/sandbox_router_pb2_grpc.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_proto/sandbox_router_pb2_grpc.pyi +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/modal_version/__main__.py +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/pyproject.toml +0 -0
- {modal-1.1.5.dev50 → modal-1.1.5.dev52}/setup.cfg +0 -0
|
@@ -18,6 +18,12 @@ from modal.config import config, logger
|
|
|
18
18
|
|
|
19
19
|
CUDA_CHECKPOINT_PATH: str = config.get("cuda_checkpoint_path")
|
|
20
20
|
|
|
21
|
+
# Maximum total duration for an entire toggle operation.
|
|
22
|
+
CUDA_CHECKPOINT_TOGGLE_TIMEOUT: float = 5 * 60.0
|
|
23
|
+
|
|
24
|
+
# Maximum total duration for each individual `cuda-checkpoint` invocation.
|
|
25
|
+
CUDA_CHECKPOINT_TIMEOUT: float = 90
|
|
26
|
+
|
|
21
27
|
|
|
22
28
|
class CudaCheckpointState(Enum):
|
|
23
29
|
"""State representation from the CUDA API [1].
|
|
@@ -44,7 +50,7 @@ class CudaCheckpointProcess:
|
|
|
44
50
|
pid: int
|
|
45
51
|
state: CudaCheckpointState
|
|
46
52
|
|
|
47
|
-
def toggle(self, target_state: CudaCheckpointState,
|
|
53
|
+
def toggle(self, target_state: CudaCheckpointState, skip_first_refresh: bool = False) -> None:
|
|
48
54
|
"""Toggle CUDA checkpoint state for current process, moving GPU memory to the
|
|
49
55
|
CPU and back depending on the current process state when called.
|
|
50
56
|
"""
|
|
@@ -54,7 +60,11 @@ class CudaCheckpointProcess:
|
|
|
54
60
|
retry_count = 0
|
|
55
61
|
max_retries = 3
|
|
56
62
|
|
|
57
|
-
|
|
63
|
+
attempts = 0
|
|
64
|
+
while self._should_continue_toggle(
|
|
65
|
+
target_state, start_time, refresh=not (skip_first_refresh and attempts == 0)
|
|
66
|
+
):
|
|
67
|
+
attempts += 1
|
|
58
68
|
try:
|
|
59
69
|
self._execute_toggle_command()
|
|
60
70
|
# Use exponential backoff for retries
|
|
@@ -73,10 +83,11 @@ class CudaCheckpointProcess:
|
|
|
73
83
|
logger.debug(f"PID: {self.pid} Target state {target_state.value} reached")
|
|
74
84
|
|
|
75
85
|
def _should_continue_toggle(
|
|
76
|
-
self, target_state: CudaCheckpointState, start_time: float,
|
|
86
|
+
self, target_state: CudaCheckpointState, start_time: float, refresh: bool = True
|
|
77
87
|
) -> bool:
|
|
78
88
|
"""Check if toggle operation should continue based on current state and timeout."""
|
|
79
|
-
|
|
89
|
+
if refresh:
|
|
90
|
+
self.refresh_state()
|
|
80
91
|
|
|
81
92
|
if self.state == target_state:
|
|
82
93
|
return False
|
|
@@ -85,7 +96,7 @@ class CudaCheckpointProcess:
|
|
|
85
96
|
raise CudaCheckpointException(f"PID: {self.pid} CUDA process state is {self.state}")
|
|
86
97
|
|
|
87
98
|
elapsed = time.monotonic() - start_time
|
|
88
|
-
if elapsed >=
|
|
99
|
+
if elapsed >= CUDA_CHECKPOINT_TOGGLE_TIMEOUT:
|
|
89
100
|
raise CudaCheckpointException(
|
|
90
101
|
f"PID: {self.pid} Timeout after {elapsed:.2f}s waiting for state {target_state.value}. "
|
|
91
102
|
f"Current state: {self.state}"
|
|
@@ -101,7 +112,7 @@ class CudaCheckpointProcess:
|
|
|
101
112
|
check=True,
|
|
102
113
|
capture_output=True,
|
|
103
114
|
text=True,
|
|
104
|
-
timeout=
|
|
115
|
+
timeout=CUDA_CHECKPOINT_TIMEOUT,
|
|
105
116
|
)
|
|
106
117
|
logger.debug(f"PID: {self.pid} Successfully toggled CUDA checkpoint state")
|
|
107
118
|
except subprocess.CalledProcessError as e:
|
|
@@ -121,7 +132,7 @@ class CudaCheckpointProcess:
|
|
|
121
132
|
check=True,
|
|
122
133
|
capture_output=True,
|
|
123
134
|
text=True,
|
|
124
|
-
timeout=
|
|
135
|
+
timeout=CUDA_CHECKPOINT_TIMEOUT,
|
|
125
136
|
)
|
|
126
137
|
|
|
127
138
|
state_str = result.stdout.strip().lower()
|
|
@@ -190,6 +201,7 @@ class CudaCheckpointSession:
|
|
|
190
201
|
[CUDA_CHECKPOINT_PATH, "--get-state", "--pid", str(pid)],
|
|
191
202
|
capture_output=True,
|
|
192
203
|
text=True,
|
|
204
|
+
# This should be quick since no checkpoint has taken place yet
|
|
193
205
|
timeout=5,
|
|
194
206
|
)
|
|
195
207
|
|
|
@@ -256,20 +268,11 @@ class CudaCheckpointSession:
|
|
|
256
268
|
logger.debug("No CUDA sessions to restore.")
|
|
257
269
|
return
|
|
258
270
|
|
|
259
|
-
# Validate all states first
|
|
260
|
-
for proc in self.cuda_processes:
|
|
261
|
-
proc.refresh_state() # Refresh state before validation
|
|
262
|
-
if proc.state != CudaCheckpointState.CHECKPOINTED:
|
|
263
|
-
raise CudaCheckpointException(
|
|
264
|
-
f"PID {proc.pid}: CUDA session not in {CudaCheckpointState.CHECKPOINTED.value} state. "
|
|
265
|
-
f"Current state: {proc.state.value}"
|
|
266
|
-
)
|
|
267
|
-
|
|
268
271
|
# See checkpoint() for rationale about parallelism.
|
|
269
272
|
start = time.perf_counter()
|
|
270
273
|
|
|
271
274
|
def restore_process(proc: CudaCheckpointProcess) -> None:
|
|
272
|
-
proc.toggle(CudaCheckpointState.RUNNING)
|
|
275
|
+
proc.toggle(CudaCheckpointState.RUNNING, skip_first_refresh=True)
|
|
273
276
|
|
|
274
277
|
with ThreadPoolExecutor() as executor:
|
|
275
278
|
futures = [executor.submit(restore_process, proc) for proc in self.cuda_processes]
|
|
@@ -33,7 +33,7 @@ class _Client:
|
|
|
33
33
|
server_url: str,
|
|
34
34
|
client_type: int,
|
|
35
35
|
credentials: typing.Optional[tuple[str, str]],
|
|
36
|
-
version: str = "1.1.5.
|
|
36
|
+
version: str = "1.1.5.dev52",
|
|
37
37
|
):
|
|
38
38
|
"""mdmd:hidden
|
|
39
39
|
The Modal client object is not intended to be instantiated directly by users.
|
|
@@ -164,7 +164,7 @@ class Client:
|
|
|
164
164
|
server_url: str,
|
|
165
165
|
client_type: int,
|
|
166
166
|
credentials: typing.Optional[tuple[str, str]],
|
|
167
|
-
version: str = "1.1.5.
|
|
167
|
+
version: str = "1.1.5.dev52",
|
|
168
168
|
):
|
|
169
169
|
"""mdmd:hidden
|
|
170
170
|
The Modal client object is not intended to be instantiated directly by users.
|
|
@@ -450,7 +450,7 @@ class Function(
|
|
|
450
450
|
|
|
451
451
|
_call_generator: ___call_generator_spec[typing_extensions.Self]
|
|
452
452
|
|
|
453
|
-
class __remote_spec(typing_extensions.Protocol[
|
|
453
|
+
class __remote_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
|
|
454
454
|
def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> ReturnType_INNER:
|
|
455
455
|
"""Calls the function remotely, executing it with the given arguments and returning the execution's result."""
|
|
456
456
|
...
|
|
@@ -459,7 +459,7 @@ class Function(
|
|
|
459
459
|
"""Calls the function remotely, executing it with the given arguments and returning the execution's result."""
|
|
460
460
|
...
|
|
461
461
|
|
|
462
|
-
remote: __remote_spec[modal._functions.
|
|
462
|
+
remote: __remote_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
|
|
463
463
|
|
|
464
464
|
class __remote_gen_spec(typing_extensions.Protocol[SUPERSELF]):
|
|
465
465
|
def __call__(self, /, *args, **kwargs) -> typing.Generator[typing.Any, None, None]:
|
|
@@ -486,7 +486,7 @@ class Function(
|
|
|
486
486
|
"""
|
|
487
487
|
...
|
|
488
488
|
|
|
489
|
-
class ___experimental_spawn_spec(typing_extensions.Protocol[
|
|
489
|
+
class ___experimental_spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
|
|
490
490
|
def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]:
|
|
491
491
|
"""[Experimental] Calls the function with the given arguments, without waiting for the results.
|
|
492
492
|
|
|
@@ -510,7 +510,7 @@ class Function(
|
|
|
510
510
|
...
|
|
511
511
|
|
|
512
512
|
_experimental_spawn: ___experimental_spawn_spec[
|
|
513
|
-
modal._functions.
|
|
513
|
+
modal._functions.ReturnType, modal._functions.P, typing_extensions.Self
|
|
514
514
|
]
|
|
515
515
|
|
|
516
516
|
class ___spawn_map_inner_spec(typing_extensions.Protocol[P_INNER, SUPERSELF]):
|
|
@@ -519,7 +519,7 @@ class Function(
|
|
|
519
519
|
|
|
520
520
|
_spawn_map_inner: ___spawn_map_inner_spec[modal._functions.P, typing_extensions.Self]
|
|
521
521
|
|
|
522
|
-
class __spawn_spec(typing_extensions.Protocol[
|
|
522
|
+
class __spawn_spec(typing_extensions.Protocol[ReturnType_INNER, P_INNER, SUPERSELF]):
|
|
523
523
|
def __call__(self, /, *args: P_INNER.args, **kwargs: P_INNER.kwargs) -> FunctionCall[ReturnType_INNER]:
|
|
524
524
|
"""Calls the function with the given arguments, without waiting for the results.
|
|
525
525
|
|
|
@@ -540,7 +540,7 @@ class Function(
|
|
|
540
540
|
"""
|
|
541
541
|
...
|
|
542
542
|
|
|
543
|
-
spawn: __spawn_spec[modal._functions.
|
|
543
|
+
spawn: __spawn_spec[modal._functions.ReturnType, modal._functions.P, typing_extensions.Self]
|
|
544
544
|
|
|
545
545
|
def get_raw_f(self) -> collections.abc.Callable[..., typing.Any]:
|
|
546
546
|
"""Return the inner Python object wrapped by this Modal Function."""
|
|
@@ -1345,6 +1345,14 @@ message FlashProxyUpstreamRequest {
|
|
|
1345
1345
|
double timestamp = 2;
|
|
1346
1346
|
}
|
|
1347
1347
|
|
|
1348
|
+
message FlashSetTargetSlotsMetricsRequest {
|
|
1349
|
+
// TODO(claudia): add other metrics to use in autoscaling decisions
|
|
1350
|
+
string function_id = 1;
|
|
1351
|
+
uint32 target_slots = 2;
|
|
1352
|
+
}
|
|
1353
|
+
|
|
1354
|
+
message FlashSetTargetSlotsMetricsResponse {}
|
|
1355
|
+
|
|
1348
1356
|
message Function {
|
|
1349
1357
|
string module_name = 1;
|
|
1350
1358
|
string function_name = 2;
|
|
@@ -3629,6 +3637,7 @@ service ModalClient {
|
|
|
3629
3637
|
rpc FlashContainerDeregister(FlashContainerDeregisterRequest) returns (google.protobuf.Empty);
|
|
3630
3638
|
rpc FlashContainerList(FlashContainerListRequest) returns (FlashContainerListResponse);
|
|
3631
3639
|
rpc FlashContainerRegister(FlashContainerRegisterRequest) returns (FlashContainerRegisterResponse);
|
|
3640
|
+
rpc FlashSetTargetSlotsMetrics(FlashSetTargetSlotsMetricsRequest) returns (FlashSetTargetSlotsMetricsResponse);
|
|
3632
3641
|
|
|
3633
3642
|
// Functions
|
|
3634
3643
|
rpc FunctionAsyncInvoke(FunctionAsyncInvokeRequest) returns (FunctionAsyncInvokeResponse);
|
|
@@ -262,6 +262,10 @@ class ModalClientBase(abc.ABC):
|
|
|
262
262
|
async def FlashContainerRegister(self, stream: 'grpclib.server.Stream[modal_proto.api_pb2.FlashContainerRegisterRequest, modal_proto.api_pb2.FlashContainerRegisterResponse]') -> None:
|
|
263
263
|
pass
|
|
264
264
|
|
|
265
|
+
@abc.abstractmethod
|
|
266
|
+
async def FlashSetTargetSlotsMetrics(self, stream: 'grpclib.server.Stream[modal_proto.api_pb2.FlashSetTargetSlotsMetricsRequest, modal_proto.api_pb2.FlashSetTargetSlotsMetricsResponse]') -> None:
|
|
267
|
+
pass
|
|
268
|
+
|
|
265
269
|
@abc.abstractmethod
|
|
266
270
|
async def FunctionAsyncInvoke(self, stream: 'grpclib.server.Stream[modal_proto.api_pb2.FunctionAsyncInvokeRequest, modal_proto.api_pb2.FunctionAsyncInvokeResponse]') -> None:
|
|
267
271
|
pass
|
|
@@ -1070,6 +1074,12 @@ class ModalClientBase(abc.ABC):
|
|
|
1070
1074
|
modal_proto.api_pb2.FlashContainerRegisterRequest,
|
|
1071
1075
|
modal_proto.api_pb2.FlashContainerRegisterResponse,
|
|
1072
1076
|
),
|
|
1077
|
+
'/modal.client.ModalClient/FlashSetTargetSlotsMetrics': grpclib.const.Handler(
|
|
1078
|
+
self.FlashSetTargetSlotsMetrics,
|
|
1079
|
+
grpclib.const.Cardinality.UNARY_UNARY,
|
|
1080
|
+
modal_proto.api_pb2.FlashSetTargetSlotsMetricsRequest,
|
|
1081
|
+
modal_proto.api_pb2.FlashSetTargetSlotsMetricsResponse,
|
|
1082
|
+
),
|
|
1073
1083
|
'/modal.client.ModalClient/FunctionAsyncInvoke': grpclib.const.Handler(
|
|
1074
1084
|
self.FunctionAsyncInvoke,
|
|
1075
1085
|
grpclib.const.Cardinality.UNARY_UNARY,
|
|
@@ -2102,6 +2112,12 @@ class ModalClientStub:
|
|
|
2102
2112
|
modal_proto.api_pb2.FlashContainerRegisterRequest,
|
|
2103
2113
|
modal_proto.api_pb2.FlashContainerRegisterResponse,
|
|
2104
2114
|
)
|
|
2115
|
+
self.FlashSetTargetSlotsMetrics = grpclib.client.UnaryUnaryMethod(
|
|
2116
|
+
channel,
|
|
2117
|
+
'/modal.client.ModalClient/FlashSetTargetSlotsMetrics',
|
|
2118
|
+
modal_proto.api_pb2.FlashSetTargetSlotsMetricsRequest,
|
|
2119
|
+
modal_proto.api_pb2.FlashSetTargetSlotsMetricsResponse,
|
|
2120
|
+
)
|
|
2105
2121
|
self.FunctionAsyncInvoke = grpclib.client.UnaryUnaryMethod(
|
|
2106
2122
|
channel,
|
|
2107
2123
|
'/modal.client.ModalClient/FunctionAsyncInvoke',
|