torch-memory-saver 0.0.6__cp39-abi3-manylinux2014_x86_64.whl → 0.0.9rc1__cp39-abi3-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,115 +1,5 @@
-import ctypes
-import logging
-import os
-from contextlib import contextmanager
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
+from .entrypoint import TorchMemorySaver
+from .hooks.mode_preload import configure_subprocess
 
-import torch
-
-logger = logging.getLogger(__name__)
-
-
-class TorchMemorySaver:
-    def __init__(self):
-        self._mem_pool = None
-        self._id = _global_info.next_id()
-        assert self._id == 1, 'Only support one single instance yet (multi-instance will be implemented later)'
-
-    @contextmanager
-    def region(self):
-        if _global_info.binary_info.enabled:
-            self._ensure_mem_pool()
-            with torch.cuda.use_mem_pool(self._mem_pool):
-                _global_info.binary_info.cdll.tms_region_enter()
-                try:
-                    yield
-                finally:
-                    _global_info.binary_info.cdll.tms_region_leave()
-        else:
-            yield
-
-    def pause(self):
-        if _global_info.binary_info.enabled:
-            _global_info.binary_info.cdll.tms_pause()
-
-    def resume(self):
-        if _global_info.binary_info.enabled:
-            _global_info.binary_info.cdll.tms_resume()
-
-    @property
-    def enabled(self):
-        return _global_info.binary_info.enabled
-
-    def _ensure_mem_pool(self):
-        if self._mem_pool is None:
-            self._mem_pool = torch.cuda.MemPool()
-
-
-@dataclass
-class _BinaryInfo:
-    cdll: Optional[ctypes.CDLL]
-
-    @property
-    def enabled(self):
-        return self.cdll is not None
-
-    @staticmethod
-    def compute():
-        env_ld_preload = os.environ.get('LD_PRELOAD', '')
-        if 'torch_memory_saver' in env_ld_preload:
-            return _BinaryInfo(cdll=ctypes.CDLL(env_ld_preload))
-        else:
-            logger.warning(
-                f'TorchMemorySaver is disabled for the current process because invalid LD_PRELOAD="{env_ld_preload}" (process_id={os.getpid()})')
-            return _BinaryInfo(cdll=None)
-
-
-class _GlobalInfo:
-    def __init__(self):
-        self._binary_info: Optional[_BinaryInfo] = None
-        self._last_id = 0
-
-    @property
-    def binary_info(self):
-        if self._binary_info is None:
-            self._binary_info = _BinaryInfo.compute()
-        return self._binary_info
-
-    def next_id(self):
-        self._last_id += 1
-        return self._last_id
-
-
-_global_info = _GlobalInfo()
-
-
-def get_binary_path():
-    dir_package = Path(__file__).parent
-    candidates = [
-        p
-        for d in [dir_package, dir_package.parent]
-        for p in d.glob('torch_memory_saver_cpp.*.so')
-    ]
-    assert len(candidates) == 1, f'{candidates=}'
-    return candidates[0]
-
-
-@contextmanager
-def configure_subprocess():
-    with change_env('LD_PRELOAD', str(get_binary_path())):
-        yield
-
-
-@contextmanager
-def change_env(key: str, value: str):
-    old_value = os.environ.get(key, '')
-    os.environ[key] = value
-    logger.debug(f'change_env set key={key} value={value}')
-    try:
-        yield
-    finally:
-        assert os.environ[key] == value
-        os.environ[key] = old_value
-        logger.debug(f'change_env restore key={key} value={old_value}')
+# Global singleton
+torch_memory_saver = TorchMemorySaver()
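
The old single-file module is replaced by a package, and `__init__.py` now only re-exports the API plus a `torch_memory_saver` global singleton. A minimal usage sketch of that singleton (assuming a CUDA device and a process launched with the preload hook in place, e.g. via `configure_subprocess()` shown further below):

    import torch
    from torch_memory_saver import torch_memory_saver

    # Tensors allocated inside region() become pauseable.
    with torch_memory_saver.region():
        x = torch.zeros(1024, 1024, device="cuda")

    torch_memory_saver.pause()   # release the GPU memory backing x
    torch_memory_saver.resume()  # make x usable again
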
@@ -0,0 +1,31 @@
+import ctypes
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class BinaryWrapper:
+    def __init__(self, path_binary: str):
+        try:
+            self.cdll = ctypes.CDLL(path_binary)
+        except OSError as e:
+            logger.error(f"Failed to load CDLL from {path_binary}: {e}")
+            raise
+
+        _setup_function_signatures(self.cdll)
+
+    def set_config(self, *, tag: str, interesting_region: bool, enable_cpu_backup: bool):
+        self.cdll.tms_set_current_tag(tag.encode("utf-8"))
+        self.cdll.tms_set_interesting_region(interesting_region)
+        self.cdll.tms_set_enable_cpu_backup(enable_cpu_backup)
+
+
+def _setup_function_signatures(cdll):
+    """Define function signatures for the C library"""
+    cdll.tms_set_current_tag.argtypes = [ctypes.c_char_p]
+    cdll.tms_set_interesting_region.argtypes = [ctypes.c_bool]
+    cdll.tms_get_interesting_region.restype = ctypes.c_bool
+    cdll.tms_set_enable_cpu_backup.argtypes = [ctypes.c_bool]
+    cdll.tms_get_enable_cpu_backup.restype = ctypes.c_bool
+    cdll.tms_pause.argtypes = [ctypes.c_char_p]
+    cdll.tms_resume.argtypes = [ctypes.c_char_p]
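
The explicit `argtypes`/`restype` declarations are load-bearing: without them, ctypes passes arguments as C `int` and assumes an `int` return, which would mangle the `c_char_p` tags and `c_bool` results used here. The same pattern against libc, as a standalone illustration (not part of this package; assumes a typical Linux/macOS system):

    import ctypes
    import ctypes.util

    libc = ctypes.CDLL(ctypes.util.find_library("c"))

    # Declaring the signature lets ctypes convert and type-check arguments.
    libc.strlen.argtypes = [ctypes.c_char_p]
    libc.strlen.restype = ctypes.c_size_t

    assert libc.strlen(b"hello") == 5
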
@@ -0,0 +1,142 @@
+import ctypes
+import logging
+import os
+from contextlib import contextmanager
+from typing import Optional
+import torch
+
+from .binary_wrapper import BinaryWrapper
+from .hooks.base import HookUtilBase, HookMode
+
+logger = logging.getLogger(__name__)
+
+_TAG_DEFAULT = "default"
+
+
+class TorchMemorySaver:
+    def __init__(self):
+        self._impl_ctor_kwargs = {}
+        self._impl: Optional[_TorchMemorySaverImpl] = None
+
+    @contextmanager
+    def region(self, tag: str = _TAG_DEFAULT, enable_cpu_backup: bool = False):
+        """Context manager for memory saving with optional tag"""
+        self._ensure_initialized()
+        with self._impl.region(tag=tag, enable_cpu_backup=enable_cpu_backup):
+            yield
+
+    @contextmanager
+    def cuda_graph(
+        self,
+        cuda_graph, pool=None, stream=None, capture_error_mode='global',
+        tag: str = _TAG_DEFAULT, enable_cpu_backup: bool = False,
+    ):
+        """Similar to `torch.cuda.graph`, but ensures the memory in it is pauseable."""
+        self._ensure_initialized()
+        with self._impl.cuda_graph(
+            cuda_graph=cuda_graph,
+            pool=pool, stream=stream, capture_error_mode=capture_error_mode,
+            tag=tag, enable_cpu_backup=enable_cpu_backup,
+        ):
+            yield
+
+    @contextmanager
+    def disable(self):
+        self._ensure_initialized()
+        with self._impl.disable():
+            yield
+
+    def pause(self, tag: Optional[str] = None):
+        """Pause memory for a specific tag, or all memory if tag is None"""
+        self._ensure_initialized()
+        self._impl.pause(tag=tag)
+
+    def resume(self, tag: Optional[str] = None):
+        """Resume memory for a specific tag, or all memory if tag is None"""
+        self._ensure_initialized()
+        self._impl.resume(tag=tag)
+
+    # for compatibility
+    @property
+    def enabled(self):
+        return True
+
+    @property
+    def hook_mode(self):
+        raise AttributeError
+
+    @hook_mode.setter
+    def hook_mode(self, hook_mode: HookMode):
+        assert self._impl_ctor_kwargs is not None, "Cannot configure after initialization"
+        self._impl_ctor_kwargs["hook_mode"] = hook_mode
+
+    def _ensure_initialized(self):
+        if self._impl is not None:
+            return
+        self._impl = _TorchMemorySaverImpl(**self._impl_ctor_kwargs)
+        del self._impl_ctor_kwargs
+
+
+class _TorchMemorySaverImpl:
+    def __init__(self, hook_mode: HookMode = "preload"):
+        self._hook_mode = hook_mode
+        self._hook_util = HookUtilBase.create(hook_mode=hook_mode)
+        self._binary_wrapper = BinaryWrapper(path_binary=self._hook_util.get_path_binary())
+        self._primary_mem_pool = torch.cuda.MemPool(allocator=self._hook_util.get_allocator())
+        _sanity_checks()
+
+    @contextmanager
+    def region(self, tag: str, enable_cpu_backup: bool):
+        with torch.cuda.use_mem_pool(self._primary_mem_pool):
+            with self._with_region_config(tag=tag, enable_cpu_backup=enable_cpu_backup):
+                yield
+
+    @contextmanager
+    def cuda_graph(self, cuda_graph, pool, stream, capture_error_mode, tag: str, enable_cpu_backup: bool):
+        assert self._hook_mode == "preload", "Only hook_mode=preload supports pauseable CUDA Graph currently"
+        with torch.cuda.graph(cuda_graph, pool=pool, stream=stream, capture_error_mode=capture_error_mode):
+            with self._with_region_config(tag=tag, enable_cpu_backup=enable_cpu_backup):
+                yield
+
+    @contextmanager
+    def _with_region_config(self, tag: str, enable_cpu_backup: bool):
+        assert not self._binary_wrapper.cdll.tms_get_interesting_region()
+        original_enable_cpu_backup = self._binary_wrapper.cdll.tms_get_enable_cpu_backup()
+
+        self._binary_wrapper.set_config(tag=tag, interesting_region=True, enable_cpu_backup=enable_cpu_backup)
+        try:
+            yield
+        finally:
+            assert self._binary_wrapper.cdll.tms_get_interesting_region()
+            self._binary_wrapper.set_config(tag=_TAG_DEFAULT, interesting_region=False, enable_cpu_backup=original_enable_cpu_backup)
+
+    @contextmanager
+    def disable(self, dispose_mem_pool_after_use: bool = True):
+        assert dispose_mem_pool_after_use, "Only dispose_mem_pool_after_use=true is supported now"
+        assert self._binary_wrapper.cdll.tms_get_interesting_region(), "disable() should be called only when tms is active"
+
+        self._binary_wrapper.cdll.tms_set_interesting_region(False)
+        try:
+            # We can either reuse the pool or delete it immediately; we currently implement the latter since Slime uses it.
+            # About why we need a pool: https://github.com/fzyzcjy/torch_memory_saver/pull/20#issuecomment-3047099047
+            pool = torch.cuda.MemPool()
+            with torch.cuda.use_mem_pool(pool):
+                yield
+            del pool
+        finally:
+            self._binary_wrapper.cdll.tms_set_interesting_region(True)
+
+    def pause(self, tag: Optional[str]):
+        tag_bytes = tag.encode("utf-8") if tag else None
+        self._binary_wrapper.cdll.tms_pause(tag_bytes)
+
+    def resume(self, tag: Optional[str]):
+        tag_bytes = tag.encode("utf-8") if tag else None
+        self._binary_wrapper.cdll.tms_resume(tag_bytes)
+
+
+def _sanity_checks():
+    if "expandable_segments:True" in os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""):
+        raise RuntimeError(
+            "TorchMemorySaver is disabled for the current process because expandable_segments is not supported yet."
+        )
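
The headline change versus 0.0.6 is that regions are now tagged, so callers can pause and resume disjoint groups of allocations independently, optionally with a CPU backup of the paused contents. A hedged sketch of the intended flow, under the same preload assumptions as above:

    import torch
    from torch_memory_saver import torch_memory_saver

    with torch_memory_saver.region(tag="weights"):
        weights = torch.randn(4096, 4096, device="cuda")

    with torch_memory_saver.region(tag="kv_cache", enable_cpu_backup=True):
        kv_cache = torch.zeros(4096, 4096, device="cuda")

    torch_memory_saver.pause("kv_cache")   # weights stay resident
    torch_memory_saver.resume("kv_cache")
    torch_memory_saver.pause()             # no tag: pause everything
    torch_memory_saver.resume()
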
File without changes
@@ -0,0 +1,21 @@
+from abc import ABC
+from typing import Literal
+
+HookMode = Literal["preload", "torch"]
+
+
+class HookUtilBase(ABC):
+    @staticmethod
+    def create(hook_mode: HookMode) -> "HookUtilBase":
+        from torch_memory_saver.hooks.mode_preload import HookUtilModePreload
+        from torch_memory_saver.hooks.mode_torch import HookUtilModeTorch
+        return {
+            "preload": HookUtilModePreload,
+            "torch": HookUtilModeTorch,
+        }[hook_mode]()
+
+    def get_path_binary(self):
+        raise NotImplementedError
+
+    def get_allocator(self):
+        return None
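
`HookUtilBase.create` is a string-keyed factory over the two hook strategies; the imports live inside the method, presumably to avoid circular imports with the modules that subclass it. The mode is chosen through the `hook_mode` setter on `TorchMemorySaver`, which must run before the lazily constructed implementation exists:

    from torch_memory_saver import torch_memory_saver

    # Valid values are "preload" (the default) and "torch"; the setter
    # asserts that no region()/pause()/resume() call has initialized
    # the implementation yet.
    torch_memory_saver.hook_mode = "torch"
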
@@ -0,0 +1,26 @@
+import logging
+import os
+from contextlib import contextmanager
+from torch_memory_saver.hooks.base import HookUtilBase
+from torch_memory_saver.utils import get_binary_path_from_package, change_env
+
+logger = logging.getLogger(__name__)
+
+
+class HookUtilModePreload(HookUtilBase):
+    def get_path_binary(self):
+        env_ld_preload = os.environ.get("LD_PRELOAD", "")
+        assert "torch_memory_saver" in env_ld_preload, (
+            f"TorchMemorySaver observes invalid LD_PRELOAD. "
+            f"You can use the configure_subprocess() utility, "
+            f"or directly specify `LD_PRELOAD=/path/to/torch_memory_saver_cpp.some-postfix.so python your_script.py`. "
+            f'(LD_PRELOAD="{env_ld_preload}" process_id={os.getpid()})'
+        )
+        return env_ld_preload
+
+
+@contextmanager
+def configure_subprocess():
+    """Configure environment variables for subprocesses. Only needed for hook_mode=preload."""
+    with change_env("LD_PRELOAD", str(get_binary_path_from_package("torch_memory_saver_hook_mode_preload"))):
+        yield
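
In preload mode the hook library has to be injected by the dynamic loader before Python starts, which is why `get_path_binary` can only validate LD_PRELOAD rather than set it. A parent process therefore wraps child creation in `configure_subprocess()`; a sketch (the `worker.py` name is illustrative):

    import subprocess
    from torch_memory_saver import configure_subprocess

    # Inside the context, LD_PRELOAD points at the bundled .so, and the
    # child inherits it; the previous value is restored on exit.
    with configure_subprocess():
        subprocess.run(["python", "worker.py"], check=True)
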
@@ -0,0 +1,19 @@
+import logging
+
+from torch_memory_saver.hooks.base import HookUtilBase
+from torch_memory_saver.utils import get_binary_path_from_package
+from torch.cuda.memory import CUDAPluggableAllocator
+
+logger = logging.getLogger(__name__)
+
+
+class HookUtilModeTorch(HookUtilBase):
+    def __init__(self):
+        self.allocator = CUDAPluggableAllocator(self.get_path_binary(), "tms_torch_malloc", "tms_torch_free")
+        logger.debug(f"HookUtilModeTorch {self.allocator=} {self.get_path_binary()=}")
+
+    def get_path_binary(self):
+        return str(get_binary_path_from_package("torch_memory_saver_hook_mode_torch"))
+
+    def get_allocator(self):
+        return self.allocator.allocator()
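
Torch mode skips LD_PRELOAD entirely: the two C entry points are wired in through PyTorch's `CUDAPluggableAllocator`, and `_TorchMemorySaverImpl` hands the result to `torch.cuda.MemPool(allocator=...)`. Roughly the same mechanism in isolation (the path is illustrative; the package resolves the real one from its own directory):

    import torch
    from torch.cuda.memory import CUDAPluggableAllocator

    alloc = CUDAPluggableAllocator(
        "/path/to/torch_memory_saver_hook_mode_torch.abi3.so",
        "tms_torch_malloc",
        "tms_torch_free",
    )
    pool = torch.cuda.MemPool(allocator=alloc.allocator())
    with torch.cuda.use_mem_pool(pool):
        x = torch.empty(1024, device="cuda")  # served by the hooked allocator
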
@@ -0,0 +1,10 @@
+"""Not to be used by end users, but only for tests of the package itself."""
+
+import torch
+
+
+def get_and_print_gpu_memory(message, gpu_id=0):
+    """Print GPU memory usage with optional message"""
+    mem = torch.cuda.device_memory_used(gpu_id)
+    print(f"GPU {gpu_id} memory: {mem / 1024 ** 3:.2f} GB ({message})")
+    return mem
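
A typical assertion this helper enables in the package's own tests (a sketch; exact numbers depend on the device and whatever else is resident):

    from torch_memory_saver import torch_memory_saver
    from torch_memory_saver.testing_utils import get_and_print_gpu_memory

    before = get_and_print_gpu_memory("before pause")
    torch_memory_saver.pause()
    after = get_and_print_gpu_memory("after pause")
    assert after < before  # paused regions no longer occupy device memory
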
@@ -0,0 +1,27 @@
+import logging
+import os
+from contextlib import contextmanager
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def get_binary_path_from_package(stem: str):
+    dir_package = Path(__file__).parent
+    candidates = [p for d in [dir_package, dir_package.parent] for p in d.glob(f"{stem}.*.so")]
+    assert len(candidates) == 1, f"Expected exactly one {stem} library, found: {candidates}"
+    return candidates[0]
+
+
+# private utils, not to be used by end users
+@contextmanager
+def change_env(key: str, value: str):
+    old_value = os.environ.get(key, "")
+    os.environ[key] = value
+    logger.debug(f"change_env set key={key} value={value}")
+    try:
+        yield
+    finally:
+        assert os.environ[key] == value
+        os.environ[key] = old_value
+        logger.debug(f"change_env restore key={key} value={old_value}")
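
Note the restore semantics of `change_env`: a variable that was unset beforehand comes back as the empty string rather than being deleted, and the `assert` guards against concurrent modification inside the context. A quick illustration of a private helper, shown only to pin down that behavior (the variable name is made up):

    import os
    from torch_memory_saver.utils import change_env

    with change_env("TMS_DEMO_FLAG", "1"):
        assert os.environ["TMS_DEMO_FLAG"] == "1"

    # Restored to the previous value, "" if it was unset before.
    assert os.environ["TMS_DEMO_FLAG"] == ""
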
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: torch_memory_saver
-Version: 0.0.6
+Version: 0.0.9rc1
 Requires-Python: >=3.9
 License-File: LICENSE
 
@@ -0,0 +1,16 @@
+torch_memory_saver_hook_mode_preload.abi3.so,sha256=dQ6PvuqKNXTsbAPSWxP86YkQL82Vo-ngJ9WqB4Zbiss,777208
+torch_memory_saver_hook_mode_torch.abi3.so,sha256=-NE71rUpuYC3Sh8EqaMWbdMbm9C4h6MxeC30gNoqGys,781032
+torch_memory_saver/__init__.py,sha256=9iU_QlTe6OxMR5_OtSRUmvr6ltzk149GjojYvG74sag,154
+torch_memory_saver/binary_wrapper.py,sha256=MeQlPHIuFycamcWp3kOXjVZMiEK8HONuSx4l92J4k_Q,1133
+torch_memory_saver/entrypoint.py,sha256=aFkgqnWRI8vF8EeAL4FvIY33dNVtIbUMk1eM3_xH-fs,5538
+torch_memory_saver/testing_utils.py,sha256=vd9jhMgBLbeEy3vdvbuCjjtO-lRSX-RVB_Dg-wSHVQM,332
+torch_memory_saver/utils.py,sha256=LhtiocZTpMyDEjSexXaGglQtOJeJB7AaH5s43PZX5yo,856
+torch_memory_saver/hooks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+torch_memory_saver/hooks/base.py,sha256=f8Rv_XxNupU80dKWUgE-Ea5pu1qaoXcnzltZrDy90hY,579
+torch_memory_saver/hooks/mode_preload.py,sha256=ELaVloI7T-rjssxn6lujaknNfujxFBxA2oc0SOsiUfk,1041
+torch_memory_saver/hooks/mode_torch.py,sha256=yxGyA8AYrKX7hr3Bawr_MH2AMwSgXxLN76GZaAbQLGU,681
+torch_memory_saver-0.0.9rc1.dist-info/LICENSE,sha256=i806R5xShJFB4k9yNQJ2GYCcSBlu1frTx2vH_nWdWE8,1064
+torch_memory_saver-0.0.9rc1.dist-info/METADATA,sha256=PAMArqZ3_juC25gvir7W_--VffaTK9tO1-422cWOfnA,111
+torch_memory_saver-0.0.9rc1.dist-info/WHEEL,sha256=HUPiMa7ZA9BvJ9gdJRYwZIjK2rWbCcrqYvJ4Onw0owE,102
+torch_memory_saver-0.0.9rc1.dist-info/top_level.txt,sha256=Fdob5gbD3sjPAe3kNfDokaN1sL43cMvwKRLKuR8oitw,91
+torch_memory_saver-0.0.9rc1.dist-info/RECORD,,
@@ -0,0 +1,3 @@
+torch_memory_saver
+torch_memory_saver_hook_mode_preload
+torch_memory_saver_hook_mode_torch
@@ -1,7 +0,0 @@
-torch_memory_saver_cpp.abi3.so,sha256=OCweTnvdmyg5zhUIMJjfH9NW0lYjtcNwqzS9-89cCvQ,315896
-torch_memory_saver/__init__.py,sha256=B3AXwxxJeUbNFKdrfaGzXvl3vTcgPOf2UjaFVtGCZ68,3072
-torch_memory_saver-0.0.6.dist-info/LICENSE,sha256=i806R5xShJFB4k9yNQJ2GYCcSBlu1frTx2vH_nWdWE8,1064
-torch_memory_saver-0.0.6.dist-info/METADATA,sha256=P21LYFkCJHFwaMAkxZBoiQRkhvIQmOBAKbHHoQdQiEI,108
-torch_memory_saver-0.0.6.dist-info/WHEEL,sha256=HUPiMa7ZA9BvJ9gdJRYwZIjK2rWbCcrqYvJ4Onw0owE,102
-torch_memory_saver-0.0.6.dist-info/top_level.txt,sha256=uJ27-bVSKHxdcfHRcakvEr_KQxnUlMia6v19fHbfHxA,42
-torch_memory_saver-0.0.6.dist-info/RECORD,,
@@ -1,2 +0,0 @@
-torch_memory_saver
-torch_memory_saver_cpp
Binary file