torch-memory-saver 0.0.6__tar.gz → 0.0.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: torch_memory_saver
- Version: 0.0.6
+ Version: 0.0.8
  Summary: UNKNOWN
  Home-page: UNKNOWN
  License: UNKNOWN
@@ -0,0 +1,74 @@
+ # Torch Memory Saver
+
+ A PyTorch library that allows tensor memory to be temporarily released and resumed later.
+
+ During the pause:
+ - Physical memory is released
+ - Virtual address is preserved
+
+ When resumed:
+ - Physical memory is re-allocated and mapped back to the preserved virtual address
+
+ Please refer to https://github.com/sgl-project/sglang/issues/2542#issuecomment-2563641647 for details.
+
+ ## Examples
+
+ ### Basic Example
+
+ ```python
+ import torch
+ import torch_memory_saver
+
+ memory_saver = torch_memory_saver.memory_saver
+
+ # 1. For tensors that you want to be pausable, create them within `region`
+ with memory_saver.region():
+     pauseable_tensor = torch.full((1_000_000_000,), 100, dtype=torch.uint8, device='cuda')
+
+ # 2. After `pause`, CUDA memory is released for those tensors.
+ #    For example, check `nvidia-smi`'s memory usage to verify.
+ memory_saver.pause()
+
+ # 3. After `resume`, CUDA memory is re-occupied for those tensors.
+ memory_saver.resume()
+ ```
+
+ ### Multiple Tags Example
+
+ Please refer to https://github.com/sgl-project/sglang/issues/7009 for details.
+
+ ```python
+ import torch
+ from torch_memory_saver import torch_memory_saver
+
+ # 1. Create tensors with different tags
+ with torch_memory_saver.region(tag="type1"):
+     tensor1 = torch.full((5_000_000_000,), 100, dtype=torch.uint8, device='cuda')
+
+ with torch_memory_saver.region(tag="type2"):
+     tensor2 = torch.full((5_000_000_000,), 100, dtype=torch.uint8, device='cuda')
+
+ # 2. Pause and resume the tags selectively
+ torch_memory_saver.pause("type1")
+ torch_memory_saver.pause("type2")
+
+ torch_memory_saver.resume("type2")
+ torch_memory_saver.resume("type1")
+
+ torch_memory_saver.pause("type1")
+ torch_memory_saver.resume("type1")
+ ```
+
+ ## Development
+
+ ```bash
+ pip install -e .
+ ```
+
+ A `torch_memory_saver_cpp.abi3.so` will be built under the `{your_workspace}/torch_memory_saver/` folder.
+
+ You can use these commands for local testing:
+ ```bash
+ LD_PRELOAD={your_workspace}/torch_memory_saver/torch_memory_saver_cpp.abi3.so python examples/simple.py
+
+ LD_PRELOAD={your_workspace}/torch_memory_saver/torch_memory_saver_cpp.abi3.so python examples/rl_with_cuda_graph.py
+ ```
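
As a side note to the README above (not part of the packaged file): a minimal sketch of how the release can be verified programmatically, assuming the process was launched with `LD_PRELOAD` pointing at the built library as in the Development section. `torch.cuda.mem_get_info()` asks the driver for free/total device memory, so the pause should be visible there even though PyTorch's own allocator statistics still count the tensor.

```python
import torch
from torch_memory_saver import torch_memory_saver  # singleton introduced in 0.0.8


def free_gib() -> float:
    free, _total = torch.cuda.mem_get_info()
    return free / 2**30


with torch_memory_saver.region():
    t = torch.full((1_000_000_000,), 100, dtype=torch.uint8, device='cuda')

print(f"before pause: {free_gib():.2f} GiB free")
torch_memory_saver.pause()   # physical memory released, virtual address kept
print(f"after pause:  {free_gib():.2f} GiB free")   # expect roughly +1 GiB
torch_memory_saver.resume()  # physical memory mapped back at the same address
print(f"after resume: {free_gib():.2f} GiB free")
```

Note that the resume path in the C++ changes below creates a new allocation handle and maps it at the preserved address, so the tensor is usable again but its previous contents should not be relied upon.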
@@ -6,6 +6,7 @@
  #include <dlfcn.h>
  #include <unordered_map>
  #include <mutex>
+ #include <string>

  // #define TMS_DEBUG_LOG

@@ -118,32 +119,32 @@ struct _AllocationMetadata {
      size_t size;
      CUdevice device;
      CUmemGenericAllocationHandle allocHandle;
+     std::string tag;
  };

  class TorchMemorySaver {
  public:
      TorchMemorySaver() {}

-     cudaError_t malloc(void **ptr, size_t size) {
+     cudaError_t malloc(void **ptr, size_t size, const std::string& tag) {
          CUdevice device;
          CURESULT_CHECK(cuCtxGetDevice(&device));

          CUmemGenericAllocationHandle allocHandle;
          CUDAUtils::cu_mem_create(&allocHandle, size, device);
-
          CURESULT_CHECK(cuMemAddressReserve((CUdeviceptr *) ptr, size, 0, 0, 0));
          CURESULT_CHECK(cuMemMap((CUdeviceptr) * ptr, size, 0, allocHandle, 0));
          CUDAUtils::cu_mem_set_access(*ptr, size, device);

          {
-             const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
-             allocation_metadata_.emplace(*ptr, _AllocationMetadata{size, device, allocHandle});
+             const std::lock_guard<std::mutex> lock(allocator_metadata_mutex_);
+             allocation_metadata_.emplace(*ptr, _AllocationMetadata{size, device, allocHandle, tag});
          }

  #ifdef TMS_DEBUG_LOG
          std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
                    << " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
-                   << " allocHandle=" << allocHandle
+                   << " allocHandle=" << allocHandle << " tag=" << tag
                    << std::endl;
  #endif

@@ -166,39 +167,47 @@ public:
  #ifdef TMS_DEBUG_LOG
          std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_free "
                    << " ptr=" << ptr << " metadata.size=" << metadata.size
-                   << " metadata.allocHandle=" << metadata.allocHandle
+                   << " metadata.allocHandle=" << metadata.allocHandle << " tag=" << metadata.tag
                    << std::endl;
  #endif

          return cudaSuccess;
      }

-     void pause() {
+     void pause(const std::string& tag) {
          const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);

          for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
              void *ptr = it->first;
              _AllocationMetadata metadata = it->second;

+             if (!tag.empty() && metadata.tag != tag) {
+                 continue;
+             }
+
              CURESULT_CHECK(cuMemUnmap((CUdeviceptr) ptr, metadata.size));
              CURESULT_CHECK(cuMemRelease(metadata.allocHandle));

  #ifdef TMS_DEBUG_LOG
              std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.pause"
                        << " ptr=" << ptr << " metadata.size=" << metadata.size << " metadata.allocHandle="
-                       << metadata.allocHandle
+                       << metadata.allocHandle << " tag=" << metadata.tag << " filter_tag=" << tag
                        << std::endl;
  #endif
          }
      }

-     void resume() {
+     void resume(const std::string& tag) {
          const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);

          for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
              void *ptr = it->first;
              _AllocationMetadata &metadata = it->second;

+             if (!tag.empty() && metadata.tag != tag) {
+                 continue;
+             }
+
              CUmemGenericAllocationHandle newAllocHandle;
              CUDAUtils::cu_mem_create(&newAllocHandle, metadata.size, metadata.device);

@@ -210,7 +219,7 @@ public:
              std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.resume"
                        << " ptr=" << ptr << " metadata.size=" << metadata.size << " (old)metadata.allocHandle="
                        << metadata.allocHandle
-                       << " (new)newAllocHandle=" << newAllocHandle
+                       << " (new)newAllocHandle=" << newAllocHandle << " tag=" << metadata.tag << " filter_tag=" << tag
                        << std::endl;
  #endif

@@ -223,14 +232,18 @@ public:
          return instance;
      }

+
  private:
-     // Similar to torch's CUDACachingAllocator and CUDAPluggableAllocator
      std::mutex allocator_metadata_mutex_;
      std::unordered_map<void *, _AllocationMetadata> allocation_metadata_;
  };

+
+ // ----------------------------------------------- region manager --------------------------------------------------
+
  namespace RegionManager {
      static thread_local bool is_interesting_region_ = false;
+     static thread_local std::string current_tag_ = "default";

      void enter() {
  #ifdef TMS_DEBUG_LOG
@@ -249,13 +262,21 @@ namespace RegionManager {
      bool is_interesting_region() {
          return is_interesting_region_;
      }
+
+     void set_current_tag(const std::string& tag) {
+         current_tag_ = tag;
+     }
+
+     const std::string& get_current_tag() {
+         return current_tag_;
+     }
  }

  // ------------------------------------------------- entrypoints ------------------------------------------------

  cudaError_t cudaMalloc(void **ptr, size_t size) {
      if (RegionManager::is_interesting_region()) {
-         return TorchMemorySaver::instance().malloc(ptr, size);
+         return TorchMemorySaver::instance().malloc(ptr, size, RegionManager::get_current_tag());
      } else {
          return APIForwarder::call_real_cuda_malloc(ptr, size);
      }
@@ -278,11 +299,21 @@ void tms_region_leave() {
      RegionManager::leave();
  }

- void tms_pause() {
-     TorchMemorySaver::instance().pause();
+ void tms_set_current_tag(const char* tag) {
+     if (tag == nullptr) {
+         std::cerr << "[torch_memory_saver.cpp] FATAL: NULL tag passed to tms_set_current_tag" << std::endl;
+         exit(1);
+     }
+     RegionManager::set_current_tag(std::string(tag));
  }

- void tms_resume() {
-     TorchMemorySaver::instance().resume();
+ void tms_pause(const char* tag) {
+     std::string tag_str = (tag != nullptr) ? std::string(tag) : "";
+     TorchMemorySaver::instance().pause(tag_str);
  }
+
+ void tms_resume(const char* tag) {
+     std::string tag_str = (tag != nullptr) ? std::string(tag) : "";
+     TorchMemorySaver::instance().resume(tag_str);
  }
+ }
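
In the entrypoints above, a NULL tag is forwarded as an empty `std::string`, and the pause/resume loops skip the tag filter when the string is empty, so an untagged call affects every allocation made inside a `region`. A minimal ctypes sketch of that behavior (assuming the preloaded `.so` is the single entry on `LD_PRELOAD`; in normal use the `TorchMemorySaver` Python wrapper shown further down does this for you):

```python
import ctypes
import os

# Hypothetical direct use of the preloaded binary, mirroring what the package's
# _BinaryInfo helper does when it loads the library from LD_PRELOAD.
lib = ctypes.CDLL(os.environ["LD_PRELOAD"])
lib.tms_set_current_tag.argtypes = [ctypes.c_char_p]
lib.tms_pause.argtypes = [ctypes.c_char_p]
lib.tms_resume.argtypes = [ctypes.c_char_p]

lib.tms_pause(b"type1")    # only allocations tagged "type1" are unmapped
lib.tms_resume(b"type1")   # remap just that tag
lib.tms_pause(None)        # NULL -> empty tag on the C++ side -> no filter: pause everything
lib.tms_resume(None)       # resume everything
```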
@@ -1,8 +1,9 @@
+
  import logging
  import os
  import shutil
  from pathlib import Path
-
+ import platform
  import setuptools
  from setuptools import setup

@@ -24,25 +25,28 @@ def _find_cuda_home():
          cuda_home = '/usr/local/cuda'
      return cuda_home

-
  cuda_home = Path(_find_cuda_home())
+
  include_dirs = [
-     str(cuda_home.resolve() / 'targets/x86_64-linux/include'),
+     str((cuda_home / 'include').resolve()),
  ]
+
  library_dirs = [
-     str(cuda_home.resolve() / 'lib64'),
-     str(cuda_home.resolve() / 'lib64/stubs'),
+     str((cuda_home / 'lib64').resolve()),
+     str((cuda_home / 'lib64/stubs').resolve()),
  ]

  setup(
      name='torch_memory_saver',
-     version='0.0.6',
+     version='0.0.8',
      ext_modules=[setuptools.Extension(
          'torch_memory_saver_cpp',
          ['csrc/torch_memory_saver.cpp'],
          include_dirs=include_dirs,
          library_dirs=library_dirs,
-         libraries=['cuda']
+         libraries=['cuda'],
+         define_macros=[('Py_LIMITED_API', '0x03090000')],
+         py_limited_api=True,
      )],
      python_requires=">=3.9",
      packages=['torch_memory_saver'],
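
The new `Py_LIMITED_API`/`py_limited_api` settings build the extension against CPython's stable ABI, which matches the `torch_memory_saver_cpp.abi3.so` name mentioned in the README and lets a single wheel per platform cover CPython 3.9 and newer. A small sketch (assuming an editable install in the current workspace, as in the README's Development section) to confirm the stable-ABI artifact is where the package's `get_binary_path()` will look for it:

```python
from pathlib import Path

# After `pip install -e .`, the extension should sit next to the package sources.
candidates = sorted(Path("torch_memory_saver").glob("torch_memory_saver_cpp.*.so"))
print(candidates)  # expected to contain torch_memory_saver_cpp.abi3.so
```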
@@ -14,29 +14,34 @@ logger = logging.getLogger(__name__)
  class TorchMemorySaver:
      def __init__(self):
          self._mem_pool = None
-         self._id = _global_info.next_id()
-         assert self._id == 1, 'Only support one single instance yet (multi-instance will be implemented later)'

      @contextmanager
-     def region(self):
+     def region(self, tag: str = "default"):
+         """Context manager for memory saving with optional tag"""
          if _global_info.binary_info.enabled:
              self._ensure_mem_pool()
              with torch.cuda.use_mem_pool(self._mem_pool):
+                 _global_info.binary_info.cdll.tms_set_current_tag(tag.encode('utf-8'))
                  _global_info.binary_info.cdll.tms_region_enter()
                  try:
                      yield
                  finally:
+                     _global_info.binary_info.cdll.tms_set_current_tag(b"default")
                      _global_info.binary_info.cdll.tms_region_leave()
          else:
              yield

-     def pause(self):
+     def pause(self, tag: Optional[str] = None):
+         """Pause memory for specific tag or all memory if tag is None"""
          if _global_info.binary_info.enabled:
-             _global_info.binary_info.cdll.tms_pause()
+             tag_bytes = tag.encode('utf-8') if tag else None
+             _global_info.binary_info.cdll.tms_pause(tag_bytes)

-     def resume(self):
+     def resume(self, tag: Optional[str] = None):
+         """Resume memory for specific tag or all memory if tag is None"""
          if _global_info.binary_info.enabled:
-             _global_info.binary_info.cdll.tms_resume()
+             tag_bytes = tag.encode('utf-8') if tag else None
+             _global_info.binary_info.cdll.tms_resume(tag_bytes)

      @property
      def enabled(self):
@@ -46,7 +51,6 @@ class TorchMemorySaver:
          if self._mem_pool is None:
              self._mem_pool = torch.cuda.MemPool()

-
  @dataclass
  class _BinaryInfo:
      cdll: Optional[ctypes.CDLL]
@@ -55,21 +59,39 @@ class _BinaryInfo:
      def enabled(self):
          return self.cdll is not None

+     @staticmethod
+     def _setup_function_signatures(cdll):
+         """Define function signatures for the C library"""
+         cdll.tms_region_enter.argtypes = []
+         cdll.tms_region_leave.argtypes = []
+         cdll.tms_set_current_tag.argtypes = [ctypes.c_char_p]
+         cdll.tms_pause.argtypes = [ctypes.c_char_p]
+         cdll.tms_resume.argtypes = [ctypes.c_char_p]
+
      @staticmethod
      def compute():
          env_ld_preload = os.environ.get('LD_PRELOAD', '')
          if 'torch_memory_saver' in env_ld_preload:
-             return _BinaryInfo(cdll=ctypes.CDLL(env_ld_preload))
+             try:
+                 cdll = ctypes.CDLL(env_ld_preload)
+                 _BinaryInfo._setup_function_signatures(cdll)
+                 return _BinaryInfo(cdll=cdll)
+             except OSError as e:
+                 logger.error(f'Failed to load CDLL from {env_ld_preload}: {e}')
+                 return _BinaryInfo(cdll=None)
          else:
-             logger.warning(
-                 f'TorchMemorySaver is disabled for the current process because invalid LD_PRELOAD="{env_ld_preload}" (process_id={os.getpid()})')
+             print(
+                 f'TorchMemorySaver is disabled for the current process because invalid LD_PRELOAD. '
+                 f'You can use configure_subprocess() utility, '
+                 f'or directly specify `LD_PRELOAD=/path/to/torch_memory_saver_cpp.some-postfix.so python your_script.py. '
+                 f'(LD_PRELOAD="{env_ld_preload}" process_id={os.getpid()})'
+             )
              return _BinaryInfo(cdll=None)


  class _GlobalInfo:
      def __init__(self):
          self._binary_info: Optional[_BinaryInfo] = None
-         self._last_id = 0

      @property
      def binary_info(self):
@@ -77,13 +99,11 @@ class _GlobalInfo:
              self._binary_info = _BinaryInfo.compute()
          return self._binary_info

-     def next_id(self):
-         self._last_id += 1
-         return self._last_id
-

  _global_info = _GlobalInfo()

+ # Global singleton instance
+ torch_memory_saver = TorchMemorySaver()

  def get_binary_path():
      dir_package = Path(__file__).parent
@@ -92,7 +112,7 @@ def get_binary_path():
          for d in [dir_package, dir_package.parent]
          for p in d.glob('torch_memory_saver_cpp.*.so')
      ]
-     assert len(candidates) == 1, f'{candidates=}'
+     assert len(candidates) == 1, f'Expected exactly one torch_memory_saver_cpp library, found: {candidates}'
      return candidates[0]


@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: torch-memory-saver
- Version: 0.0.6
+ Version: 0.0.8
  Summary: UNKNOWN
  Home-page: UNKNOWN
  License: UNKNOWN
@@ -1,29 +0,0 @@
- # torch_memory_saver
-
- Allow torch tensor memory to be released and resumed later.
-
- API:
-
- ```python
- memory_saver = TorchMemorySaver()
-
- # 1. For tensors that wants to be paused, create them within `region`
- with memory_saver.region():
-     x = torch.full((1_000_000_000,), 100, dtype=torch.uint8, device='cuda')
-
- # 2. After `pause`, CUDA memory is released for those tensors.
- # For example, check `nvidia-smi`'s memory usage to verify.
- memory_saver.pause()
-
- # 3. After `resume`, CUDA memory is re-occupied for those tensors.
- memory_saver.resume()
- ```
-
- Please refer to https://github.com/sgl-project/sglang/issues/2542#issuecomment-2563641647 for details.
-
- TODO:
-
- - [x] Implementation
- - [x] Publish to pypi
- - [ ] More tests and infra
- - [ ] Documentation