torch-memory-saver 0.0.9rc1__tar.gz → 0.0.9rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. torch_memory_saver-0.0.9rc2/MANIFEST.in +1 -0
  2. {torch_memory_saver-0.0.9rc1/torch_memory_saver.egg-info → torch_memory_saver-0.0.9rc2}/PKG-INFO +1 -1
  3. torch_memory_saver-0.0.9rc2/csrc/api_forwarder.h +8 -0
  4. torch_memory_saver-0.0.9rc2/csrc/core.h +96 -0
  5. torch_memory_saver-0.0.9rc2/csrc/macro.h +40 -0
  6. torch_memory_saver-0.0.9rc2/csrc/utils.h +241 -0
  7. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/setup.py +3 -3
  8. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2/torch_memory_saver.egg-info}/PKG-INFO +1 -1
  9. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver.egg-info/SOURCES.txt +5 -0
  10. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/LICENSE +0 -0
  11. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/README.md +0 -0
  12. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/csrc/api_forwarder.cpp +0 -0
  13. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/csrc/core.cpp +0 -0
  14. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/csrc/entrypoint.cpp +0 -0
  15. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/setup.cfg +0 -0
  16. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/test/test_examples.py +0 -0
  17. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/__init__.py +0 -0
  18. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/binary_wrapper.py +0 -0
  19. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/entrypoint.py +0 -0
  20. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/hooks/__init__.py +0 -0
  21. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/hooks/base.py +0 -0
  22. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/hooks/mode_preload.py +0 -0
  23. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/hooks/mode_torch.py +0 -0
  24. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/testing_utils.py +0 -0
  25. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver/utils.py +0 -0
  26. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver.egg-info/dependency_links.txt +0 -0
  27. {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc2}/torch_memory_saver.egg-info/top_level.txt +0 -0
@@ -0,0 +1 @@
+ include csrc/*.h
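
(Note on the MANIFEST.in hunk above: setuptools does not ship header files in the source distribution by default, only the .cpp sources listed in ext_modules, so this new entry is what makes the four csrc/*.h files added below available to from-source builds of rc2.)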
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: torch_memory_saver
- Version: 0.0.9rc1
+ Version: 0.0.9rc2
  Requires-Python: >=3.9
  License-File: LICENSE
  Dynamic: license-file
@@ -0,0 +1,8 @@
+ #pragma once
+ #include <dlfcn.h>
+ #include "macro.h"
+
+ namespace APIForwarder {
+     cudaError_t call_real_cuda_malloc(void **ptr, size_t size);
+     cudaError_t call_real_cuda_free(void *ptr);
+ }
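
The new csrc/api_forwarder.h only declares the forwarders; their definitions live in csrc/api_forwarder.cpp, which is unchanged in this release. For orientation, here is a minimal sketch of how such LD_PRELOAD forwarding is typically written (an illustration under assumptions, not the package's actual implementation):

// Illustrative sketch only (not csrc/api_forwarder.cpp): resolves the real
// CUDA runtime symbols that the preloaded interceptor shadows.
// Requires glibc's RTLD_NEXT (g++ defines _GNU_SOURCE by default).
#include <dlfcn.h>
#include "macro.h"

namespace APIForwarder {
    using CudaMallocFn = cudaError_t (*)(void**, size_t);
    using CudaFreeFn = cudaError_t (*)(void*);

    cudaError_t call_real_cuda_malloc(void** ptr, size_t size) {
        // RTLD_NEXT skips the preloaded library and finds the next
        // definition of cudaMalloc, i.e. the real one in libcudart.
        static auto real = (CudaMallocFn)dlsym(RTLD_NEXT, "cudaMalloc");
        return real(ptr, size);
    }

    cudaError_t call_real_cuda_free(void* ptr) {
        static auto real = (CudaFreeFn)dlsym(RTLD_NEXT, "cudaFree");
        return real(ptr);
    }
}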
@@ -0,0 +1,96 @@
+ #pragma once
+ #include <sys/types.h>
+ #include <stdio.h>
+ #include <unordered_map>
+ #include <mutex>
+ #include <string>
+ #include "utils.h"
+ #include "macro.h"
+
+ enum class AllocationState {
+     // Memory is mapped and accessible
+     ACTIVE,
+     // Memory is unmapped and inaccessible
+     PAUSED
+ };
+
+ struct AllocationMetadata {
+     size_t size;
+     CUdevice device;
+     std::string tag;
+     AllocationState state;
+     bool enable_cpu_backup;
+     void* cpu_backup;
+
+ #if defined(USE_CUDA)
+     CUmemGenericAllocationHandle allocHandle;
+ #elif defined(USE_ROCM)
+     size_t aligned_size;
+     std::vector<hipMemGenericAllocationHandle_t> allocHandles;
+     std::vector<size_t> chunk_sizes;
+ #else
+ #error "USE_PLATFORM is not set"
+ #endif
+ };
+
+ #if defined(USE_ROCM)
+ namespace DeviceUtils {
+     // Simple function to get global device ID from local device ID
+     static int get_global_device_id(hipDevice_t local_device_id) {
+         // Check for HIP_VISIBLE_DEVICES environment variable
+         const char* hip_visible = std::getenv("HIP_VISIBLE_DEVICES");
+
+         if (hip_visible && strlen(hip_visible) > 0) {
+             std::string devices_str(hip_visible);
+             std::stringstream ss(devices_str);
+             std::string device_str;
+             std::vector<int> device_list;
+
+             // Parse comma-separated device list
+             while (std::getline(ss, device_str, ',')) {
+                 if (!device_str.empty()) {
+                     device_list.push_back(std::atoi(device_str.c_str()));
+                 }
+             }
+
+             if (local_device_id < device_list.size()) {
+                 int global_device_id = device_list[local_device_id];
+ #ifdef TMS_DEBUG_LOG
+                 std::cout << "[torch_memory_saver.cpp] HIP_VISIBLE_DEVICES=" << hip_visible
+                           << " local_device_id=" << local_device_id
+                           << " -> global_device_id=" << global_device_id << std::endl;
+ #endif
+                 return global_device_id;
+             }
+         }
+
+         // Fallback: return local device ID as-is
+ #ifdef TMS_DEBUG_LOG
+         std::cout << "[torch_memory_saver.cpp] No HIP_VISIBLE_DEVICES, using local_device_id=" << local_device_id << std::endl;
+ #endif
+         return local_device_id;
+     }
+ }
+ #endif
+
+
+
+ class TorchMemorySaver {
+ public:
+     static TorchMemorySaver& instance();
+
+     cudaError_t malloc(void** ptr, CUdevice device, size_t size, const std::string& tag, bool enable_cpu_backup);
+     cudaError_t free(void* ptr);
+
+     void pause(const std::string& tag);
+     void resume(const std::string& tag);
+
+ private:
+     TorchMemorySaver();
+     ~TorchMemorySaver() = default;
+     TorchMemorySaver(const TorchMemorySaver&) = delete;
+     TorchMemorySaver& operator=(const TorchMemorySaver&) = delete;
+
+     std::mutex allocator_metadata_mutex_;
+     std::unordered_map<void*, AllocationMetadata> allocation_metadata_;
+ };
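
A hedged usage sketch of the singleton declared above, inferred from this header alone (the tag string and size are invented; in the real package, malloc/free arrive through the intercepted allocator rather than direct calls like these):

// Hypothetical caller, for orientation only.
void demo() {
    auto& saver = TorchMemorySaver::instance();

    void* ptr = nullptr;
    CUdevice device = CUDAUtils::cu_ctx_get_device();

    // Recorded in allocation_metadata_ with state = ACTIVE.
    saver.malloc(&ptr, device, 64 * 1024 * 1024, /*tag=*/"kv_cache",
                 /*enable_cpu_backup=*/false);

    // Unmaps the physical pages of every allocation carrying this tag
    // (state -> PAUSED); the virtual addresses stay reserved, so pointer
    // values remain stable but must not be dereferenced while paused.
    saver.pause("kv_cache");

    // Maps fresh physical memory back at the same addresses (state -> ACTIVE).
    saver.resume("kv_cache");

    saver.free(ptr);
}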
@@ -0,0 +1,40 @@
+ #pragma once
+
+ // Define platform macros and include the appropriate headers
+ #if defined(USE_ROCM)
+ // Look up this table to define the macros: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Driver_API_functions_supported_by_HIP.html
+ // Look up this table to define the macros: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Runtime_API_functions_supported_by_HIP.html
+ #include <hip/hip_runtime_api.h>
+ #include <hip/hip_runtime.h>
+ #include <sstream>
+ #include <cstdlib>
+ // Define general aliases
+ #define CUresult hipError_t
+ #define cudaError_t hipError_t
+ #define CUDA_SUCCESS hipSuccess
+ #define cudaSuccess hipSuccess
+ #define cuGetErrorString hipDrvGetErrorString
+ #define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+ #define CUdevice hipDevice_t
+ #define cudaStream_t hipStream_t
+ #define cudaMallocHost hipHostMalloc
+ #define cudaMemcpy hipMemcpy
+ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+ #define cuGetErrorString hipDrvGetErrorString
+ #define cudaGetErrorString hipGetErrorString
+ #define cuMemUnmap hipMemUnmap
+ #define cuMemRelease hipMemRelease
+ // #define cudaMalloc hipMalloc
+ // #define cudaFree hipFree
+ // #define CUdevice hipDevice_t
+ // #define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+ #define MEMCREATE_CHUNK_SIZE (2 * 1024 * 1024)
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+ #elif defined(USE_CUDA)
+ #include <cuda_runtime_api.h>
+ #include <cuda.h>
+
+ #else
+ #error "USE_PLATFORM is not set"
+ #endif
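
The net effect of these aliases is that the rest of csrc/ can be written once against CUDA names and compiled for either backend. A small illustration (not code from this package):

// Illustrative only: with macro.h included and USE_ROCM defined, the CUDA
// names below are rewritten by the #defines to hipMemcpy and
// hipMemcpyDeviceToHost; under USE_CUDA they are the genuine CUDA calls.
#include "macro.h"

cudaError_t copy_to_host(void* host_dst, const void* dev_src, size_t n) {
    return cudaMemcpy(host_dst, dev_src, n, cudaMemcpyDeviceToHost);
}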
@@ -0,0 +1,241 @@
+ #pragma once
+ #include <iostream>
+ #include <vector>
+ #include "macro.h"
+
+ // #define TMS_DEBUG_LOG
+
+ // Cannot use pytorch (libc10.so) since LD_PRELOAD happens earlier than `import torch`
+ // Thus these are copied from torch's Macros.h
+ #if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
+ #define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
+ #define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
+ #else
+ #define C10_LIKELY(expr) (expr)
+ #define C10_UNLIKELY(expr) (expr)
+ #endif
+
+ #define SIMPLE_CHECK(COND, MSG) \
+     do { \
+         if (!(COND)) { \
+             std::cerr << "[torch_memory_saver.cpp] " << MSG \
+                       << " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
+                       << std::endl; \
+             exit(1); \
+         } \
+     } while (false)
+
+ #define CURESULT_CHECK(EXPR) \
+     do { \
+         CUresult __result = (EXPR); \
+         if (__result != CUDA_SUCCESS) { \
+             const char* err_str = nullptr; \
+             cuGetErrorString(__result, &err_str); \
+             std::cerr << "[torch_memory_saver.cpp] CUresult error: " \
+                       << __result << " (" << (err_str ? err_str : "Unknown error") << ") " \
+                       << " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
+                       << std::endl; \
+             exit(1); \
+         } \
+     } while (false)
+
+ #define CUDA_ERROR_CHECK(EXPR) \
+     do { \
+         cudaError_t __result = (EXPR); \
+         if (__result != cudaSuccess) { \
+             const char* err_str = cudaGetErrorString(__result); \
+             std::cerr << "[torch_memory_saver.cpp] cudaError error: " \
+                       << __result << " (" << (err_str ? err_str : "Unknown error") << ") " \
+                       << " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
+                       << std::endl; \
+             exit(1); \
+         } \
+     } while (false)
+
+
+
+ namespace CUDAUtils {
+ #if defined(USE_ROCM)
+
+ #if HIP_VERSION >= 60402000 // rocm/hip 6.4.2
+ #pragma message "Using ROCm/HIP 6.4.2+ implementation"
+     // To be implemented once the ROCm release is >= 6.4.2
+ #else
+ #pragma message "Using ROCm/HIP < 6.4.2 implementation"
+     // hipMemCreate currently has an issue in rocm-6.3.4. Once it is fixed in rocm-7.0, we can implement torch_memory_saver the same way as on the CUDA side.
+     // For now, the implementation is based on a chunk-wise method.
+     static void cu_mem_create_and_map(hipDevice_t device,
+                                       size_t aligned_size,
+                                       void* d_mem,
+                                       std::vector<hipMemGenericAllocationHandle_t>& allocHandles,
+                                       std::vector<size_t>& chunk_sizes) {
+
+         hipMemAllocationProp prop = {};
+         prop.type = hipMemAllocationTypePinned;
+         prop.location.type = hipMemLocationTypeDevice;
+         prop.location.id = device;
+
+         // // Get granularity
+         // size_t granularity;
+         // CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop,
+         //                                               hipMemAllocationGranularityMinimum));
+
+         // // Make sure chunk size is aligned with hardware granularity
+         // size_t aligned_chunk_size = ((MEMCREATE_CHUNK_SIZE + granularity - 1) / granularity) * granularity;
+         // size_t num_chunks = (size + aligned_chunk_size - 1) / aligned_chunk_size;
+
+         // Get granularity; make sure the chunk size is aligned with hardware granularity
+         // size == aligned_size
+         size_t num_chunks = (aligned_size + MEMCREATE_CHUNK_SIZE - 1) / MEMCREATE_CHUNK_SIZE;
+
+         allocHandles.resize(num_chunks);
+         chunk_sizes.resize(num_chunks);
+
+         // Calculate chunk sizes
+         for (size_t i = 0; i < num_chunks; ++i) {
+             // chunk_sizes[i] = MIN(size - i * aligned_chunk_size, aligned_chunk_size);
+             chunk_sizes[i] = MIN(aligned_size - i * MEMCREATE_CHUNK_SIZE, MEMCREATE_CHUNK_SIZE);
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] chunk_sizes[" << i << "] = " << chunk_sizes[i] << std::endl;
+ #endif
+         }
+
+         // Create memory handles for each chunk
+         for (size_t i = 0; i < num_chunks; ++i) {
+             CURESULT_CHECK(hipMemCreate(&allocHandles[i], chunk_sizes[i], &prop, 0));
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] allocHandles[" << i << "] = " << allocHandles[i] << std::endl;
+ #endif
+         }
+
+         // Map each chunk
+         size_t allocated_size = 0;
+         for (size_t i = 0; i < num_chunks; ++i) {
+             void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
+             CURESULT_CHECK(hipMemMap((hipDeviceptr_t)map_addr, chunk_sizes[i], 0, allocHandles[i], 0));
+             allocated_size += chunk_sizes[i];
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] mapped chunk " << i << " at offset " << allocated_size - chunk_sizes[i] << std::endl;
+ #endif
+         }
+
+         // Set access permissions
+         hipMemAccessDesc accessDesc = {};
+         accessDesc.location.type = hipMemLocationTypeDevice;
+         accessDesc.location.id = device;
+         accessDesc.flags = hipMemAccessFlagsProtReadWrite;
+         CURESULT_CHECK(hipMemSetAccess(d_mem, aligned_size, &accessDesc, 1));
+     }
+
+
+     static void cu_mem_unmap_and_release(hipDevice_t device,
+                                          size_t aligned_size,
+                                          hipDeviceptr_t d_mem,
+                                          const std::vector<hipMemGenericAllocationHandle_t>& allocHandles,
+                                          const std::vector<size_t>& chunk_sizes) {
+
+         // Unmap each chunk
+         size_t deallocated_size = 0;
+         for (size_t i = 0; i < allocHandles.size(); ++i) {
+             void* map_addr = (void*)((uintptr_t)d_mem + deallocated_size);
+             CURESULT_CHECK(hipMemUnmap((hipDeviceptr_t)map_addr, chunk_sizes[i]));
+             deallocated_size += chunk_sizes[i];
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] unmapped chunk " << i << " at offset " << deallocated_size - chunk_sizes[i] << std::endl;
+ #endif
+         }
+
+         // Release each handle
+         for (size_t i = 0; i < allocHandles.size(); ++i) {
+             CURESULT_CHECK(hipMemRelease(allocHandles[i]));
+ #ifdef TMS_DEBUG_LOG
+             std::cout << "[torch_memory_saver.cpp] released allocHandles[" << i << "]" << std::endl;
+ #endif
+         }
+     }
+
+     static size_t cu_mem_get_granularity(hipDevice_t device) {
+         hipMemAllocationProp prop = {};
+         prop.type = hipMemAllocationTypePinned;
+         prop.location.type = hipMemLocationTypeDevice;
+         prop.location.id = device;
+
+         size_t granularity;
+         CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop, hipMemAllocationGranularityMinimum));
+         return granularity;
+     }
+
+     static CUdevice cu_ctx_get_device() {
+         CUdevice ans;
+         CURESULT_CHECK(hipCtxGetDevice(&ans));
+         return ans;
+     }
+
+     static CUdevice cu_device_get(int device_ordinal) {
+         CUdevice ans;
+         CURESULT_CHECK(hipDeviceGet(&ans, device_ordinal));
+         return ans;
+     }
+ #endif
+
+ #elif defined(USE_CUDA)
+     static void cu_mem_create(CUmemGenericAllocationHandle *alloc_handle, size_t size, CUdevice device) {
+         CUmemAllocationProp prop = {};
+         prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+         prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+         prop.location.id = device;
+
+         int flag = 0;
+         CURESULT_CHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, device));
+         if (flag) { // support GPUDirect RDMA if possible
+             prop.allocFlags.gpuDirectRDMACapable = 1;
+         }
+
+         CURESULT_CHECK(cuMemCreate(alloc_handle, size, &prop, 0));
+     }
+
+     static void cu_mem_set_access(void *ptr, size_t size, CUdevice device) {
+         CUmemAccessDesc access_desc = {};
+         access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+         access_desc.location.id = device;
+         access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+         CURESULT_CHECK(cuMemSetAccess((CUdeviceptr) ptr, size, &access_desc, 1));
+     }
+
+     static CUdevice cu_ctx_get_device() {
+         CUdevice ans;
+         CURESULT_CHECK(cuCtxGetDevice(&ans));
+         return ans;
+     }
+
+     static CUdevice cu_device_get(int device_ordinal) {
+         CUdevice ans;
+         CURESULT_CHECK(cuDeviceGet(&ans, device_ordinal));
+         return ans;
+     }
+
+ #else
+ #error "USE_PLATFORM is not set"
+
+ #endif
+ }
+
+ inline bool get_bool_env_var(const char* name) {
+     const char* env_cstr = std::getenv(name);
+     if (env_cstr == nullptr) {
+         return false;
+     }
+
+     std::string env_str(env_cstr);
+     if (env_str == "1" || env_str == "true" || env_str == "TRUE" || env_str == "yes" || env_str == "YES") {
+         return true;
+     }
+     if (env_str == "0" || env_str == "false" || env_str == "FALSE" || env_str == "no" || env_str == "NO") {
+         return false;
+     }
+
+     std::cerr << "[torch_memory_saver.cpp] Unsupported environment variable value"
+               << " name=" << name << " value=" << env_str
+               << std::endl;
+     exit(1);
+ }
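
To see how the ROCm helpers above compose, here is a hedged sketch of the allocation path a caller (presumably core.cpp, unchanged in this diff) would follow; the hipMemAddressReserve call and the rounding logic are assumptions, not code from this file:

// Hypothetical allocation flow using the helpers above (ROCm < 6.4.2 branch).
static void alloc_sketch(void** out, hipDevice_t device, size_t size) {
    // Round the request up to the hardware allocation granularity.
    size_t granularity = CUDAUtils::cu_mem_get_granularity(device);
    size_t aligned_size = ((size + granularity - 1) / granularity) * granularity;

    // Reserve a contiguous virtual range, then back it with 2 MiB physical
    // chunks; pause/resume can later unmap and remap the physical backing
    // while the virtual address (and thus every tensor pointer) is unchanged.
    void* d_mem = nullptr;
    CURESULT_CHECK(hipMemAddressReserve(&d_mem, aligned_size, granularity, nullptr, 0));

    std::vector<hipMemGenericAllocationHandle_t> allocHandles;
    std::vector<size_t> chunk_sizes;
    CUDAUtils::cu_mem_create_and_map(device, aligned_size, d_mem, allocHandles, chunk_sizes);

    *out = d_mem;
}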
@@ -102,7 +102,7 @@ def _create_ext_modules(platform):
          str((platform_home / 'lib64').resolve()),
          str((platform_home / 'lib64/stubs').resolve()),
      ]
-     libraries = ['cuda']
+     libraries = ['cuda', 'cudart']
      platform_macros = [('USE_CUDA', '1')]
      extra_compile_args = ['-std=c++17', '-O3']
 
@@ -146,9 +146,9 @@ class build_ext_for_platform(build_platform_ext):
 
  setup(
      name='torch_memory_saver',
-     version='0.0.9rc1',
+     version='0.0.9rc2',
      ext_modules=ext_modules,
      cmdclass={'build_ext': build_ext_for_platform},
      python_requires=">=3.9",
      packages=setuptools.find_packages(include=["torch_memory_saver", "torch_memory_saver.*"]),
- )
+ )
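
(The new cudart entry links the CUDA runtime library in addition to the driver library; plausibly needed because the new headers call runtime-API functions such as cudaGetErrorString in CUDA_ERROR_CHECK, and macro.h aliases cudaMallocHost/cudaMemcpy for the CPU-backup path.)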
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: torch_memory_saver
- Version: 0.0.9rc1
+ Version: 0.0.9rc2
  Requires-Python: >=3.9
  License-File: LICENSE
  Dynamic: license-file
@@ -1,9 +1,14 @@
  LICENSE
+ MANIFEST.in
  README.md
  setup.py
  csrc/api_forwarder.cpp
+ csrc/api_forwarder.h
  csrc/core.cpp
+ csrc/core.h
  csrc/entrypoint.cpp
+ csrc/macro.h
+ csrc/utils.h
  test/test_examples.py
  torch_memory_saver/__init__.py
  torch_memory_saver/binary_wrapper.py