PyPI - torch-memory-saver - Versions diffs - 0.0.9rc2__tar.gz → 0.0.9rc3__tar.gz - Mend

torch-memory-saver 0.0.9rc2__tar.gz → 0.0.9rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

{torch_memory_saver-0.0.9rc2/torch_memory_saver.egg-info → torch_memory_saver-0.0.9rc3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torch_memory_saver
-Version: 0.0.9rc2
+Version: 0.0.9rc3
 Requires-Python: >=3.9
 License-File: LICENSE
 Dynamic: license-file

{torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/csrc/core.cpp RENAMED Viewed

@@ -3,6 +3,10 @@
 #include "macro.h"
 #include "api_forwarder.h"
+#if defined(USE_ROCM)
+#include "hardware_amd_support.h"
+#endif
 TorchMemorySaver::TorchMemorySaver() {}
 TorchMemorySaver &TorchMemorySaver::instance() {
@@ -12,85 +16,7 @@ TorchMemorySaver &TorchMemorySaver::instance() {
 cudaError_t TorchMemorySaver::malloc(void **ptr, CUdevice device, size_t size, const std::string& tag, const bool enable_cpu_backup) {
 #if defined(USE_ROCM)
-    // hipDevice_t device;
-    CURESULT_CHECK(hipCtxGetDevice(&device));
-    // // Get granularity and calculate aligned size
-    // size_t granularity = CUDAUtils::cu_mem_get_granularity(device);
-    // size_t aligned_size = (size + granularity - 1) & ~(granularity - 1);
-    // //// Reserve aligned memory address, rocm will check granularity
-    // CURESULT_CHECK(hipMemAddressReserve((hipDeviceptr_t *)ptr, aligned_size, granularity, 0, 0));
-    hipMemAllocationProp prop = {};
-    prop.type = hipMemAllocationTypePinned;
-    prop.location.type = hipMemLocationTypeDevice;
-    prop.location.id = device;
-    prop.allocFlags.compressionType = 0x0;
-    size_t granularity;
-    CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop,
-                                            hipMemAllocationGranularityMinimum));
-    size_t aligned_size = ((size + granularity - 1) / granularity) * granularity;
-    aligned_size = (aligned_size + MEMCREATE_CHUNK_SIZE - 1) / MEMCREATE_CHUNK_SIZE * MEMCREATE_CHUNK_SIZE;
-    assert(MEMCREATE_CHUNK_SIZE % granularity == 0);
-    assert(aligned_size % MEMCREATE_CHUNK_SIZE == 0);
-    assert(aligned_size % granularity == 0);
-    // Create allocation metadata
-    AllocationMetadata metadata;
-    metadata.size = size;
-    metadata.aligned_size = aligned_size;
-    metadata.device = device;
-    //// Not sure (Check these parameters)
-    metadata.tag = tag;
-    metadata.enable_cpu_backup = enable_cpu_backup;
-    metadata.cpu_backup = nullptr;
-    ////
-    // Get global device ID using our utility function
-    int global_device_id = DeviceUtils::get_global_device_id(device);
-    // rewrite numa node
-    uint64_t node_id = 0;
-    if (global_device_id > 3) {
-        node_id = 1;
-    }
-#ifdef TMS_DEBUG_LOG
-    std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
-              << " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
-              << " granularity=" << granularity
-              << " aligned_size=" << aligned_size
-              << " node_id=" << node_id
-              << " device=" << device
-              << " global_device_id=" << global_device_id
-              << std::endl;
-#endif
-    hipDeviceptr_t d_mem;
-    // Reserve aligned memory address, rocm will check granularity
-    CURESULT_CHECK(hipMemAddressReserve(&d_mem, aligned_size, granularity, 0, node_id));
-    *ptr = (void*)d_mem;
-    // Create and map chunks
-    // CUDAUtils::cu_mem_create_and_map(device, size, (hipDeviceptr_t)*ptr,
-    CUDAUtils::cu_mem_create_and_map(device, aligned_size, (hipDeviceptr_t)*ptr,
-                                    metadata.allocHandles, metadata.chunk_sizes);
-    size_t num_chunks = metadata.allocHandles.size();
-    {
-        const std::lock_guard<std::mutex> lock(allocator_metadata_mutex_);
-        allocation_metadata_.emplace(*ptr, std::move(metadata));
-    }
-#ifdef TMS_DEBUG_LOG
-    std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
-              << " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
-              << " metadata.aligned_size=" << metadata.aligned_size
-              << " num_chunks=" << num_chunks
-              << std::endl;
-#endif
+    return ROCmHIPImplementation::rocm_malloc(ptr, device, size, tag, enable_cpu_backup, allocation_metadata_, allocator_metadata_mutex_);
 #elif defined(USE_CUDA)
     CUmemGenericAllocationHandle allocHandle;
@@ -122,28 +48,8 @@ cudaError_t TorchMemorySaver::malloc(void **ptr, CUdevice device, size_t size, c
 cudaError_t TorchMemorySaver::free(void *ptr) {
 #if defined(USE_ROCM)
-    AllocationMetadata metadata;
-    {
-        const std::lock_guard<std::mutex> lock(allocator_metadata_mutex_);
-        SIMPLE_CHECK(allocation_metadata_.count(ptr), "Trying to free a pointer not allocated here");
-        metadata = std::move(allocation_metadata_[ptr]);
-        allocation_metadata_.erase(ptr);
-    }
-    // Unmap and release chunks
-    CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.size,
-                                        (hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
+    return ROCmHIPImplementation::rocm_free(ptr, allocation_metadata_, allocator_metadata_mutex_);
-    // Free the reserved address using stored aligned_size
-    CURESULT_CHECK(hipMemAddressFree((hipDeviceptr_t)ptr, metadata.aligned_size));
-#ifdef TMS_DEBUG_LOG
-    std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_free "
-              << " ptr=" << ptr << " metadata.size=" << metadata.size
-              << " metadata.aligned_size=" << metadata.aligned_size
-              << " num_chunks=" << metadata.allocHandles.size()
-              << std::endl;
-#endif
 #elif defined(USE_CUDA)
     AllocationMetadata metadata;
     {
@@ -179,41 +85,12 @@ cudaError_t TorchMemorySaver::free(void *ptr) {
 }
 void TorchMemorySaver::pause(const std::string& tag) {
-    const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
 #if defined(USE_ROCM)
-    for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
-        void *ptr = it->first;
-        AllocationMetadata &metadata = it->second;
+    ROCmHIPImplementation::rocm_pause(tag, allocation_metadata_, allocator_metadata_mutex_);
-        if (!tag.empty() && metadata.tag != tag) {
-            continue;
-        }
-        // Copy CUDA's code supporting cpu_backup to here
-        if (metadata.enable_cpu_backup) {
-            if (nullptr == metadata.cpu_backup) {
-                CUDA_ERROR_CHECK(hipMallocHost(&metadata.cpu_backup, metadata.aligned_size));
-            }
-            SIMPLE_CHECK(metadata.cpu_backup != nullptr, "cpu_backup should not be nullptr");
-            // TODO may use cudaMemcpyAsync if needed
-            CUDA_ERROR_CHECK(cudaMemcpy(metadata.cpu_backup, ptr, metadata.aligned_size, hipMemcpyDeviceToHost));
-        }
-        //
-        // Unmap and release chunks (but keep metadata for resume)
-        // CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.size,
-        CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.aligned_size,
-                                            (hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
-        #ifdef TMS_DEBUG_LOG
-            std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.pause"
-                    << " ptr=" << ptr << " metadata.size=" << metadata.size
-                    << " metadata.aligned_size=" << metadata.aligned_size
-                    << " num_chunks=" << metadata.allocHandles.size()
-                    << std::endl;
-        #endif
-    }
 #elif defined(USE_CUDA)
+    const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
     for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
         void *ptr = it->first;
         AllocationMetadata& metadata = it->second;
@@ -258,32 +135,12 @@ void TorchMemorySaver::pause(const std::string& tag) {
 }
 void TorchMemorySaver::resume(const std::string& tag) {
-    const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
 #if defined(USE_ROCM)
-    for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
-        void *ptr = it->first;
-        AllocationMetadata &metadata = it->second;
-        if (!tag.empty() && metadata.tag != tag) {
-            continue;
-        }
-        // Create new handles and map chunks
-        // CUDAUtils::cu_mem_create_and_map(metadata.device, metadata.size,
-        CUDAUtils::cu_mem_create_and_map(metadata.device, metadata.aligned_size,
-                                        (hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
-#ifdef TMS_DEBUG_LOG
-        std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.resume"
-                << " ptr=" << ptr << " metadata.size=" << metadata.size
-                << " metadata.aligned_size=" << metadata.aligned_size
-                << " num_chunks=" << metadata.allocHandles.size()
-                << std::endl;
-#endif
-    }
+    ROCmHIPImplementation::rocm_resume(tag, allocation_metadata_, allocator_metadata_mutex_);
 #elif defined(USE_CUDA)
+    const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
     for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
         void *ptr = it->first;
         AllocationMetadata &metadata = it->second;
@@ -329,4 +186,4 @@ void TorchMemorySaver::resume(const std::string& tag) {
 #else
     #error "USE_PLATFORM is not set"
 #endif
-}
+}

torch_memory_saver-0.0.9rc3/csrc/core.h ADDED Viewed

@@ -0,0 +1,58 @@
+#pragma once
+#include <sys/types.h>
+#include <stdio.h>
+#include <unordered_map>
+#include <mutex>
+#include <string>
+#include "utils.h"
+#include "macro.h"
+#if defined(USE_ROCM)
+#include "hardware_amd_support.h"
+#endif
+enum class AllocationState {
+    // Memory is mapped and accessible
+    ACTIVE,
+    // Memory is unmapped and inaccessible
+    PAUSED
+};
+struct AllocationMetadata {
+    size_t size;
+    CUdevice device;
+    std::string tag;
+    AllocationState state;
+    bool enable_cpu_backup;
+    void* cpu_backup;
+#if defined(USE_CUDA)
+    CUmemGenericAllocationHandle allocHandle;
+#elif defined(USE_ROCM)
+    size_t aligned_size;
+    std::vector<hipMemGenericAllocationHandle_t> allocHandles;
+    std::vector<size_t> chunk_sizes;
+#else
+    #error "USE_PLATFORM is not set"
+#endif
+};
+class TorchMemorySaver {
+public:
+    static TorchMemorySaver& instance();
+    cudaError_t malloc(void** ptr, CUdevice device, size_t size, const std::string& tag, bool enable_cpu_backup);
+    cudaError_t free(void* ptr);
+    void pause(const std::string& tag);
+    void resume(const std::string& tag);
+private:
+    TorchMemorySaver();
+    ~TorchMemorySaver() = default;
+    TorchMemorySaver(const TorchMemorySaver&) = delete;
+    TorchMemorySaver& operator=(const TorchMemorySaver&) = delete;
+    std::mutex allocator_metadata_mutex_;
+    std::unordered_map<void*, AllocationMetadata> allocation_metadata_;
+};

torch_memory_saver-0.0.9rc3/csrc/hardware_amd_support.h ADDED Viewed

@@ -0,0 +1,58 @@
+#pragma once
+#include "macro.h"
+#include "utils.h"
+#include <vector>
+#include <string>
+#include <sstream>
+#include <cstdlib>
+#include <unordered_map>
+#include <mutex>
+#if defined(USE_ROCM)
+// Forward declaration
+enum class AllocationState;
+struct AllocationMetadata;
+// Device utility functions for ROCm
+namespace DeviceUtils {
+    // Get global device ID from local device ID
+    int get_global_device_id(hipDevice_t local_device_id);
+}
+// High-level ROCm implementation functions
+namespace ROCmHIPImplementation {
+    // Malloc implementation for ROCm
+    cudaError_t rocm_malloc(
+        void **ptr,
+        CUdevice device,
+        size_t size,
+        const std::string& tag,
+        bool enable_cpu_backup,
+        std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
+        std::mutex& allocator_metadata_mutex
+    );
+    // Free implementation for ROCm
+    cudaError_t rocm_free(
+        void *ptr,
+        std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
+        std::mutex& allocator_metadata_mutex
+    );
+    // Pause implementation for ROCm
+    void rocm_pause(
+        const std::string& tag,
+        std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
+        std::mutex& allocator_metadata_mutex
+    );
+    // Resume implementation for ROCm
+    void rocm_resume(
+        const std::string& tag,
+        std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
+        std::mutex& allocator_metadata_mutex
+    );
+}
+#endif // USE_ROCM

torch_memory_saver-0.0.9rc3/csrc/macro.h ADDED Viewed

@@ -0,0 +1,54 @@
+#pragma once
+// Define platform macros and include appropriate headers
+#if defined(USE_ROCM)
+// Include HIP runtime headers for AMD ROCm platform
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_runtime.h>
+#include <sstream>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+/*
+ * ROCm API Mapping References:
+ * - CUDA Driver API to HIP: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Driver_API_functions_supported_by_HIP.html
+ * - CUDA Runtime API to HIP: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Runtime_API_functions_supported_by_HIP.html
+ */
+// --- Error Handling Types and Constants ---
+#define CUresult hipError_t
+#define cudaError_t hipError_t
+#define CUDA_SUCCESS hipSuccess
+#define cudaSuccess hipSuccess
+// --- Error Reporting Functions ---
+#define cuGetErrorString hipDrvGetErrorString
+#define cudaGetErrorString hipGetErrorString
+// --- Memory Management Functions ---
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define cuMemUnmap hipMemUnmap
+#define cuMemRelease hipMemRelease
+#define cudaMallocHost hipHostMalloc
+#define cudaMemcpy hipMemcpy
+// --- Memory Copy Direction Constants ---
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+// --- Device and Stream Types ---
+#define CUdevice hipDevice_t
+#define cudaStream_t hipStream_t
+// --- Memory Allocation Constants ---
+// Chunk size for memory creation operations (2 MB)
+#define MEMCREATE_CHUNK_SIZE (2 * 1024 * 1024)
+// --- Utility Macros ---
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+// ============================================================================
+// CUDA Platform Configuration (NVIDIA GPUs)
+// ============================================================================
+#elif defined(USE_CUDA)
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+// ============================================================================
+// Error: No Platform Specified
+// ============================================================================
+#else
+#error "USE_PLATFORM is not set"
+#endif

{torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/csrc/utils.h RENAMED Viewed

@@ -57,11 +57,11 @@
 namespace CUDAUtils {
 #if defined(USE_ROCM)
-    #if HIP_VERSION >= 60402000 // rocm/hip 6.4.2
-        #pragma message "Using ROCm/HIP 6.4.2+ implementation"
-        // Implement when rocm release >= 6.4.2 version
+    #if HIP_VERSION < 60304000 // rocm/hip 6.3.4
+        #pragma message "You need to implement torch_memory_saver in ROCm/HIP 6.3.4 or lower. We did not support it currently."
     #else
-        #pragma message "Using ROCm/HIP < 6.4.2 implementation"
+        // After rocm-7.0, we can use the same way to implement torch_memory_saver as CUDA side. --> Need to verify
+        #pragma message "Using ROCm/HIP >= 6.4.2 implementation"
         // hipMemCreate currently has an issue in rocm-6.3.4. Once it is fixed in rocm-7.0, we can implement torch_memory_saver the same way as on the CUDA side.
         // Currently, we use a chunk-wise method to implement it.
         static void cu_mem_create_and_map(hipDevice_t device,

{torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/setup.py RENAMED Viewed

@@ -87,11 +87,17 @@ def _create_ext_modules(platform):
     # Common define macros
     common_macros = [('Py_LIMITED_API', '0x03090000')]
+    # Common compile arguments
+    extra_compile_args = ['-std=c++17', '-O3']
     # Platform-specific configurations
     platform_home = Path(_find_platform_home(platform))
     if platform == "hip":
+        # Add ROCm-specific source file
+        sources.append('csrc/hardware_amd_support.cpp')
         include_dirs = [str(platform_home.resolve() / 'include')]
         library_dirs = [str(platform_home.resolve() / 'lib')]
         libraries = ['amdhip64', 'dl']
@@ -104,7 +110,6 @@ def _create_ext_modules(platform):
         ]
         libraries = ['cuda', 'cudart']
         platform_macros = [('USE_CUDA', '1')]
-        extra_compile_args = ['-std=c++17', '-O3']
     # Create extensions with different hook modes
     ext_modules = [
@@ -146,7 +151,7 @@ class build_ext_for_platform(build_platform_ext):
 setup(
     name='torch_memory_saver',
-    version='0.0.9rc2',
+    version='0.0.9rc3',
     ext_modules=ext_modules,
     cmdclass={'build_ext': build_ext_for_platform},
     python_requires=">=3.9",

{torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3/torch_memory_saver.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torch_memory_saver
-Version: 0.0.9rc2
+Version: 0.0.9rc3
 Requires-Python: >=3.9
 License-File: LICENSE
 Dynamic: license-file

{torch_memory_saver-0.0.9rc2 → torch_memory_saver-0.0.9rc3}/torch_memory_saver.egg-info/SOURCES.txt RENAMED Viewed

@@ -7,6 +7,7 @@ csrc/api_forwarder.h
 csrc/core.cpp
 csrc/core.h
 csrc/entrypoint.cpp
+csrc/hardware_amd_support.h
 csrc/macro.h
 csrc/utils.h
 test/test_examples.py

torch_memory_saver-0.0.9rc2/csrc/core.h DELETED Viewed

@@ -1,96 +0,0 @@
-#pragma once
-#include <sys/types.h>
-#include <stdio.h>
-#include <unordered_map>
-#include <mutex>
-#include <string>
-#include "utils.h"
-#include "macro.h"
-enum class AllocationState {
-    // Memory is mapped and accessible
-    ACTIVE,
-    // Memory is unmapped and inaccessible
-    PAUSED
-};
-struct AllocationMetadata {
-    size_t size;
-    CUdevice device;
-    std::string tag;
-    AllocationState state;
-    bool enable_cpu_backup;
-    void* cpu_backup;
-#if defined(USE_CUDA)
-    CUmemGenericAllocationHandle allocHandle;
-#elif defined(USE_ROCM)
-    size_t aligned_size;
-    std::vector<hipMemGenericAllocationHandle_t> allocHandles;
-    std::vector<size_t> chunk_sizes;
-#else
-    #error "USE_PLATFORM is not set"
-#endif
-};
-#if defined(USE_ROCM)
-namespace DeviceUtils {
-    // Simple function to get global device ID from local device ID
-    static int get_global_device_id(hipDevice_t local_device_id) {
-        // Check for HIP_VISIBLE_DEVICES environment variable
-        const char* hip_visible = std::getenv("HIP_VISIBLE_DEVICES");
-        if (hip_visible && strlen(hip_visible) > 0) {
-            std::string devices_str(hip_visible);
-            std::stringstream ss(devices_str);
-            std::string device_str;
-            std::vector<int> device_list;
-            // Parse comma-separated device list
-            while (std::getline(ss, device_str, ',')) {
-                if (!device_str.empty()) {
-                    device_list.push_back(std::atoi(device_str.c_str()));
-                }
-            }
-            if (local_device_id < device_list.size()) {
-                int global_device_id = device_list[local_device_id];
-#ifdef TMS_DEBUG_LOG
-                std::cout << "[torch_memory_saver.cpp] HIP_VISIBLE_DEVICES=" << hip_visible
-                        << " local_device_id=" << local_device_id
-                        << " -> global_device_id=" << global_device_id << std::endl;
-#endif
-                return global_device_id;
-            }
-        }
-        // Fallback: return local device ID as-is
-#ifdef TMS_DEBUG_LOG
-        std::cout << "[torch_memory_saver.cpp] No HIP_VISIBLE_DEVICES, using local_device_id=" << local_device_id << std::endl;
-#endif
-        return local_device_id;
-    }
-}
-#endif
-class TorchMemorySaver {
-public:
-    static TorchMemorySaver& instance();
-    cudaError_t malloc(void** ptr, CUdevice device, size_t size, const std::string& tag, bool enable_cpu_backup);
-    cudaError_t free(void* ptr);
-    void pause(const std::string& tag);
-    void resume(const std::string& tag);
-private:
-    TorchMemorySaver();
-    ~TorchMemorySaver() = default;
-    TorchMemorySaver(const TorchMemorySaver&) = delete;
-    TorchMemorySaver& operator=(const TorchMemorySaver&) = delete;
-    std::mutex allocator_metadata_mutex_;
-    std::unordered_map<void*, AllocationMetadata> allocation_metadata_;
-};

torch_memory_saver-0.0.9rc2/csrc/macro.h DELETED Viewed

@@ -1,40 +0,0 @@
-#pragma once
-// Define platform macros and include appropriate headers
-#if defined(USE_ROCM)
-// Lookup the table to define the macros: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Driver_API_functions_supported_by_HIP.html
-// Lookup the table to define the macros: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Runtime_API_functions_supported_by_HIP.html?utm_source=chatgpt.com
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_runtime.h>
-#include <sstream>
-#include <cstdlib>
-// Define a general alias
-#define CUresult hipError_t
-#define cudaError_t hipError_t
-#define CUDA_SUCCESS hipSuccess
-#define cudaSuccess hipSuccess
-#define cuGetErrorString hipDrvGetErrorString
-#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
-#define CUdevice hipDevice_t
-#define cudaStream_t hipStream_t
-#define cudaMallocHost hipHostMalloc
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cuGetErrorString hipDrvGetErrorString
-#define cudaGetErrorString hipGetErrorString
-#define cuMemUnmap hipMemUnmap
-#define cuMemRelease hipMemRelease
-// #define cudaMalloc hipMalloc
-// #define cudaFree hipFree
-// #define CUdevice hipDevice_t
-// #define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
-#define MEMCREATE_CHUNK_SIZE (2 * 1024 * 1024)
-#define MIN(a, b) (a < b ? a : b)
-#elif defined(USE_CUDA)
-#include <cuda_runtime_api.h>
-#include <cuda.h>
-#else
-#error "USE_PLATFORM is not set"
-#endif