torch-memory-saver 0.0.9rc1__tar.gz → 0.0.9rc3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torch_memory_saver-0.0.9rc3/MANIFEST.in +1 -0
- {torch_memory_saver-0.0.9rc1/torch_memory_saver.egg-info → torch_memory_saver-0.0.9rc3}/PKG-INFO +1 -1
- torch_memory_saver-0.0.9rc3/csrc/api_forwarder.h +8 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/csrc/core.cpp +13 -156
- torch_memory_saver-0.0.9rc3/csrc/core.h +58 -0
- torch_memory_saver-0.0.9rc3/csrc/hardware_amd_support.h +58 -0
- torch_memory_saver-0.0.9rc3/csrc/macro.h +54 -0
- torch_memory_saver-0.0.9rc3/csrc/utils.h +241 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/setup.py +9 -4
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3/torch_memory_saver.egg-info}/PKG-INFO +1 -1
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver.egg-info/SOURCES.txt +6 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/LICENSE +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/README.md +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/csrc/api_forwarder.cpp +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/csrc/entrypoint.cpp +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/setup.cfg +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/test/test_examples.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/__init__.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/binary_wrapper.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/entrypoint.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/__init__.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/base.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/mode_preload.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/mode_torch.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/testing_utils.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/utils.py +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver.egg-info/dependency_links.txt +0 -0
- {torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver.egg-info/top_level.txt +0 -0
@@ -0,0 +1 @@
|
|
1
|
+
include csrc/*.h
|
@@ -3,6 +3,10 @@
|
|
3
3
|
#include "macro.h"
|
4
4
|
#include "api_forwarder.h"
|
5
5
|
|
6
|
+
#if defined(USE_ROCM)
|
7
|
+
#include "hardware_amd_support.h"
|
8
|
+
#endif
|
9
|
+
|
6
10
|
TorchMemorySaver::TorchMemorySaver() {}
|
7
11
|
|
8
12
|
TorchMemorySaver &TorchMemorySaver::instance() {
|
@@ -12,85 +16,7 @@ TorchMemorySaver &TorchMemorySaver::instance() {
|
|
12
16
|
|
13
17
|
cudaError_t TorchMemorySaver::malloc(void **ptr, CUdevice device, size_t size, const std::string& tag, const bool enable_cpu_backup) {
|
14
18
|
#if defined(USE_ROCM)
|
15
|
-
|
16
|
-
CURESULT_CHECK(hipCtxGetDevice(&device));
|
17
|
-
|
18
|
-
// // Get granularity and calculate aligned size
|
19
|
-
// size_t granularity = CUDAUtils::cu_mem_get_granularity(device);
|
20
|
-
// size_t aligned_size = (size + granularity - 1) & ~(granularity - 1);
|
21
|
-
|
22
|
-
// //// Reserve aligned memory address, rocm will check granularity
|
23
|
-
// CURESULT_CHECK(hipMemAddressReserve((hipDeviceptr_t *)ptr, aligned_size, granularity, 0, 0));
|
24
|
-
|
25
|
-
hipMemAllocationProp prop = {};
|
26
|
-
prop.type = hipMemAllocationTypePinned;
|
27
|
-
prop.location.type = hipMemLocationTypeDevice;
|
28
|
-
prop.location.id = device;
|
29
|
-
prop.allocFlags.compressionType = 0x0;
|
30
|
-
|
31
|
-
size_t granularity;
|
32
|
-
CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop,
|
33
|
-
hipMemAllocationGranularityMinimum));
|
34
|
-
size_t aligned_size = ((size + granularity - 1) / granularity) * granularity;
|
35
|
-
aligned_size = (aligned_size + MEMCREATE_CHUNK_SIZE - 1) / MEMCREATE_CHUNK_SIZE * MEMCREATE_CHUNK_SIZE;
|
36
|
-
|
37
|
-
assert(MEMCREATE_CHUNK_SIZE % granularity == 0);
|
38
|
-
assert(aligned_size % MEMCREATE_CHUNK_SIZE == 0);
|
39
|
-
assert(aligned_size % granularity == 0);
|
40
|
-
|
41
|
-
|
42
|
-
// Create allocation metadata
|
43
|
-
AllocationMetadata metadata;
|
44
|
-
metadata.size = size;
|
45
|
-
metadata.aligned_size = aligned_size;
|
46
|
-
metadata.device = device;
|
47
|
-
//// Not sure (Check these parameters)
|
48
|
-
metadata.tag = tag;
|
49
|
-
metadata.enable_cpu_backup = enable_cpu_backup;
|
50
|
-
metadata.cpu_backup = nullptr;
|
51
|
-
////
|
52
|
-
|
53
|
-
// Get global device ID using our utility function
|
54
|
-
int global_device_id = DeviceUtils::get_global_device_id(device);
|
55
|
-
|
56
|
-
// rewrite numa node
|
57
|
-
uint64_t node_id = 0;
|
58
|
-
if (global_device_id > 3) {
|
59
|
-
node_id = 1;
|
60
|
-
}
|
61
|
-
|
62
|
-
#ifdef TMS_DEBUG_LOG
|
63
|
-
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
|
64
|
-
<< " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
|
65
|
-
<< " granularity=" << granularity
|
66
|
-
<< " aligned_size=" << aligned_size
|
67
|
-
<< " node_id=" << node_id
|
68
|
-
<< " device=" << device
|
69
|
-
<< " global_device_id=" << global_device_id
|
70
|
-
<< std::endl;
|
71
|
-
#endif
|
72
|
-
|
73
|
-
hipDeviceptr_t d_mem;
|
74
|
-
// Reserve aligned memory address, rocm will check granularity
|
75
|
-
CURESULT_CHECK(hipMemAddressReserve(&d_mem, aligned_size, granularity, 0, node_id));
|
76
|
-
*ptr = (void*)d_mem;
|
77
|
-
|
78
|
-
// Create and map chunks
|
79
|
-
// CUDAUtils::cu_mem_create_and_map(device, size, (hipDeviceptr_t)*ptr,
|
80
|
-
CUDAUtils::cu_mem_create_and_map(device, aligned_size, (hipDeviceptr_t)*ptr,
|
81
|
-
metadata.allocHandles, metadata.chunk_sizes);
|
82
|
-
size_t num_chunks = metadata.allocHandles.size();
|
83
|
-
{
|
84
|
-
const std::lock_guard<std::mutex> lock(allocator_metadata_mutex_);
|
85
|
-
allocation_metadata_.emplace(*ptr, std::move(metadata));
|
86
|
-
}
|
87
|
-
#ifdef TMS_DEBUG_LOG
|
88
|
-
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_malloc "
|
89
|
-
<< " ptr=" << ptr << " *ptr=" << *ptr << " size=" << size
|
90
|
-
<< " metadata.aligned_size=" << metadata.aligned_size
|
91
|
-
<< " num_chunks=" << num_chunks
|
92
|
-
<< std::endl;
|
93
|
-
#endif
|
19
|
+
return ROCmHIPImplementation::rocm_malloc(ptr, device, size, tag, enable_cpu_backup, allocation_metadata_, allocator_metadata_mutex_);
|
94
20
|
|
95
21
|
#elif defined(USE_CUDA)
|
96
22
|
CUmemGenericAllocationHandle allocHandle;
|
@@ -122,28 +48,8 @@ cudaError_t TorchMemorySaver::malloc(void **ptr, CUdevice device, size_t size, c
|
|
122
48
|
|
123
49
|
cudaError_t TorchMemorySaver::free(void *ptr) {
|
124
50
|
#if defined(USE_ROCM)
|
125
|
-
|
126
|
-
{
|
127
|
-
const std::lock_guard<std::mutex> lock(allocator_metadata_mutex_);
|
128
|
-
SIMPLE_CHECK(allocation_metadata_.count(ptr), "Trying to free a pointer not allocated here");
|
129
|
-
metadata = std::move(allocation_metadata_[ptr]);
|
130
|
-
allocation_metadata_.erase(ptr);
|
131
|
-
}
|
132
|
-
|
133
|
-
// Unmap and release chunks
|
134
|
-
CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.size,
|
135
|
-
(hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
|
51
|
+
return ROCmHIPImplementation::rocm_free(ptr, allocation_metadata_, allocator_metadata_mutex_);
|
136
52
|
|
137
|
-
// Free the reserved address using stored aligned_size
|
138
|
-
CURESULT_CHECK(hipMemAddressFree((hipDeviceptr_t)ptr, metadata.aligned_size));
|
139
|
-
|
140
|
-
#ifdef TMS_DEBUG_LOG
|
141
|
-
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.cuda_free "
|
142
|
-
<< " ptr=" << ptr << " metadata.size=" << metadata.size
|
143
|
-
<< " metadata.aligned_size=" << metadata.aligned_size
|
144
|
-
<< " num_chunks=" << metadata.allocHandles.size()
|
145
|
-
<< std::endl;
|
146
|
-
#endif
|
147
53
|
#elif defined(USE_CUDA)
|
148
54
|
AllocationMetadata metadata;
|
149
55
|
{
|
@@ -179,41 +85,12 @@ cudaError_t TorchMemorySaver::free(void *ptr) {
|
|
179
85
|
}
|
180
86
|
|
181
87
|
void TorchMemorySaver::pause(const std::string& tag) {
|
182
|
-
const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
|
183
|
-
|
184
88
|
#if defined(USE_ROCM)
|
185
|
-
|
186
|
-
void *ptr = it->first;
|
187
|
-
AllocationMetadata &metadata = it->second;
|
89
|
+
ROCmHIPImplementation::rocm_pause(tag, allocation_metadata_, allocator_metadata_mutex_);
|
188
90
|
|
189
|
-
if (!tag.empty() && metadata.tag != tag) {
|
190
|
-
continue;
|
191
|
-
}
|
192
|
-
// Copy CUDA's code supporting cpu_backup to here
|
193
|
-
if (metadata.enable_cpu_backup) {
|
194
|
-
if (nullptr == metadata.cpu_backup) {
|
195
|
-
CUDA_ERROR_CHECK(hipMallocHost(&metadata.cpu_backup, metadata.aligned_size));
|
196
|
-
}
|
197
|
-
SIMPLE_CHECK(metadata.cpu_backup != nullptr, "cpu_backup should not be nullptr");
|
198
|
-
// TODO may use cudaMemcpyAsync if needed
|
199
|
-
CUDA_ERROR_CHECK(cudaMemcpy(metadata.cpu_backup, ptr, metadata.aligned_size, hipMemcpyDeviceToHost));
|
200
|
-
}
|
201
|
-
//
|
202
|
-
|
203
|
-
// Unmap and release chunks (but keep metadata for resume)
|
204
|
-
// CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.size,
|
205
|
-
CUDAUtils::cu_mem_unmap_and_release(metadata.device, metadata.aligned_size,
|
206
|
-
(hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
|
207
|
-
|
208
|
-
#ifdef TMS_DEBUG_LOG
|
209
|
-
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.pause"
|
210
|
-
<< " ptr=" << ptr << " metadata.size=" << metadata.size
|
211
|
-
<< " metadata.aligned_size=" << metadata.aligned_size
|
212
|
-
<< " num_chunks=" << metadata.allocHandles.size()
|
213
|
-
<< std::endl;
|
214
|
-
#endif
|
215
|
-
}
|
216
91
|
#elif defined(USE_CUDA)
|
92
|
+
const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
|
93
|
+
|
217
94
|
for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
|
218
95
|
void *ptr = it->first;
|
219
96
|
AllocationMetadata& metadata = it->second;
|
@@ -258,32 +135,12 @@ void TorchMemorySaver::pause(const std::string& tag) {
|
|
258
135
|
}
|
259
136
|
|
260
137
|
void TorchMemorySaver::resume(const std::string& tag) {
|
261
|
-
const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
|
262
|
-
|
263
138
|
#if defined(USE_ROCM)
|
264
|
-
|
265
|
-
void *ptr = it->first;
|
266
|
-
AllocationMetadata &metadata = it->second;
|
267
|
-
|
268
|
-
if (!tag.empty() && metadata.tag != tag) {
|
269
|
-
continue;
|
270
|
-
}
|
271
|
-
|
272
|
-
// Create new handles and map chunks
|
273
|
-
// CUDAUtils::cu_mem_create_and_map(metadata.device, metadata.size,
|
274
|
-
CUDAUtils::cu_mem_create_and_map(metadata.device, metadata.aligned_size,
|
275
|
-
(hipDeviceptr_t)ptr, metadata.allocHandles, metadata.chunk_sizes);
|
276
|
-
|
277
|
-
#ifdef TMS_DEBUG_LOG
|
278
|
-
std::cout << "[torch_memory_saver.cpp] TorchMemorySaver.resume"
|
279
|
-
<< " ptr=" << ptr << " metadata.size=" << metadata.size
|
280
|
-
<< " metadata.aligned_size=" << metadata.aligned_size
|
281
|
-
<< " num_chunks=" << metadata.allocHandles.size()
|
282
|
-
<< std::endl;
|
283
|
-
#endif
|
284
|
-
}
|
139
|
+
ROCmHIPImplementation::rocm_resume(tag, allocation_metadata_, allocator_metadata_mutex_);
|
285
140
|
|
286
141
|
#elif defined(USE_CUDA)
|
142
|
+
const std::lock_guard <std::mutex> lock(allocator_metadata_mutex_);
|
143
|
+
|
287
144
|
for (auto it = allocation_metadata_.begin(); it != allocation_metadata_.end(); ++it) {
|
288
145
|
void *ptr = it->first;
|
289
146
|
AllocationMetadata &metadata = it->second;
|
@@ -329,4 +186,4 @@ void TorchMemorySaver::resume(const std::string& tag) {
|
|
329
186
|
#else
|
330
187
|
#error "USE_PLATFORM is not set"
|
331
188
|
#endif
|
332
|
-
}
|
189
|
+
}
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#pragma once
|
2
|
+
#include <sys/types.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <unordered_map>
|
5
|
+
#include <mutex>
|
6
|
+
#include <string>
|
7
|
+
#include "utils.h"
|
8
|
+
#include "macro.h"
|
9
|
+
|
10
|
+
#if defined(USE_ROCM)
|
11
|
+
#include "hardware_amd_support.h"
|
12
|
+
#endif
|
13
|
+
|
14
|
+
enum class AllocationState {
|
15
|
+
// Memory is mapped and accessible
|
16
|
+
ACTIVE,
|
17
|
+
// Memory is unmapped and inaccessible
|
18
|
+
PAUSED
|
19
|
+
};
|
20
|
+
|
21
|
+
struct AllocationMetadata {
|
22
|
+
size_t size;
|
23
|
+
CUdevice device;
|
24
|
+
std::string tag;
|
25
|
+
AllocationState state;
|
26
|
+
bool enable_cpu_backup;
|
27
|
+
void* cpu_backup;
|
28
|
+
|
29
|
+
#if defined(USE_CUDA)
|
30
|
+
CUmemGenericAllocationHandle allocHandle;
|
31
|
+
#elif defined(USE_ROCM)
|
32
|
+
size_t aligned_size;
|
33
|
+
std::vector<hipMemGenericAllocationHandle_t> allocHandles;
|
34
|
+
std::vector<size_t> chunk_sizes;
|
35
|
+
#else
|
36
|
+
#error "USE_PLATFORM is not set"
|
37
|
+
#endif
|
38
|
+
};
|
39
|
+
|
40
|
+
class TorchMemorySaver {
|
41
|
+
public:
|
42
|
+
static TorchMemorySaver& instance();
|
43
|
+
|
44
|
+
cudaError_t malloc(void** ptr, CUdevice device, size_t size, const std::string& tag, bool enable_cpu_backup);
|
45
|
+
cudaError_t free(void* ptr);
|
46
|
+
|
47
|
+
void pause(const std::string& tag);
|
48
|
+
void resume(const std::string& tag);
|
49
|
+
|
50
|
+
private:
|
51
|
+
TorchMemorySaver();
|
52
|
+
~TorchMemorySaver() = default;
|
53
|
+
TorchMemorySaver(const TorchMemorySaver&) = delete;
|
54
|
+
TorchMemorySaver& operator=(const TorchMemorySaver&) = delete;
|
55
|
+
|
56
|
+
std::mutex allocator_metadata_mutex_;
|
57
|
+
std::unordered_map<void*, AllocationMetadata> allocation_metadata_;
|
58
|
+
};
|
@@ -0,0 +1,58 @@
|
|
1
|
+
#pragma once
|
2
|
+
#include "macro.h"
|
3
|
+
#include "utils.h"
|
4
|
+
#include <vector>
|
5
|
+
#include <string>
|
6
|
+
#include <sstream>
|
7
|
+
#include <cstdlib>
|
8
|
+
#include <unordered_map>
|
9
|
+
#include <mutex>
|
10
|
+
|
11
|
+
#if defined(USE_ROCM)
|
12
|
+
|
13
|
+
// Forward declaration
|
14
|
+
enum class AllocationState;
|
15
|
+
struct AllocationMetadata;
|
16
|
+
|
17
|
+
// Device utility functions for ROCm
|
18
|
+
namespace DeviceUtils {
|
19
|
+
// Get global device ID from local device ID
|
20
|
+
int get_global_device_id(hipDevice_t local_device_id);
|
21
|
+
}
|
22
|
+
|
23
|
+
// High-level ROCm implementation functions
|
24
|
+
namespace ROCmHIPImplementation {
|
25
|
+
// Malloc implementation for ROCm
|
26
|
+
cudaError_t rocm_malloc(
|
27
|
+
void **ptr,
|
28
|
+
CUdevice device,
|
29
|
+
size_t size,
|
30
|
+
const std::string& tag,
|
31
|
+
bool enable_cpu_backup,
|
32
|
+
std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
|
33
|
+
std::mutex& allocator_metadata_mutex
|
34
|
+
);
|
35
|
+
|
36
|
+
// Free implementation for ROCm
|
37
|
+
cudaError_t rocm_free(
|
38
|
+
void *ptr,
|
39
|
+
std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
|
40
|
+
std::mutex& allocator_metadata_mutex
|
41
|
+
);
|
42
|
+
|
43
|
+
// Pause implementation for ROCm
|
44
|
+
void rocm_pause(
|
45
|
+
const std::string& tag,
|
46
|
+
std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
|
47
|
+
std::mutex& allocator_metadata_mutex
|
48
|
+
);
|
49
|
+
|
50
|
+
// Resume implementation for ROCm
|
51
|
+
void rocm_resume(
|
52
|
+
const std::string& tag,
|
53
|
+
std::unordered_map<void*, AllocationMetadata>& allocation_metadata,
|
54
|
+
std::mutex& allocator_metadata_mutex
|
55
|
+
);
|
56
|
+
}
|
57
|
+
|
58
|
+
#endif // USE_ROCM
|
@@ -0,0 +1,54 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
// Define platform macros and include appropriate headers
|
4
|
+
#if defined(USE_ROCM)
|
5
|
+
// Include HIP runtime headers for AMD ROCm platform
|
6
|
+
#include <hip/hip_runtime_api.h>
|
7
|
+
#include <hip/hip_runtime.h>
|
8
|
+
#include <sstream>
|
9
|
+
#include <cstdlib>
|
10
|
+
#include <cstring>
|
11
|
+
#include <cassert>
|
12
|
+
/*
|
13
|
+
* ROCm API Mapping References:
|
14
|
+
* - CUDA Driver API to HIP: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Driver_API_functions_supported_by_HIP.html
|
15
|
+
* - CUDA Runtime API to HIP: https://rocm.docs.amd.com/projects/HIPIFY/en/latest/reference/tables/CUDA_Runtime_API_functions_supported_by_HIP.html
|
16
|
+
*/
|
17
|
+
// --- Error Handling Types and Constants ---
|
18
|
+
#define CUresult hipError_t
|
19
|
+
#define cudaError_t hipError_t
|
20
|
+
#define CUDA_SUCCESS hipSuccess
|
21
|
+
#define cudaSuccess hipSuccess
|
22
|
+
// --- Error Reporting Functions ---
|
23
|
+
#define cuGetErrorString hipDrvGetErrorString
|
24
|
+
#define cudaGetErrorString hipGetErrorString
|
25
|
+
// --- Memory Management Functions ---
|
26
|
+
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
|
27
|
+
#define cuMemUnmap hipMemUnmap
|
28
|
+
#define cuMemRelease hipMemRelease
|
29
|
+
#define cudaMallocHost hipHostMalloc
|
30
|
+
#define cudaMemcpy hipMemcpy
|
31
|
+
// --- Memory Copy Direction Constants ---
|
32
|
+
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
33
|
+
// --- Device and Stream Types ---
|
34
|
+
#define CUdevice hipDevice_t
|
35
|
+
#define cudaStream_t hipStream_t
|
36
|
+
// --- Memory Allocation Constants ---
|
37
|
+
// Chunk size for memory creation operations (2 MB)
|
38
|
+
#define MEMCREATE_CHUNK_SIZE (2 * 1024 * 1024)
|
39
|
+
// --- Utility Macros ---
|
40
|
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
41
|
+
|
42
|
+
// ============================================================================
|
43
|
+
// CUDA Platform Configuration (NVIDIA GPUs)
|
44
|
+
// ============================================================================
|
45
|
+
#elif defined(USE_CUDA)
|
46
|
+
#include <cuda_runtime_api.h>
|
47
|
+
#include <cuda.h>
|
48
|
+
|
49
|
+
// ============================================================================
|
50
|
+
// Error: No Platform Specified
|
51
|
+
// ============================================================================
|
52
|
+
#else
|
53
|
+
#error "USE_PLATFORM is not set"
|
54
|
+
#endif
|
@@ -0,0 +1,241 @@
|
|
1
|
+
#pragma once
|
2
|
+
#include <iostream>
|
3
|
+
#include <vector>
|
4
|
+
#include "macro.h"
|
5
|
+
|
6
|
+
// #define TMS_DEBUG_LOG
|
7
|
+
|
8
|
+
// Cannot use pytorch (libc10.so) since LD_PRELOAD happens earlier than `import torch`
|
9
|
+
// Thus copy from torch Macros.h
|
10
|
+
#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
|
11
|
+
#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
|
12
|
+
#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
|
13
|
+
#else
|
14
|
+
#define C10_LIKELY(expr) (expr)
|
15
|
+
#define C10_UNLIKELY(expr) (expr)
|
16
|
+
#endif
|
17
|
+
|
18
|
+
#define SIMPLE_CHECK(COND, MSG) \
|
19
|
+
do { \
|
20
|
+
if (!(COND)) { \
|
21
|
+
std::cerr << "[torch_memory_saver.cpp] " << MSG \
|
22
|
+
<< " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
|
23
|
+
<< std::endl; \
|
24
|
+
exit(1); \
|
25
|
+
} \
|
26
|
+
} while (false)
|
27
|
+
|
28
|
+
#define CURESULT_CHECK(EXPR) \
|
29
|
+
do { \
|
30
|
+
CUresult __result = (EXPR); \
|
31
|
+
if (__result != CUDA_SUCCESS) { \
|
32
|
+
const char* err_str = nullptr; \
|
33
|
+
cuGetErrorString(__result, &err_str); \
|
34
|
+
std::cerr << "[torch_memory_saver.cpp] CUresult error: " \
|
35
|
+
<< __result << " (" << (err_str ? err_str : "Unknown error") << ") " \
|
36
|
+
<< " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
|
37
|
+
<< std::endl; \
|
38
|
+
exit(1); \
|
39
|
+
} \
|
40
|
+
} while (false)
|
41
|
+
|
42
|
+
#define CUDA_ERROR_CHECK(EXPR) \
|
43
|
+
do { \
|
44
|
+
cudaError_t __result = (EXPR); \
|
45
|
+
if (__result != cudaSuccess) { \
|
46
|
+
const char* err_str = cudaGetErrorString(__result); \
|
47
|
+
std::cerr << "[torch_memory_saver.cpp] cudaError error: " \
|
48
|
+
<< __result << " (" << (err_str ? err_str : "Unknown error") << ") " \
|
49
|
+
<< " file=" << __FILE__ << " func=" << __func__ << " line=" << __LINE__ \
|
50
|
+
<< std::endl; \
|
51
|
+
exit(1); \
|
52
|
+
} \
|
53
|
+
} while (false)
|
54
|
+
|
55
|
+
|
56
|
+
|
57
|
+
namespace CUDAUtils {
|
58
|
+
#if defined(USE_ROCM)
|
59
|
+
|
60
|
+
#if HIP_VERSION < 60304000 // rocm/hip 6.3.4
|
61
|
+
#pragma message "You need to implement torch_memory_saver in ROCm/HIP 6.3.4 or lower. We did not support it currently."
|
62
|
+
#else
|
63
|
+
// From rocm-7.0 onward, torch_memory_saver may be implementable the same way as on the CUDA side (needs verification).
|
64
|
+
#pragma message "Using ROCm/HIP >= 6.4.2 implementation"
|
65
|
+
// hipMemCreate currently has an issue in rocm-6.3.4. Once it is fixed in rocm-7.0, torch_memory_saver can be implemented the same way as on the CUDA side.
|
66
|
+
// Currently, the implementation is based on a chunk-wise method.
|
67
|
+
static void cu_mem_create_and_map(hipDevice_t device,
|
68
|
+
size_t aligned_size,
|
69
|
+
void* d_mem,
|
70
|
+
std::vector<hipMemGenericAllocationHandle_t>& allocHandles,
|
71
|
+
std::vector<size_t>& chunk_sizes) {
|
72
|
+
|
73
|
+
hipMemAllocationProp prop = {};
|
74
|
+
prop.type = hipMemAllocationTypePinned;
|
75
|
+
prop.location.type = hipMemLocationTypeDevice;
|
76
|
+
prop.location.id = device;
|
77
|
+
|
78
|
+
// // Get granularity
|
79
|
+
// size_t granularity;
|
80
|
+
// CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop,
|
81
|
+
// hipMemAllocationGranularityMinimum));
|
82
|
+
|
83
|
+
// // Make sure chunk size is aligned with hardware granularity
|
84
|
+
// size_t aligned_chunk_size = ((MEMCREATE_CHUNK_SIZE + granularity - 1) / granularity) * granularity;
|
85
|
+
// size_t num_chunks = (size + aligned_chunk_size - 1) / aligned_chunk_size;
|
86
|
+
|
87
|
+
// Get granularity, Make sure chunk size is aligned with hardware granularity
|
88
|
+
// size == aligned_size
|
89
|
+
size_t num_chunks = (aligned_size + MEMCREATE_CHUNK_SIZE - 1) / MEMCREATE_CHUNK_SIZE;
|
90
|
+
|
91
|
+
allocHandles.resize(num_chunks);
|
92
|
+
chunk_sizes.resize(num_chunks);
|
93
|
+
|
94
|
+
// Calculate chunk sizes
|
95
|
+
for (size_t i = 0; i < num_chunks; ++i) {
|
96
|
+
// chunk_sizes[i] = MIN(size - i * aligned_chunk_size, aligned_chunk_size);
|
97
|
+
chunk_sizes[i] = MIN(aligned_size - i * MEMCREATE_CHUNK_SIZE, MEMCREATE_CHUNK_SIZE);
|
98
|
+
#ifdef TMS_DEBUG_LOG
|
99
|
+
std::cout << "[torch_memory_saver.cpp] chunk_sizes[" << i << "] = " << chunk_sizes[i] << std::endl;
|
100
|
+
#endif
|
101
|
+
}
|
102
|
+
|
103
|
+
// Create memory handles for each chunk
|
104
|
+
for (size_t i = 0; i < num_chunks; ++i) {
|
105
|
+
CURESULT_CHECK(hipMemCreate(&allocHandles[i], chunk_sizes[i], &prop, 0));
|
106
|
+
#ifdef TMS_DEBUG_LOG
|
107
|
+
std::cout << "[torch_memory_saver.cpp] allocHandles[" << i << "] = " << allocHandles[i] << std::endl;
|
108
|
+
#endif
|
109
|
+
}
|
110
|
+
|
111
|
+
// Map each chunk
|
112
|
+
size_t allocated_size = 0;
|
113
|
+
for (size_t i = 0; i < num_chunks; ++i) {
|
114
|
+
void* map_addr = (void*)((uintptr_t)d_mem + allocated_size);
|
115
|
+
CURESULT_CHECK(hipMemMap((hipDeviceptr_t)map_addr, chunk_sizes[i], 0, allocHandles[i], 0));
|
116
|
+
allocated_size += chunk_sizes[i];
|
117
|
+
#ifdef TMS_DEBUG_LOG
|
118
|
+
std::cout << "[torch_memory_saver.cpp] mapped chunk " << i << " at offset " << allocated_size - chunk_sizes[i] << std::endl;
|
119
|
+
#endif
|
120
|
+
}
|
121
|
+
|
122
|
+
// Set access permissions
|
123
|
+
hipMemAccessDesc accessDesc = {};
|
124
|
+
accessDesc.location.type = hipMemLocationTypeDevice;
|
125
|
+
accessDesc.location.id = device;
|
126
|
+
accessDesc.flags = hipMemAccessFlagsProtReadWrite;
|
127
|
+
CURESULT_CHECK(hipMemSetAccess(d_mem, aligned_size, &accessDesc, 1));
|
128
|
+
}
|
129
|
+
|
130
|
+
|
131
|
+
static void cu_mem_unmap_and_release(hipDevice_t device,
|
132
|
+
size_t aligned_size,
|
133
|
+
hipDeviceptr_t d_mem,
|
134
|
+
const std::vector<hipMemGenericAllocationHandle_t>& allocHandles,
|
135
|
+
const std::vector<size_t>& chunk_sizes) {
|
136
|
+
|
137
|
+
// Unmap each chunk
|
138
|
+
size_t deallocated_size = 0;
|
139
|
+
for (size_t i = 0; i < allocHandles.size(); ++i) {
|
140
|
+
void* map_addr = (void*)((uintptr_t)d_mem + deallocated_size);
|
141
|
+
CURESULT_CHECK(hipMemUnmap((hipDeviceptr_t)map_addr, chunk_sizes[i]));
|
142
|
+
deallocated_size += chunk_sizes[i];
|
143
|
+
#ifdef TMS_DEBUG_LOG
|
144
|
+
std::cout << "[torch_memory_saver.cpp] unmapped chunk " << i << " at offset " << deallocated_size - chunk_sizes[i] << std::endl;
|
145
|
+
#endif
|
146
|
+
}
|
147
|
+
|
148
|
+
// Release each handle
|
149
|
+
for (size_t i = 0; i < allocHandles.size(); ++i) {
|
150
|
+
CURESULT_CHECK(hipMemRelease(allocHandles[i]));
|
151
|
+
#ifdef TMS_DEBUG_LOG
|
152
|
+
std::cout << "[torch_memory_saver.cpp] released allocHandles[" << i << "]" << std::endl;
|
153
|
+
#endif
|
154
|
+
}
|
155
|
+
}
|
156
|
+
|
157
|
+
static size_t cu_mem_get_granularity(hipDevice_t device) {
|
158
|
+
hipMemAllocationProp prop = {};
|
159
|
+
prop.type = hipMemAllocationTypePinned;
|
160
|
+
prop.location.type = hipMemLocationTypeDevice;
|
161
|
+
prop.location.id = device;
|
162
|
+
|
163
|
+
size_t granularity;
|
164
|
+
CURESULT_CHECK(hipMemGetAllocationGranularity(&granularity, &prop, hipMemAllocationGranularityMinimum));
|
165
|
+
return granularity;
|
166
|
+
}
|
167
|
+
|
168
|
+
static CUdevice cu_ctx_get_device() {
|
169
|
+
CUdevice ans;
|
170
|
+
CURESULT_CHECK(hipCtxGetDevice(&ans));
|
171
|
+
return ans;
|
172
|
+
}
|
173
|
+
|
174
|
+
static CUdevice cu_device_get(int device_ordinal) {
|
175
|
+
CUdevice ans;
|
176
|
+
CURESULT_CHECK(hipDeviceGet(&ans, device_ordinal));
|
177
|
+
return ans;
|
178
|
+
}
|
179
|
+
#endif
|
180
|
+
|
181
|
+
#elif defined(USE_CUDA)
|
182
|
+
static void cu_mem_create(CUmemGenericAllocationHandle *alloc_handle, size_t size, CUdevice device) {
|
183
|
+
CUmemAllocationProp prop = {};
|
184
|
+
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
|
185
|
+
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
186
|
+
prop.location.id = device;
|
187
|
+
|
188
|
+
int flag = 0;
|
189
|
+
CURESULT_CHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, device));
|
190
|
+
if (flag) { // support GPUDirect RDMA if possible
|
191
|
+
prop.allocFlags.gpuDirectRDMACapable = 1;
|
192
|
+
}
|
193
|
+
|
194
|
+
CURESULT_CHECK(cuMemCreate(alloc_handle, size, &prop, 0));
|
195
|
+
}
|
196
|
+
|
197
|
+
static void cu_mem_set_access(void *ptr, size_t size, CUdevice device) {
|
198
|
+
CUmemAccessDesc access_desc = {};
|
199
|
+
access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
|
200
|
+
access_desc.location.id = device;
|
201
|
+
access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
|
202
|
+
CURESULT_CHECK(cuMemSetAccess((CUdeviceptr) ptr, size, &access_desc, 1));
|
203
|
+
}
|
204
|
+
|
205
|
+
static CUdevice cu_ctx_get_device() {
|
206
|
+
CUdevice ans;
|
207
|
+
CURESULT_CHECK(cuCtxGetDevice(&ans));
|
208
|
+
return ans;
|
209
|
+
}
|
210
|
+
|
211
|
+
static CUdevice cu_device_get(int device_ordinal) {
|
212
|
+
CUdevice ans;
|
213
|
+
CURESULT_CHECK(cuDeviceGet(&ans, device_ordinal));
|
214
|
+
return ans;
|
215
|
+
}
|
216
|
+
|
217
|
+
#else
|
218
|
+
#error "USE_PLATFORM is not set"
|
219
|
+
|
220
|
+
#endif
|
221
|
+
}
|
222
|
+
|
223
|
+
inline bool get_bool_env_var(const char* name) {
|
224
|
+
const char* env_cstr = std::getenv(name);
|
225
|
+
if (env_cstr == nullptr) {
|
226
|
+
return false;
|
227
|
+
}
|
228
|
+
|
229
|
+
std::string env_str(env_cstr);
|
230
|
+
if (env_str == "1" || env_str == "true" || env_str == "TRUE" || env_str == "yes" || env_str == "YES") {
|
231
|
+
return true;
|
232
|
+
}
|
233
|
+
if (env_str == "0" || env_str == "false" || env_str == "FALSE" || env_str == "no" || env_str == "NO") {
|
234
|
+
return false;
|
235
|
+
}
|
236
|
+
|
237
|
+
std::cerr << "[torch_memory_saver.cpp] Unsupported environment varialbe value "
|
238
|
+
<< " name=" << name << " value=" << env_str
|
239
|
+
<< std::endl;
|
240
|
+
exit(1);
|
241
|
+
}
|
@@ -87,11 +87,17 @@ def _create_ext_modules(platform):
|
|
87
87
|
|
88
88
|
# Common define macros
|
89
89
|
common_macros = [('Py_LIMITED_API', '0x03090000')]
|
90
|
+
|
91
|
+
# Common compile arguments
|
92
|
+
extra_compile_args = ['-std=c++17', '-O3']
|
90
93
|
|
91
94
|
# Platform-specific configurations
|
92
95
|
platform_home = Path(_find_platform_home(platform))
|
93
96
|
|
94
97
|
if platform == "hip":
|
98
|
+
# Add ROCm-specific source file
|
99
|
+
sources.append('csrc/hardware_amd_support.cpp')
|
100
|
+
|
95
101
|
include_dirs = [str(platform_home.resolve() / 'include')]
|
96
102
|
library_dirs = [str(platform_home.resolve() / 'lib')]
|
97
103
|
libraries = ['amdhip64', 'dl']
|
@@ -102,9 +108,8 @@ def _create_ext_modules(platform):
|
|
102
108
|
str((platform_home / 'lib64').resolve()),
|
103
109
|
str((platform_home / 'lib64/stubs').resolve()),
|
104
110
|
]
|
105
|
-
libraries = ['cuda']
|
111
|
+
libraries = ['cuda', 'cudart']
|
106
112
|
platform_macros = [('USE_CUDA', '1')]
|
107
|
-
extra_compile_args = ['-std=c++17', '-O3']
|
108
113
|
|
109
114
|
# Create extensions with different hook modes
|
110
115
|
ext_modules = [
|
@@ -146,9 +151,9 @@ class build_ext_for_platform(build_platform_ext):
|
|
146
151
|
|
147
152
|
setup(
|
148
153
|
name='torch_memory_saver',
|
149
|
-
version='0.0.
|
154
|
+
version='0.0.9rc3',
|
150
155
|
ext_modules=ext_modules,
|
151
156
|
cmdclass={'build_ext': build_ext_for_platform},
|
152
157
|
python_requires=">=3.9",
|
153
158
|
packages=setuptools.find_packages(include=["torch_memory_saver", "torch_memory_saver.*"]),
|
154
|
-
)
|
159
|
+
)
|
{torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver.egg-info/SOURCES.txt
RENAMED
@@ -1,9 +1,15 @@
|
|
1
1
|
LICENSE
|
2
|
+
MANIFEST.in
|
2
3
|
README.md
|
3
4
|
setup.py
|
4
5
|
csrc/api_forwarder.cpp
|
6
|
+
csrc/api_forwarder.h
|
5
7
|
csrc/core.cpp
|
8
|
+
csrc/core.h
|
6
9
|
csrc/entrypoint.cpp
|
10
|
+
csrc/hardware_amd_support.h
|
11
|
+
csrc/macro.h
|
12
|
+
csrc/utils.h
|
7
13
|
test/test_examples.py
|
8
14
|
torch_memory_saver/__init__.py
|
9
15
|
torch_memory_saver/binary_wrapper.py
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/binary_wrapper.py
RENAMED
File without changes
|
{torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/entrypoint.py
RENAMED
File without changes
|
{torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/__init__.py
RENAMED
File without changes
|
{torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/base.py
RENAMED
File without changes
|
{torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/mode_preload.py
RENAMED
File without changes
|
{torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/hooks/mode_torch.py
RENAMED
File without changes
|
{torch_memory_saver-0.0.9rc1 → torch_memory_saver-0.0.9rc3}/torch_memory_saver/testing_utils.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|